auto-update: surface pending updates in dashboard + include manager

This commit is contained in:
müde 2026-05-15 13:31:33 +02:00
parent a4e1556f90
commit e777576528
3 changed files with 149 additions and 51 deletions

View file

@ -159,16 +159,28 @@ docs/damocles-migration.md options for moving damocles onto hyperhive
## Auto-update on startup
`hive-c0re serve` runs `auto_update::run` in a background task right after
opening the coordinator. It enumerates sub-agent containers (manager
excluded — its config comes from the host's NixOS module) and rebuilds any
whose recorded hyperhive rev differs from the current one. Rev = canonical
filesystem path of `cfg.hyperhiveFlake` (so `/etc/hyperhive` resolving to a
new `/nix/store/...-source` triggers a rebuild). Marker file:
opening the coordinator. It enumerates managed containers and rebuilds any
whose recorded hyperhive rev differs from the current one:
- **Sub-agents** rebuild via `lifecycle::rebuild` (regenerates
`applied/<name>/flake.nix`, sets nspawn flags, `nixos-container update --flake`).
- **Manager** runs `nixos-container update hm1nd` (no `--flake`). The
manager's config lives in the host's NixOS module; this is belt-and-braces
on top of NixOS's own container activation. Idempotent when nothing has
actually changed.
"Rev" = canonical filesystem path of `cfg.hyperhiveFlake` (so `/etc/hyperhive`
resolving to a new `/nix/store/...-source` triggers a rebuild). Marker file:
`/var/lib/hyperhive/applied/.<name>.hyperhive-rev`. If the flake input has
no canonical path (e.g. a `github:` URL), auto-update is a no-op — rebuild
manually. The task is async and never blocks the admin socket; failures are
logged and don't take the daemon down.
The dashboard surfaces pending updates per agent: a clickable "needs update
↻" badge appears whenever the marker differs from the current rev. The badge
POSTs to `/rebuild/<name>`, which calls the same `auto_update::rebuild_agent` /
`rebuild_manager` path, so manual triggers and the startup scan can't drift.
## Build / deploy / test
```sh

View file

@ -11,22 +11,25 @@
use std::path::{Path, PathBuf};
use std::sync::Arc;
use anyhow::Result;
use anyhow::{Context, Result, bail};
use tokio::process::Command;
use crate::coordinator::Coordinator;
use crate::lifecycle::{self, AGENT_PREFIX};
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
/// Marker file recording the hyperhive rev a sub-agent's container was last
/// built against. Sibling of `applied/<name>/` (rather than inside it) to
/// keep it out of the applied repo's git history.
fn rev_marker_path(name: &str) -> PathBuf {
/// keep it out of the applied repo's git history. Uses a leading dot so a
/// glob over `applied/*` doesn't include it.
pub fn rev_marker_path(name: &str) -> PathBuf {
PathBuf::from(format!("/var/lib/hyperhive/applied/.{name}.hyperhive-rev"))
}
/// Resolve the current rev of `hyperhive_flake`. For a path on disk we
/// canonicalize (following symlinks) so a /etc/hyperhive → /nix/store/...
/// update yields a different string. For anything else we return None.
fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
#[must_use]
pub fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
let path = Path::new(hyperhive_flake);
if !path.exists() {
return None;
@ -36,6 +39,62 @@ fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
.map(|p| p.display().to_string())
}
/// Report whether `name`'s container is stale relative to `current_rev`.
///
/// Reads the rev marker for `name`, trims surrounding whitespace, and
/// returns `true` when the recorded value differs from `current_rev`. A
/// marker that is missing or cannot be read counts as out-of-date, so the
/// function returns `true` in that case as well.
#[must_use]
pub fn agent_needs_update(name: &str, current_rev: &str) -> bool {
    match std::fs::read_to_string(rev_marker_path(name)) {
        Ok(recorded) => recorded.trim() != current_rev,
        // No readable marker: treat the container as never built at this rev.
        Err(_) => true,
    }
}
/// Rebuild one sub-agent and refresh its marker. Used by both the startup
/// scanner and the dashboard's manual "update" button so the two paths
/// can't diverge.
pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> {
tracing::info!(%name, rev = %current_rev, "rebuild agent");
let agent_dir = coord
.register_agent(name)
.with_context(|| format!("register_agent {name}"))?;
let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name);
lifecycle::rebuild(
name,
&coord.hyperhive_flake,
&agent_dir,
&applied_dir,
&claude_dir,
)
.await?;
std::fs::write(rev_marker_path(name), current_rev)
.with_context(|| format!("write rev marker for {name}"))?;
Ok(())
}
/// Re-apply the manager container's host-declared configuration.
///
/// Runs `nixos-container update hm1nd` without `--flake`. Per the design of
/// this module, that re-reads `/etc/nixos-containers/hm1nd.conf`, which the
/// host's `nixos-rebuild switch` rewrites to point at the new `SYSTEM_PATH`,
/// and it is expected to be idempotent when nothing has changed. On success
/// the manager's rev marker is set to `current_rev`.
///
/// # Errors
///
/// Fails if the command cannot be spawned, if it exits unsuccessfully (the
/// trimmed stderr is included in the error), or if the marker cannot be
/// written.
pub async fn rebuild_manager(current_rev: &str) -> Result<()> {
    tracing::info!(rev = %current_rev, "rebuild manager (nixos-container update hm1nd)");

    let output = Command::new("nixos-container")
        .arg("update")
        .arg(MANAGER_NAME)
        .output()
        .await
        .context("invoke nixos-container update hm1nd")?;

    let status = output.status;
    if !status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        bail!(
            "nixos-container update {MANAGER_NAME} failed ({status}): {}",
            stderr.trim()
        );
    }

    // Only a successful update moves the marker forward.
    std::fs::write(rev_marker_path(MANAGER_NAME), current_rev)
        .with_context(|| format!("write rev marker for {MANAGER_NAME}"))
}
/// Rebuild every sub-agent whose marker differs from the current rev. Logs
/// per-agent outcomes and continues past failures. Returns Ok even if some
/// rebuilds failed — startup shouldn't be blocked by a broken agent.
@ -58,55 +117,38 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
};
let mut tasks = Vec::new();
let mut manager_present = false;
for container in containers {
if container == MANAGER_NAME {
manager_present = true;
continue;
}
let Some(name) = container.strip_prefix(AGENT_PREFIX) else {
continue;
};
let name = name.to_owned();
let marker = rev_marker_path(&name);
let prev = std::fs::read_to_string(&marker).ok();
if prev.as_deref().map(str::trim) == Some(current_rev.as_str()) {
if !agent_needs_update(&name, &current_rev) {
tracing::debug!(%name, "auto-update: up-to-date");
continue;
}
let coord = coord.clone();
let current_rev = current_rev.clone();
tasks.push(tokio::spawn(async move {
tracing::info!(
%name,
prev = ?prev,
rev = %current_rev,
"auto-update: rebuilding agent",
);
let agent_dir = match coord.register_agent(&name) {
Ok(d) => d,
Err(e) => {
tracing::warn!(%name, error = ?e, "auto-update: register_agent failed");
return;
}
};
let applied_dir = Coordinator::agent_applied_dir(&name);
let claude_dir = Coordinator::agent_claude_dir(&name);
match lifecycle::rebuild(
&name,
&coord.hyperhive_flake,
&agent_dir,
&applied_dir,
&claude_dir,
)
.await
{
Ok(()) => {
if let Err(e) = std::fs::write(&marker, &current_rev) {
tracing::warn!(%name, error = ?e, "auto-update: write rev marker failed");
} else {
tracing::info!(%name, "auto-update: agent rebuilt");
}
}
Err(e) => {
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
}
if let Err(e) = rebuild_agent(&coord, &name, &current_rev).await {
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
}
}));
}
// Manager runs unconditionally when its marker differs: even if the host
// hasn't been rebuilt yet, `nixos-container update hm1nd` is a no-op, so
// there's no harm. The host's own activation already updates declarative
// containers — this is belt-and-braces for hive-c0re restarts.
if manager_present && agent_needs_update(MANAGER_NAME, &current_rev) {
let current_rev = current_rev.clone();
tasks.push(tokio::spawn(async move {
if let Err(e) = rebuild_manager(&current_rev).await {
tracing::warn!(error = ?e, "auto-update: manager rebuild failed");
}
}));
}

View file

@ -42,6 +42,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/approve/{id}", post(post_approve))
.route("/deny/{id}", post(post_deny))
.route("/destroy/{name}", post(post_destroy))
.route("/rebuild/{name}", post(post_rebuild))
.route("/request-spawn", post(post_request_spawn))
.route("/send", post(post_send))
.route("/messages/stream", get(messages_stream))
@ -64,6 +65,7 @@ async fn index(headers: HeaderMap, State(state): State<AppState>) -> Html<String
let containers = lifecycle::list().await.unwrap_or_default();
let transient = state.coord.transient_snapshot();
let current_rev = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake);
let approvals = gc_orphans(
&state.coord,
state.coord.approvals.pending().unwrap_or_default(),
@ -82,7 +84,7 @@ async fn index(headers: HeaderMap, State(state): State<AppState>) -> Html<String
Html(format!(
"<!doctype html>\n<html lang=\"en\">\n<head>\n<meta charset=\"utf-8\">\n<title>hyperhive // h1ve-c0re</title>\n{refresh}\n{STYLE}\n</head>\n<body>\n{BANNER}\n{containers}\n{talk}\n{approvals_html}\n{MSG_FLOW}\n{FOOTER}\n{MSG_FLOW_JS}\n</body>\n</html>\n",
containers = render_containers(&containers, &transient, &hostname),
containers = render_containers(&containers, &transient, current_rev.as_deref(), &hostname),
talk = render_talk(&containers),
))
}
@ -163,6 +165,24 @@ async fn post_request_spawn(
}
}
/// `POST /rebuild/{name}` — manually rebuild one container from the dashboard.
///
/// Resolves the current hyperhive rev, checks that `name` refers to a
/// container that actually exists, and then dispatches to
/// `auto_update::rebuild_manager` (for the manager) or
/// `auto_update::rebuild_agent` (for a sub-agent). Redirects to `/` on
/// success and renders an error response otherwise.
async fn post_rebuild(State(state): State<AppState>, AxumPath(name): AxumPath<String>) -> Response {
    let Some(current_rev) = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake)
    else {
        return error_response(
            "rebuild: hyperhive_flake has no canonical path; manual rebuild only via `hive-c0re rebuild`",
        );
    };

    // `name` is taken from the request URL and is otherwise unvalidated, yet
    // the rebuild path uses it to register the agent and to build on-disk
    // paths. Only accept names that correspond to a listed container — the
    // same filtering the startup scan applies.
    let is_manager = name == lifecycle::MANAGER_NAME;
    let containers = lifecycle::list().await.unwrap_or_default();
    let known = containers.iter().any(|container| {
        if is_manager {
            container.as_str() == lifecycle::MANAGER_NAME
        } else {
            container.strip_prefix(lifecycle::AGENT_PREFIX) == Some(name.as_str())
        }
    });
    if !known {
        return error_response(&format!(
            "rebuild {name} failed: no such container (or container list unavailable)"
        ));
    }

    let result = if is_manager {
        crate::auto_update::rebuild_manager(&current_rev).await
    } else {
        crate::auto_update::rebuild_agent(&state.coord, &name, &current_rev).await
    };
    match result {
        Ok(()) => Redirect::to("/").into_response(),
        Err(e) => error_response(&format!("rebuild {name} failed: {e:#}")),
    }
}
async fn post_destroy(State(state): State<AppState>, AxumPath(name): AxumPath<String>) -> Response {
match actions::destroy(&state.coord, &name).await {
Ok(()) => Redirect::to("/").into_response(),
@ -184,6 +204,7 @@ fn error_response(message: &str) -> Response {
fn render_containers(
containers: &[String],
transient: &std::collections::HashMap<String, crate::coordinator::TransientState>,
current_rev: Option<&str>,
hostname: &str,
) -> String {
let mut out = String::from(
@ -217,9 +238,10 @@ fn render_containers(
out.push_str("<ul>\n");
for container in containers {
if container == MANAGER_NAME {
let update_badge = update_badge_for(MANAGER_NAME, current_rev);
let _ = writeln!(
out,
"<li><span class=\"glyph\">▓█▓▒░</span> <a href=\"http://{hostname}:{MANAGER_PORT}/\">{container}</a> <span class=\"role role-m1nd\">m1nd</span> <span class=\"meta\">:{MANAGER_PORT}</span></li>",
"<li><span class=\"glyph\">▓█▓▒░</span> <a href=\"http://{hostname}:{MANAGER_PORT}/\">{container}</a> <span class=\"role role-m1nd\">m1nd</span>{update_badge} <span class=\"meta\">:{MANAGER_PORT}</span></li>",
);
} else if let Some(name) = container.strip_prefix(AGENT_PREFIX) {
let port = lifecycle::agent_web_port(name);
@ -231,9 +253,10 @@ fn render_containers(
" <a class=\"role role-pending\" href=\"http://{hostname}:{port}/\">needs login →</a>",
)
};
let update_badge = update_badge_for(name, current_rev);
let _ = writeln!(
out,
"<li><span class=\"glyph\">▒░▒░░</span> <a href=\"http://{hostname}:{port}/\">{name}</a> <span class=\"role role-ag3nt\">ag3nt</span>{login_badge} <span class=\"meta\">{container} :{port}</span>\n <form method=\"POST\" action=\"/destroy/{name}\" class=\"inline\" onsubmit=\"return confirm('destroy {name}? container is removed; state + creds kept.');\"><button class=\"btn btn-destroy\" type=\"submit\">DESTR0Y</button></form>\n</li>",
"<li><span class=\"glyph\">▒░▒░░</span> <a href=\"http://{hostname}:{port}/\">{name}</a> <span class=\"role role-ag3nt\">ag3nt</span>{login_badge}{update_badge} <span class=\"meta\">{container} :{port}</span>\n <form method=\"POST\" action=\"/destroy/{name}\" class=\"inline\" onsubmit=\"return confirm('destroy {name}? container is removed; state + creds kept.');\"><button class=\"btn btn-destroy\" type=\"submit\">DESTR0Y</button></form>\n</li>",
);
}
}
@ -319,6 +342,20 @@ fn gc_orphans(coord: &Coordinator, approvals: Vec<Approval>) -> Vec<Approval> {
.collect()
}
/// Render the "needs update" badge for `name`, or nothing.
///
/// Produces an inline form that POSTs to `/rebuild/<name>` when a current
/// rev is known and `auto_update::agent_needs_update` reports the container
/// as stale. In every other case (no rev known, or already up-to-date) the
/// result is an empty string.
fn update_badge_for(name: &str, current_rev: Option<&str>) -> String {
    match current_rev {
        Some(rev) if crate::auto_update::agent_needs_update(name, rev) => format!(
            " <form method=\"POST\" action=\"/rebuild/{name}\" class=\"inline\" onsubmit=\"return confirm('rebuild {name}? hot-reloads the container.');\"><button class=\"role role-pending btn-inline\" type=\"submit\" title=\"agent's last build is older than current hyperhive rev\">needs update ↻</button></form>",
        ),
        _ => String::new(),
    }
}
/// Host-side mirror of `hive_ag3nt::login::has_session`. Returns true if the
/// agent's bound `~/.claude/` dir on disk contains any regular file. The
/// dashboard reads this each render so logins driven from the agent web UI
@ -550,6 +587,13 @@ const STYLE: &str = r#"
.spawnform input::placeholder { color: var(--muted); }
.spawnform input:focus { outline: 1px solid var(--purple); }
.role-pending { color: var(--amber); border-color: var(--amber); }
.btn-inline {
font-family: inherit;
background: transparent;
cursor: pointer;
margin-left: 0.4em;
}
.btn-inline:hover { background: rgba(255, 184, 77, 0.1); }
.kind {
display: inline-block;
margin-left: 0.4em;