diff --git a/CLAUDE.md b/CLAUDE.md index f180de7..1a42edb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,6 +156,19 @@ docs/damocles-migration.md options for moving damocles onto hyperhive marks them `failed` with note `"agent state dir missing"` so they fall out of `pending`. They stay in sqlite for audit. +## Auto-update on startup + +`hive-c0re serve` runs `auto_update::run` in a background task right after +opening the coordinator. It enumerates sub-agent containers (manager +excluded — its config comes from the host's NixOS module) and rebuilds any +whose recorded hyperhive rev differs from the current one. Rev = canonical +filesystem path of `cfg.hyperhiveFlake` (so `/etc/hyperhive` resolving to a +new `/nix/store/...-source` triggers a rebuild). Marker file: +`/var/lib/hyperhive/applied/.<name>.hyperhive-rev`. If the flake input has +no canonical path (e.g. a `github:` URL), auto-update is a no-op — rebuild +manually. The task is async and never blocks the admin socket; failures are +logged and don't take the daemon down. + ## Build / deploy / test ```sh diff --git a/hive-c0re/src/auto_update.rs b/hive-c0re/src/auto_update.rs new file mode 100644 index 0000000..609e064 --- /dev/null +++ b/hive-c0re/src/auto_update.rs @@ -0,0 +1,118 @@ +//! Startup auto-update: on `hive-c0re serve` boot, rebuild any sub-agent +//! container whose recorded "hyperhive rev" differs from the current one, +//! then write the new rev as the marker. Skips rebuild when nothing changed +//! so warm restarts are near-free. +//! +//! "Rev" is the canonical filesystem path of the configured hyperhive flake +//! (e.g. `/nix/store/<hash>-source` when `/etc/hyperhive` is a symlink the +//! NixOS module wires up). For non-path flake URLs we don't have a cheap rev +//! signal, so auto-update is a no-op — operators rebuild manually. 
+
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use anyhow::Result;
+
+use crate::coordinator::Coordinator;
+use crate::lifecycle::{self, AGENT_PREFIX};
+
+/// Marker file recording the hyperhive rev a sub-agent's container was last
+/// built against. Sibling of `applied/<name>/` (rather than inside it) to
+/// keep it out of the applied repo's git history.
+fn rev_marker_path(name: &str) -> PathBuf {
+    PathBuf::from(format!("/var/lib/hyperhive/applied/.{name}.hyperhive-rev"))
+}
+
+/// Resolve the current rev of `hyperhive_flake`. For a path on disk we
+/// canonicalize (following symlinks) so a /etc/hyperhive → /nix/store/...
+/// update yields a different string. For anything else we return None.
+/// Non-UTF-8 path bytes are rendered lossily (`Path::display`).
+fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
+    // `canonicalize` already fails for paths that don't exist, so no separate
+    // `exists()` probe is needed before calling it.
+    let path = Path::new(hyperhive_flake);
+    std::fs::canonicalize(path)
+        .ok()
+        .map(|p| p.display().to_string())
+}
+
+/// Rebuild every sub-agent whose marker differs from the current rev. Logs
+/// per-agent outcomes and continues past failures. Returns Ok even if some
+/// rebuilds failed — startup shouldn't be blocked by a broken agent.
+pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
+    let Some(current_rev) = current_flake_rev(&coord.hyperhive_flake) else {
+        tracing::info!(
+            flake = %coord.hyperhive_flake,
+            "auto-update: hyperhive_flake has no canonical path; skipping",
+        );
+        return Ok(());
+    };
+    tracing::info!(rev = %current_rev, "auto-update: scanning agents");
+
+    let containers = match lifecycle::list().await {
+        Ok(c) => c,
+        Err(e) => {
+            tracing::warn!(error = ?e, "auto-update: nixos-container list failed");
+            return Ok(());
+        }
+    };
+
+    // Keep each agent's name beside its task handle so join failures can be attributed.
+    let mut tasks = Vec::new();
+    for container in containers {
+        let Some(name) = container.strip_prefix(AGENT_PREFIX) else {
+            continue;
+        };
+        let name = name.to_owned();
+        let marker = rev_marker_path(&name);
+        // A missing or unreadable marker reads as None and forces a rebuild.
+        let prev = std::fs::read_to_string(&marker).ok();
+        if prev.as_deref().map(str::trim) == Some(current_rev.as_str()) {
+            tracing::debug!(%name, "auto-update: up-to-date");
+            continue;
+        }
+
+        let coord = Arc::clone(&coord);
+        let current_rev = current_rev.clone();
+        let task_name = name.clone();
+        let handle = tokio::spawn(async move {
+            tracing::info!(%name, ?prev, rev = %current_rev, "auto-update: rebuilding agent");
+            let agent_dir = match coord.register_agent(&name) {
+                Ok(d) => d,
+                Err(e) => {
+                    tracing::warn!(%name, error = ?e, "auto-update: register_agent failed");
+                    return;
+                }
+            };
+            let applied_dir = Coordinator::agent_applied_dir(&name);
+            let claude_dir = Coordinator::agent_claude_dir(&name);
+            let rebuilt = lifecycle::rebuild(
+                &name,
+                &coord.hyperhive_flake,
+                &agent_dir,
+                &applied_dir,
+                &claude_dir,
+            )
+            .await;
+            if let Err(e) = rebuilt {
+                tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
+                return;
+            }
+            // Record the rev only after a successful rebuild so a failure is retried next run.
+            if let Err(e) = std::fs::write(&marker, &current_rev) {
+                tracing::warn!(%name, error = ?e, "auto-update: write rev marker failed");
+            } else {
+                tracing::info!(%name, "auto-update: agent rebuilt");
+            }
+        });
+        tasks.push((task_name, handle));
+    }
+
+    for (name, task) in tasks {
+        // A JoinError means the task panicked or was cancelled; log it instead of dropping it.
+        if let Err(e) = task.await {
+            tracing::warn!(%name, error = ?e, "auto-update: rebuild task did not complete");
+        }
+    }
+    Ok(())
+}
diff --git a/hive-c0re/src/main.rs b/hive-c0re/src/main.rs index
1db132d..d47f424 100644 --- a/hive-c0re/src/main.rs +++ b/hive-c0re/src/main.rs @@ -8,6 +8,7 @@ use hive_sh4re::{HostRequest, HostResponse}; mod actions; mod agent_server; mod approvals; +mod auto_update; mod broker; mod client; mod coordinator; @@ -85,6 +86,15 @@ async fn main() -> Result<()> { dashboard_port, } => { let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?); + // Run auto-update in the background — don't block service start. + // Operators sometimes need the admin socket up to debug a stuck + // agent, and the rebuild loop can take tens of seconds. + let update_coord = coord.clone(); + tokio::spawn(async move { + if let Err(e) = auto_update::run(update_coord).await { + tracing::warn!(error = ?e, "auto-update task failed"); + } + }); manager_server::start(coord.clone())?; let dash_coord = coord.clone(); tokio::spawn(async move {