//! Startup auto-update: on `hive-c0re serve` boot, rebuild every known //! container unconditionally. `nixos-container update` is a no-op at the //! nix level when nothing changed (same store path), so the cost of always //! running it on startup is low and avoids the complexity of rev-marker //! staleness (issue #179: all agents always needed update when any meta //! commit landed). use std::path::{Path, PathBuf}; use std::sync::Arc; use anyhow::{Context, Result}; use crate::coordinator::Coordinator; use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; /// Marker file recording the hyperhive rev a sub-agent's container was last /// built against. Sibling of `applied//` (rather than inside it) to /// keep it out of the applied repo's git history. Uses a leading dot so a /// glob over `applied/*` doesn't include it. pub fn rev_marker_path(name: &str) -> PathBuf { PathBuf::from(format!("/var/lib/hyperhive/applied/.{name}.hyperhive-rev")) } /// Resolve the current rev of `hyperhive_flake`. For a path on disk we /// canonicalize (following symlinks) so a /etc/hyperhive → /nix/store/... /// update yields a different string. For anything else we return None. #[must_use] pub fn current_flake_rev(hyperhive_flake: &str) -> Option { let path = Path::new(hyperhive_flake); if !path.exists() { return None; } std::fs::canonicalize(path) .ok() .map(|p| p.display().to_string()) } /// Returns true when the applied repo has commits that have not yet been /// deployed (i.e. the applied HEAD differs from the sha currently locked in /// meta's flake.lock). This is the semantic the dashboard `needs_update` chip /// conveys: "there is a config change ready to apply via rebuild." 
#[must_use] pub fn agent_config_pending(name: &str, deployed_sha: Option<&str>) -> bool { let applied_head = std::process::Command::new("git") .args([ "-C", &format!("/var/lib/hyperhive/applied/{name}"), "rev-parse", "HEAD", ]) .output() .ok() .filter(|o| o.status.success()) .and_then(|o| String::from_utf8(o.stdout).ok()) .map(|s| s.trim().to_owned()); match (applied_head.as_deref(), deployed_sha) { (Some(head), Some(sha)) => !head.starts_with(sha) && !sha.starts_with(head), _ => false, } } /// Rebuild one sub-agent and refresh its marker. Used by both the startup /// scanner and the dashboard's manual "update" button so the two paths /// can't diverge. pub async fn rebuild_agent(coord: &Arc, name: &str, current_rev: &str) -> Result<()> { tracing::info!(%name, rev = %current_rev, "rebuild agent"); let agent_dir = coord .ensure_runtime(name) .with_context(|| format!("ensure_runtime {name}"))?; let applied_dir = Coordinator::agent_applied_dir(name); let claude_dir = Coordinator::agent_claude_dir(name); let notes_dir = Coordinator::agent_notes_dir(name); // Suppress crash_watch during the stop+start window inside // lifecycle::rebuild. Dashboard rebuilds already do this via // lifecycle_action; this catches the auto-update scan + any // other direct caller. 
let guard = coord.transient_guard(name, crate::coordinator::TransientKind::Rebuilding); let result = lifecycle::rebuild( name, &coord.hyperhive_flake, &agent_dir, &applied_dir, &claude_dir, ¬es_dir, coord.dashboard_port, &coord.operator_pronouns, &coord.context_window_tokens, ) .await; drop(guard); match &result { Ok(()) => { if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) { tracing::warn!(%name, error = ?e, "write rev marker failed"); } coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt { agent: name.to_owned(), ok: true, note: None, sha: None, tag: None, }); // Run the full forge sync on every successful rebuild so // the rebuild path is equivalent to the hive-c0re startup // sweep: token, config-repo mirror, meta read access, and // meta remote are all kept in sync. Recovers missing tokens // (e.g. first-spawn seeding failed transiently) without // requiring a full hive-c0re restart. crate::forge::sync_agent(name, crate::forge::core_token().as_deref()).await; // Wake the agent on its next turn so claude sees a // "you were rebuilt — check /state/ for notes, --continue // session intact" hint. Covers dashboard rebuild, admin // CLI rebuild, auto-update startup scan, and the // dashboard's meta-input update path — all of which // route through rebuild_agent. coord.kick_agent(name, "container rebuilt"); // Container state (needs_update, deployed_sha) may have // shifted — rescan so dashboards drop the "needs update" // chip without waiting for the next /api/state poll. coord.rescan_containers_and_emit().await; // Lock bump → meta-inputs panel needs to re-render. crate::dashboard::emit_meta_inputs_snapshot(coord); } Err(e) => { coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt { agent: name.to_owned(), ok: false, note: Some(format!("{e:#}")), sha: None, tag: None, }); coord.rescan_containers_and_emit().await; } } result } /// Auto-create the manager container on startup if it isn't already there. 
/// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no
/// longer declare `containers.hm1nd` in their host NixOS config. Bypasses
/// the approval queue — manager is required infrastructure. Idempotent.
///
/// # Errors
///
/// Returns an error when the manager container is missing and either
/// `ensure_runtime` or `lifecycle::spawn` fails. A failed migration rebuild
/// of an existing manager, a failed container listing, and a failed rev
/// marker write are only logged.
pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
    // A failed listing is treated as "no containers" (same fallback as
    // before), but the error is now logged instead of silently dropped.
    let existing = lifecycle::list().await.unwrap_or_else(|e| {
        tracing::warn!(error = ?e, "ensure_manager: nixos-container list failed");
        Default::default()
    });
    let current_rev = current_flake_rev(&coord.hyperhive_flake);
    if existing.iter().any(|c| c == MANAGER_NAME) {
        // Container exists already. If it predates the unified lifecycle
        // (no applied flake on disk) we must rebuild — otherwise it's
        // running whatever the host-declarative config was at create
        // time, with a wrong systemd unit and port.
        let applied_flake = Coordinator::agent_applied_dir(MANAGER_NAME).join("flake.nix");
        if !applied_flake.exists()
            && let Some(rev) = current_rev.as_deref()
        {
            tracing::warn!(
                "manager container exists but no applied flake — forcing rebuild to migrate"
            );
            // `coord` is already a `&Arc<Coordinator>`; no clone needed.
            if let Err(e) = rebuild_agent(coord, MANAGER_NAME, rev).await {
                tracing::warn!(error = ?e, "manager migration rebuild failed");
            }
        } else {
            tracing::debug!("manager container already present");
        }
        return Ok(());
    }
    tracing::info!("manager container missing — spawning");
    let runtime = coord.ensure_runtime(MANAGER_NAME)?;
    let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME);
    let applied = Coordinator::agent_applied_dir(MANAGER_NAME);
    let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME);
    let notes_dir = Coordinator::agent_notes_dir(MANAGER_NAME);
    lifecycle::spawn(
        MANAGER_NAME,
        &coord.hyperhive_flake,
        &runtime,
        &proposed,
        &applied,
        &claude_dir,
        &notes_dir,
        coord.dashboard_port,
        &coord.operator_pronouns,
        &coord.context_window_tokens,
    )
    .await?;
    // Marker write stays best-effort, but a failure is logged the same way
    // `rebuild_agent` logs it rather than being discarded.
    if let Some(rev) = current_rev.as_deref()
        && let Err(e) = std::fs::write(rev_marker_path(MANAGER_NAME), rev)
    {
        tracing::warn!(name = %MANAGER_NAME, error = ?e, "write rev marker failed");
    }
    Ok(())
}

/// Rebuild every container on startup. Sequential to avoid nix-store sqlite
/// races and keep logs readable. Returns Ok even if some rebuilds failed.
pub async fn run(coord: Arc) -> Result<()> { // Bump meta's hyperhive input up-front so per-agent rebuilds build // against the latest base. Non-fatal on failure. if let Err(e) = crate::meta::lock_update_hyperhive().await { tracing::warn!(error = ?e, "auto-update: meta lock_update_hyperhive failed"); } let containers = match lifecycle::list().await { Ok(c) => c, Err(e) => { tracing::warn!(error = ?e, "auto-update: nixos-container list failed"); return Ok(()); } }; let _current_rev = current_flake_rev(&coord.hyperhive_flake).unwrap_or_default(); tracing::info!(agents = containers.len(), "auto-update: queueing all on startup"); for container in containers { let logical = if container == MANAGER_NAME { Some(MANAGER_NAME.to_owned()) } else { container.strip_prefix(AGENT_PREFIX).map(str::to_owned) }; let Some(name) = logical else { continue }; coord.rebuild_queue.enqueue( crate::rebuild_queue::QueueKind::Rebuild, name, crate::rebuild_queue::QueueSource::AutoUpdate, "startup sweep".to_owned(), None, ); } coord.emit_rebuild_queue_snapshot(); Ok(()) }