hyperhive/hive-c0re/src/auto_update.rs

//! Startup auto-update: on `hive-c0re serve` boot, rebuild every known
//! container unconditionally. `nixos-container update` is a no-op at the
//! nix level when nothing changed (same store path), so the cost of always
//! running it on startup is low and avoids the complexity of rev-marker
//! staleness (issue #179: all agents always needed update when any meta
//! commit landed).

use std::path::{Path, PathBuf};
use std::sync::Arc;

use anyhow::{Context, Result};

use crate::coordinator::Coordinator;
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};

/// Location of the per-agent marker file that stores the hyperhive rev the
/// agent's container was most recently built against. The file sits next to
/// `applied/<name>/` instead of inside it, which keeps it out of the applied
/// repo's git history, and its name starts with a dot so that a glob over
/// `applied/*` skips it.
pub fn rev_marker_path(name: &str) -> PathBuf {
    // Assemble the path as one string so the result is byte-for-byte
    // `/var/lib/hyperhive/applied/.<name>.hyperhive-rev`.
    let raw = ["/var/lib/hyperhive/applied/.", name, ".hyperhive-rev"].concat();
    PathBuf::from(raw)
}

/// Resolve the current rev of `hyperhive_flake`.
///
/// For a path on disk the rev is the canonicalized path (symlinks followed),
/// so a /etc/hyperhive → /nix/store/... update yields a different string.
/// Anything that cannot be canonicalized — a path that does not exist, or a
/// flake reference that is not a filesystem path at all — yields `None`.
///
/// The returned string is the `Display` rendering of the canonical path, so
/// non-UTF-8 path bytes are rendered lossily.
#[must_use]
pub fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
    // `canonicalize` itself fails when the path is missing, so no separate
    // `exists()` probe is needed; skipping it saves a stat call and removes
    // the check-then-use window between the two filesystem calls.
    let resolved = std::fs::canonicalize(Path::new(hyperhive_flake)).ok()?;
    Some(resolved.display().to_string())
}

/// Reports whether the agent's applied repo holds commits that have not been
/// deployed yet, i.e. whether the applied repo's `HEAD` differs from
/// `deployed_sha` (the sha currently locked in meta's flake.lock). This is
/// what the dashboard's `needs_update` chip conveys: "there is a config
/// change ready to apply via rebuild."
///
/// The two shas count as the same commit when either one is a prefix of the
/// other (presumably so an abbreviated sha compares equal to a full one).
/// Returns `false` whenever `HEAD` cannot be read — git could not be run,
/// exited unsuccessfully, or printed non-UTF-8 — or when `deployed_sha` is
/// `None`.
#[must_use]
pub fn agent_config_pending(name: &str, deployed_sha: Option<&str>) -> bool {
    let repo_dir = format!("/var/lib/hyperhive/applied/{name}");
    // git is queried unconditionally, before `deployed_sha` is looked at.
    let probe = std::process::Command::new("git")
        .args(["-C", repo_dir.as_str(), "rev-parse", "HEAD"])
        .output();

    let head = match probe {
        Ok(out) if out.status.success() => match String::from_utf8(out.stdout) {
            Ok(text) => text.trim().to_owned(),
            Err(_) => return false,
        },
        _ => return false,
    };

    match deployed_sha {
        Some(sha) => !(head.starts_with(sha) || sha.starts_with(head.as_str())),
        None => false,
    }
}

/// Rebuild a single agent container and refresh everything that hangs off a
/// rebuild. The startup scanner and the dashboard's manual "update" button
/// both go through here so the two paths can't diverge.
///
/// On success the rev marker is rewritten with `current_rev` (a write
/// failure is only logged), the manager is notified, the forge sync runs,
/// the agent is kicked, containers are rescanned and the meta-inputs
/// snapshot is emitted. On failure the manager is notified with the error
/// text and containers are rescanned.
///
/// # Errors
///
/// Returns the `ensure_runtime` error (before any rebuild is attempted) or
/// the error produced by `lifecycle::rebuild`.
pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> {
    tracing::info!(%name, rev = %current_rev, "rebuild agent");
    let runtime_dir = coord
        .ensure_runtime(name)
        .with_context(|| format!("ensure_runtime {name}"))?;
    let applied = Coordinator::agent_applied_dir(name);
    let claude = Coordinator::agent_claude_dir(name);
    let notes = Coordinator::agent_notes_dir(name);

    // The manager gets the same event shape on both outcomes; only `ok`
    // and `note` vary.
    let rebuilt_event = |ok: bool, note: Option<String>| hive_sh4re::HelperEvent::Rebuilt {
        agent: name.to_owned(),
        ok,
        note,
        sha: None,
        tag: None,
    };

    let outcome = {
        // Suppress crash_watch for the stop+start window inside
        // lifecycle::rebuild. Dashboard rebuilds already do this via
        // lifecycle_action; holding the guard here covers the auto-update
        // scan and any other direct caller. The guard is released when
        // this scope ends, i.e. as soon as the rebuild has finished.
        let _rebuilding =
            coord.transient_guard(name, crate::coordinator::TransientKind::Rebuilding);
        lifecycle::rebuild(
            name,
            &coord.hyperhive_flake,
            &runtime_dir,
            &applied,
            &claude,
            &notes,
            coord.dashboard_port,
            &coord.operator_pronouns,
            &coord.context_window_tokens,
        )
        .await
    };

    match &outcome {
        Err(err) => {
            coord.notify_manager(&rebuilt_event(false, Some(format!("{err:#}"))));
            coord.rescan_containers_and_emit().await;
        }
        Ok(()) => {
            let marker = rev_marker_path(name);
            if let Err(e) = std::fs::write(marker, current_rev) {
                tracing::warn!(%name, error = ?e, "write rev marker failed");
            }
            coord.notify_manager(&rebuilt_event(true, None));
            // Full forge sync after every successful rebuild, so this path
            // matches the hive-c0re startup sweep: token, config-repo
            // mirror, meta read access and meta remote all stay in sync.
            // That also recovers missing tokens (e.g. first-spawn seeding
            // failed transiently) without a full hive-c0re restart.
            crate::forge::sync_agent(name, crate::forge::core_token().as_deref()).await;
            // Wake the agent on its next turn so claude sees a "you were
            // rebuilt — check /state/ for notes, --continue session intact"
            // hint. Dashboard rebuild, admin CLI rebuild, the auto-update
            // startup scan and the dashboard's meta-input update path all
            // route through rebuild_agent, so all of them get this.
            coord.kick_agent(name, "container rebuilt");
            // needs_update / deployed_sha may have shifted — rescan so
            // dashboards drop the "needs update" chip without waiting for
            // the next /api/state poll.
            coord.rescan_containers_and_emit().await;
            // Lock bump → the meta-inputs panel has to re-render.
            crate::dashboard::emit_meta_inputs_snapshot(coord);
        }
    }
    outcome
}

/// Auto-create the manager container on startup if it isn't already there.
/// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no
/// longer declare `containers.hm1nd` in their host NixOS config. Bypasses
/// the approval queue — manager is required infrastructure. Idempotent.
pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
    let existing = lifecycle::list().await.unwrap_or_default();
    let current_rev = current_flake_rev(&coord.hyperhive_flake);
    if existing.iter().any(|c| c == MANAGER_NAME) {
        // Container exists already. If it predates the unified lifecycle
        // (no applied flake on disk) we must rebuild — otherwise it's
        // running whatever the host-declarative config was at create
        // time, with a wrong systemd unit and port.
        let applied_flake = Coordinator::agent_applied_dir(MANAGER_NAME).join("flake.nix");
        if !applied_flake.exists()
            && let Some(rev) = current_rev.as_ref()
        {
            tracing::warn!(
                "manager container exists but no applied flake — forcing rebuild to migrate"
            );
            let coord_clone = coord.clone();
            if let Err(e) = rebuild_agent(&coord_clone, MANAGER_NAME, rev.as_str()).await {
                tracing::warn!(error = ?e, "manager migration rebuild failed");
            }
        } else {
            tracing::debug!("manager container already present");
        }
        return Ok(());
    }
    tracing::info!("manager container missing — spawning");
    let runtime = coord.ensure_runtime(MANAGER_NAME)?;
    let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME);
    let applied = Coordinator::agent_applied_dir(MANAGER_NAME);
    let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME);
    let notes_dir = Coordinator::agent_notes_dir(MANAGER_NAME);
    lifecycle::spawn(
        MANAGER_NAME,
        &coord.hyperhive_flake,
        &runtime,
        &proposed,
        &applied,
        &claude_dir,
        &notes_dir,
        coord.dashboard_port,
        &coord.operator_pronouns,
        &coord.context_window_tokens,
    )
    .await?;
    if let Some(rev) = current_rev {
        let _ = std::fs::write(rev_marker_path(MANAGER_NAME), &rev);
    }
    Ok(())
}

/// Startup sweep: queue a rebuild for every known container.
///
/// First bumps meta's hyperhive input (a failure there is logged and
/// otherwise ignored), then lists the containers and enqueues one
/// `Rebuild` entry per manager/agent container on `coord.rebuild_queue`,
/// and finally emits a rebuild-queue snapshot. Containers whose name is
/// neither the manager nor carries the agent prefix are skipped.
///
/// The rebuilds themselves are not run here; they are left to whatever
/// drains the rebuild queue (NOTE(review): ordering/sequencing is decided
/// by that consumer, which is not visible from this function).
///
/// Always returns `Ok`: a failed container listing is logged and ends the
/// sweep early without an error.
pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
    // Bump meta's hyperhive input up-front so per-agent rebuilds build
    // against the latest base. Non-fatal on failure.
    if let Err(e) = crate::meta::lock_update_hyperhive().await {
        tracing::warn!(error = ?e, "auto-update: meta lock_update_hyperhive failed");
    }

    let containers = match lifecycle::list().await {
        Ok(c) => c,
        Err(e) => {
            tracing::warn!(error = ?e, "auto-update: nixos-container list failed");
            return Ok(());
        }
    };

    tracing::info!(agents = containers.len(), "auto-update: queueing all on startup");
    for container in containers {
        // Map the container name to the logical agent name: the manager
        // keeps its name, agents lose their prefix, anything else is not
        // ours and is skipped.
        let logical = if container == MANAGER_NAME {
            Some(MANAGER_NAME.to_owned())
        } else {
            container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
        };
        let Some(name) = logical else { continue };
        coord.rebuild_queue.enqueue(
            crate::rebuild_queue::QueueKind::Rebuild,
            name,
            crate::rebuild_queue::QueueSource::AutoUpdate,
            "startup sweep".to_owned(),
            None,
        );
    }
    coord.emit_rebuild_queue_snapshot();
    Ok(())
}