auto_update: rebuild all on startup, needs_update = applied HEAD vs deployed sha

This commit is contained in:
damocles 2026-05-21 19:40:41 +02:00
parent 4814aaefdb
commit 433bc85b91
4 changed files with 44 additions and 139 deletions

View file

@ -1,12 +1,9 @@
//! Startup auto-update: on `hive-c0re serve` boot, rebuild any sub-agent
//! container whose recorded "hyperhive rev" differs from the current one,
//! then write the new rev as the marker. Skips rebuild when nothing changed
//! so warm restarts are near-free.
//!
//! "Rev" is the canonical filesystem path of the configured hyperhive flake
//! (e.g. `/nix/store/<hash>-source` when `/etc/hyperhive` is a symlink the
//! NixOS module wires up). For non-path flake URLs we don't have a cheap rev
//! signal, so auto-update is a no-op — operators rebuild manually.
//! Startup auto-update: on `hive-c0re serve` boot, rebuild every known
//! container unconditionally. `nixos-container update` is a no-op at the
//! nix level when nothing changed (same store path), so always running it
//! on startup is cheap, and it avoids the complexity of rev-marker
//! staleness (issue #179: all agents always needed update when any meta
//! commit landed).
use std::path::{Path, PathBuf};
use std::sync::Arc;
@ -38,50 +35,29 @@ pub fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
.map(|p| p.display().to_string())
}
/// Read the current git HEAD of the meta flake at
/// `/var/lib/hyperhive/meta`. Returns `None` when the repo does not exist
/// or `git rev-parse HEAD` fails (non-path flake, first-boot before
/// `sync_agents` has run, etc.). Callers treat `None` as "unknown" and
/// skip the meta-rev component of the combined marker.
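/// Returns true when the applied repo's HEAD differs from the sha currently
/// locked in meta's flake.lock, i.e. the applied repo has a config change
/// that has not yet been deployed. The two shas are compared by prefix in
/// either direction, so an abbreviated sha matches its full form. Returns
/// false when either side is unknown (`deployed_sha` is `None`, or
/// `git rev-parse HEAD` could not be run or exited non-zero). This is the
/// semantic the dashboard `needs_update` chip conveys: "there is a config
/// change ready to apply via rebuild."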
#[must_use]
pub fn current_meta_rev() -> Option<String> {
let out = std::process::Command::new("git")
.args(["-C", "/var/lib/hyperhive/meta", "rev-parse", "HEAD"])
pub fn agent_config_pending(name: &str, deployed_sha: Option<&str>) -> bool {
let applied_head = std::process::Command::new("git")
.args([
"-C",
&format!("/var/lib/hyperhive/applied/{name}"),
"rev-parse",
"HEAD",
])
.output()
.ok()?;
if !out.status.success() {
return None;
}
let rev = String::from_utf8(out.stdout).ok()?;
let rev = rev.trim().to_owned();
if rev.is_empty() {
None
} else {
Some(rev)
}
}
/// Combine the hyperhive package rev and the optional meta flake rev into
/// one opaque marker string stored on disk. Including the meta rev means a
/// `sync_agents` run that rewrites the meta flake (e.g. adding a new
/// `HIVE_CONTEXT_WINDOW_TOKENS_*` env var) is detected and triggers a
/// container rebuild on the next hive-c0re boot.
#[must_use]
pub fn combined_rev(hyperhive_rev: &str, meta_rev: Option<&str>) -> String {
match meta_rev {
Some(m) => format!("{hyperhive_rev}:{m}"),
None => hyperhive_rev.to_owned(),
}
}
/// Read the marker for `name` and return whether the recorded rev matches
/// `current_rev`. Missing/unreadable marker counts as out-of-date.
#[must_use]
pub fn agent_needs_update(name: &str, current_rev: &str) -> bool {
let prev = std::fs::read_to_string(rev_marker_path(name))
.ok()
.filter(|o| o.status.success())
.and_then(|o| String::from_utf8(o.stdout).ok())
.map(|s| s.trim().to_owned());
prev.as_deref() != Some(current_rev)
match (applied_head.as_deref(), deployed_sha) {
(Some(head), Some(sha)) => !head.starts_with(sha) && !sha.starts_with(head),
_ => false,
}
}
/// Rebuild one sub-agent and refresh its marker. Used by both the startup
@ -159,10 +135,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
/// the approval queue — manager is required infrastructure. Idempotent.
pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
let existing = lifecycle::list().await.unwrap_or_default();
let flake_rev = current_flake_rev(&coord.hyperhive_flake);
let meta_rev = current_meta_rev();
let current_rev =
flake_rev.as_deref().map(|f| combined_rev(f, meta_rev.as_deref()));
let current_rev = current_flake_rev(&coord.hyperhive_flake);
if existing.iter().any(|c| c == MANAGER_NAME) {
// Container exists already. If it predates the unified lifecycle
// (no applied flake on disk) we must rebuild — otherwise it's
@ -176,7 +149,7 @@ pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
"manager container exists but no applied flake — forcing rebuild to migrate"
);
let coord_clone = coord.clone();
if let Err(e) = rebuild_agent(&coord_clone, MANAGER_NAME, rev).await {
if let Err(e) = rebuild_agent(&coord_clone, MANAGER_NAME, rev.as_str()).await {
tracing::warn!(error = ?e, "manager migration rebuild failed");
}
} else {
@ -204,29 +177,16 @@ pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
)
.await?;
if let Some(rev) = current_rev {
let _ = std::fs::write(rev_marker_path(MANAGER_NAME), rev);
let _ = std::fs::write(rev_marker_path(MANAGER_NAME), &rev);
}
Ok(())
}
/// Rebuild every sub-agent whose marker differs from the current rev. Logs
/// per-agent outcomes and continues past failures. Returns Ok even if some
/// rebuilds failed — startup shouldn't be blocked by a broken agent.
/// Rebuild every container on startup. Sequential to avoid nix-store sqlite
/// races and keep logs readable. Returns Ok even if some rebuilds failed.
pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
let Some(flake_rev) = current_flake_rev(&coord.hyperhive_flake) else {
tracing::info!(
flake = %coord.hyperhive_flake,
"auto-update: hyperhive_flake has no canonical path; skipping",
);
return Ok(());
};
let meta_rev = current_meta_rev();
let current_rev = combined_rev(&flake_rev, meta_rev.as_deref());
tracing::info!(rev = %current_rev, "auto-update: scanning agents");
// Bump meta's hyperhive input up-front so the per-agent rebuilds
// below build against the new base. Failure here is logged but
// not fatal — individual rebuilds will surface concrete errors.
// Bump meta's hyperhive input up-front so per-agent rebuilds build
// against the latest base. Non-fatal on failure.
if let Err(e) = crate::meta::lock_update_hyperhive().await {
tracing::warn!(error = ?e, "auto-update: meta lock_update_hyperhive failed");
}
@ -239,27 +199,17 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
}
};
// Sequential, one agent at a time. Parallel rebuilds collide on
// nix-store's sqlite cache (the "sqlite db busy, not using
// cache" warning) and also race the meta-lock mutex; the
// resulting log interleave was bad enough on its own. Builds
// serialize on nix-daemon internally anyway, so this isn't a
// throughput loss in practice.
let current_rev =
current_flake_rev(&coord.hyperhive_flake).unwrap_or_default();
tracing::info!(agents = containers.len(), "auto-update: rebuilding all on startup");
for container in containers {
// Manager and sub-agents share the same lifecycle now; both go
// through rebuild_agent with name-derived paths.
let logical = if container == MANAGER_NAME {
Some(MANAGER_NAME.to_owned())
} else {
container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
};
let Some(name) = logical else {
continue;
};
if !agent_needs_update(&name, &current_rev) {
tracing::debug!(%name, "auto-update: up-to-date");
continue;
}
let Some(name) = logical else { continue };
if let Err(e) = rebuild_agent(&coord, &name, &current_rev).await {
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
}