Agents weren't being woken with the 'you were rebuilt — check /state/ for notes, --continue intact' system message after several recent rebuild surfaces:

- auto_update::rebuild_agent — used by the dashboard rebuild button, admin-CLI rebuild via lifecycle_action, the startup rev-scan, AND the new meta-input update batch loop. The kick moves *into* rebuild_agent's success arm so all four paths benefit. (The dashboard's lifecycle_action extra closure was already firing kick — now it's a no-op for the rebuild path since rebuild_agent does it.)
- actions::run_apply_commit — the apply-commit approve flow built + tagged deployed/<id> but never kicked. Add a kick on success with the more specific 'config update applied' hint.
- server.rs::HostRequest::Rebuild — the admin-CLI direct path calls lifecycle::rebuild, bypassing rebuild_agent. Add a kick on success.

The dashboard's restart / start lifecycle_action extras still kick via their own closures since they don't route through rebuild_agent. stop / kill / destroy intentionally don't kick — there's nothing to wake.
218 lines
8.8 KiB
Rust
218 lines
8.8 KiB
Rust
//! Startup auto-update: on `hive-c0re serve` boot, rebuild any sub-agent
//! container whose recorded "hyperhive rev" differs from the current one,
//! then write the new rev as the marker. Skips rebuild when nothing changed
//! so warm restarts are near-free.
//!
//! "Rev" is the canonical filesystem path of the configured hyperhive flake
//! (e.g. `/nix/store/<hash>-source` when `/etc/hyperhive` is a symlink the
//! NixOS module wires up). For non-path flake URLs we don't have a cheap rev
//! signal, so auto-update is a no-op — operators rebuild manually.
|
|
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::Arc;
|
|
|
|
use anyhow::{Context, Result};
|
|
|
|
use crate::coordinator::Coordinator;
|
|
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
|
|
|
|
/// Marker file recording the hyperhive rev a sub-agent's container was last
/// built against. Sibling of `applied/<name>/` (rather than inside it) to
/// keep it out of the applied repo's git history. Uses a leading dot so a
/// glob over `applied/*` doesn't include it.
///
/// The returned path is `/var/lib/hyperhive/applied/.<name>.hyperhive-rev`.
/// This function only computes the path; it does not touch the filesystem.
#[must_use]
pub fn rev_marker_path(name: &str) -> PathBuf {
    // Build the path by joining components rather than formatting one big
    // string; only the file name itself is string-formatted.
    Path::new("/var/lib/hyperhive/applied").join(format!(".{name}.hyperhive-rev"))
}
|
|
|
|
/// Resolve the current rev of `hyperhive_flake`. For a path on disk we
/// canonicalize (following symlinks) so a /etc/hyperhive → /nix/store/...
/// update yields a different string. For anything else we return None.
///
/// Returns `Some(canonical_path)` when `hyperhive_flake` names an existing
/// filesystem path that can be canonicalized, and `None` otherwise (missing
/// path, non-path flake URL, or any canonicalization error).
#[must_use]
pub fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
    // `canonicalize` already errors for paths that do not exist, so a
    // separate `exists()` pre-check would only add a second syscall and a
    // check-then-use gap without changing the result.
    std::fs::canonicalize(hyperhive_flake)
        .ok()
        .map(|resolved| resolved.display().to_string())
}
|
|
|
|
/// Read the marker for `name` and return whether the recorded rev matches
|
|
/// `current_rev`. Missing/unreadable marker counts as out-of-date.
|
|
#[must_use]
|
|
pub fn agent_needs_update(name: &str, current_rev: &str) -> bool {
|
|
let prev = std::fs::read_to_string(rev_marker_path(name))
|
|
.ok()
|
|
.map(|s| s.trim().to_owned());
|
|
prev.as_deref() != Some(current_rev)
|
|
}
|
|
|
|
/// Rebuild one sub-agent and refresh its marker. Used by both the startup
|
|
/// scanner and the dashboard's manual "update" button so the two paths
|
|
/// can't diverge.
|
|
pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> {
|
|
tracing::info!(%name, rev = %current_rev, "rebuild agent");
|
|
let agent_dir = coord
|
|
.ensure_runtime(name)
|
|
.with_context(|| format!("ensure_runtime {name}"))?;
|
|
let applied_dir = Coordinator::agent_applied_dir(name);
|
|
let claude_dir = Coordinator::agent_claude_dir(name);
|
|
let notes_dir = Coordinator::agent_notes_dir(name);
|
|
// Suppress crash_watch during the stop+start window inside
|
|
// lifecycle::rebuild. Dashboard rebuilds already do this via
|
|
// lifecycle_action; this catches the auto-update scan + any
|
|
// other direct caller.
|
|
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
|
|
let result = lifecycle::rebuild(
|
|
name,
|
|
&coord.hyperhive_flake,
|
|
&agent_dir,
|
|
&applied_dir,
|
|
&claude_dir,
|
|
¬es_dir,
|
|
coord.dashboard_port,
|
|
&coord.operator_pronouns,
|
|
)
|
|
.await;
|
|
coord.clear_transient(name);
|
|
match &result {
|
|
Ok(()) => {
|
|
if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {
|
|
tracing::warn!(%name, error = ?e, "write rev marker failed");
|
|
}
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt {
|
|
agent: name.to_owned(),
|
|
ok: true,
|
|
note: None,
|
|
sha: None,
|
|
tag: None,
|
|
});
|
|
// Wake the agent on its next turn so claude sees a
|
|
// "you were rebuilt — check /state/ for notes, --continue
|
|
// session intact" hint. Covers dashboard rebuild, admin
|
|
// CLI rebuild, auto-update startup scan, and the
|
|
// dashboard's meta-input update path — all of which
|
|
// route through rebuild_agent.
|
|
coord.kick_agent(name, "container rebuilt");
|
|
}
|
|
Err(e) => {
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt {
|
|
agent: name.to_owned(),
|
|
ok: false,
|
|
note: Some(format!("{e:#}")),
|
|
sha: None,
|
|
tag: None,
|
|
});
|
|
}
|
|
}
|
|
result
|
|
}
|
|
|
|
/// Auto-create the manager container on startup if it isn't already there.
|
|
/// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no
|
|
/// longer declare `containers.hm1nd` in their host NixOS config. Bypasses
|
|
/// the approval queue — manager is required infrastructure. Idempotent.
|
|
pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
|
|
let existing = lifecycle::list().await.unwrap_or_default();
|
|
let current_rev = current_flake_rev(&coord.hyperhive_flake);
|
|
if existing.iter().any(|c| c == MANAGER_NAME) {
|
|
// Container exists already. If it predates the unified lifecycle
|
|
// (no applied flake on disk) we must rebuild — otherwise it's
|
|
// running whatever the host-declarative config was at create
|
|
// time, with a wrong systemd unit and port.
|
|
let applied_flake = Coordinator::agent_applied_dir(MANAGER_NAME).join("flake.nix");
|
|
if !applied_flake.exists()
|
|
&& let Some(rev) = current_rev.as_ref()
|
|
{
|
|
tracing::warn!(
|
|
"manager container exists but no applied flake — forcing rebuild to migrate"
|
|
);
|
|
let coord_clone = coord.clone();
|
|
if let Err(e) = rebuild_agent(&coord_clone, MANAGER_NAME, rev).await {
|
|
tracing::warn!(error = ?e, "manager migration rebuild failed");
|
|
}
|
|
} else {
|
|
tracing::debug!("manager container already present");
|
|
}
|
|
return Ok(());
|
|
}
|
|
tracing::info!("manager container missing — spawning");
|
|
let runtime = coord.ensure_runtime(MANAGER_NAME)?;
|
|
let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME);
|
|
let applied = Coordinator::agent_applied_dir(MANAGER_NAME);
|
|
let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME);
|
|
let notes_dir = Coordinator::agent_notes_dir(MANAGER_NAME);
|
|
lifecycle::spawn(
|
|
MANAGER_NAME,
|
|
&coord.hyperhive_flake,
|
|
&runtime,
|
|
&proposed,
|
|
&applied,
|
|
&claude_dir,
|
|
¬es_dir,
|
|
coord.dashboard_port,
|
|
&coord.operator_pronouns,
|
|
)
|
|
.await?;
|
|
if let Some(rev) = current_rev {
|
|
let _ = std::fs::write(rev_marker_path(MANAGER_NAME), rev);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Rebuild every sub-agent whose marker differs from the current rev. Logs
|
|
/// per-agent outcomes and continues past failures. Returns Ok even if some
|
|
/// rebuilds failed — startup shouldn't be blocked by a broken agent.
|
|
pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
|
|
let Some(current_rev) = current_flake_rev(&coord.hyperhive_flake) else {
|
|
tracing::info!(
|
|
flake = %coord.hyperhive_flake,
|
|
"auto-update: hyperhive_flake has no canonical path; skipping",
|
|
);
|
|
return Ok(());
|
|
};
|
|
tracing::info!(rev = %current_rev, "auto-update: scanning agents");
|
|
|
|
// Bump meta's hyperhive input up-front so the per-agent rebuilds
|
|
// below build against the new base. Failure here is logged but
|
|
// not fatal — individual rebuilds will surface concrete errors.
|
|
if let Err(e) = crate::meta::lock_update_hyperhive().await {
|
|
tracing::warn!(error = ?e, "auto-update: meta lock_update_hyperhive failed");
|
|
}
|
|
|
|
let containers = match lifecycle::list().await {
|
|
Ok(c) => c,
|
|
Err(e) => {
|
|
tracing::warn!(error = ?e, "auto-update: nixos-container list failed");
|
|
return Ok(());
|
|
}
|
|
};
|
|
|
|
// Sequential, one agent at a time. Parallel rebuilds collide on
|
|
// nix-store's sqlite cache (the "sqlite db busy, not using
|
|
// cache" warning) and also race the meta-lock mutex; the
|
|
// resulting log interleave was bad enough on its own. Builds
|
|
// serialize on nix-daemon internally anyway, so this isn't a
|
|
// throughput loss in practice.
|
|
for container in containers {
|
|
// Manager and sub-agents share the same lifecycle now; both go
|
|
// through rebuild_agent with name-derived paths.
|
|
let logical = if container == MANAGER_NAME {
|
|
Some(MANAGER_NAME.to_owned())
|
|
} else {
|
|
container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
|
|
};
|
|
let Some(name) = logical else {
|
|
continue;
|
|
};
|
|
if !agent_needs_update(&name, ¤t_rev) {
|
|
tracing::debug!(%name, "auto-update: up-to-date");
|
|
continue;
|
|
}
|
|
if let Err(e) = rebuild_agent(&coord, &name, ¤t_rev).await {
|
|
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|