manager: same lifecycle as agents; auto-spawn on hive-c0re start

This commit is contained in:
müde 2026-05-15 13:43:32 +02:00
parent d81a845dbe
commit f99ed3fe7a
8 changed files with 168 additions and 65 deletions

View file

@ -11,8 +11,7 @@
use std::path::{Path, PathBuf};
use std::sync::Arc;
use anyhow::{Context, Result, bail};
use tokio::process::Command;
use anyhow::{Context, Result};
use crate::coordinator::Coordinator;
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
@ -55,8 +54,8 @@ pub fn agent_needs_update(name: &str, current_rev: &str) -> bool {
pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> {
tracing::info!(%name, rev = %current_rev, "rebuild agent");
let agent_dir = coord
.register_agent(name)
.with_context(|| format!("register_agent {name}"))?;
.ensure_runtime(name)
.with_context(|| format!("ensure_runtime {name}"))?;
let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name);
lifecycle::rebuild(
@ -72,26 +71,34 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
Ok(())
}
/// Apply the manager's host-declared config: `nixos-container update hm1nd`
/// (no `--flake`) re-reads `/etc/nixos-containers/hm1nd.conf`, which the
/// host's `nixos-rebuild switch` rewrites to point at the new `SYSTEM_PATH`.
/// Idempotent when nothing has changed.
pub async fn rebuild_manager(current_rev: &str) -> Result<()> {
tracing::info!(rev = %current_rev, "rebuild manager (nixos-container update hm1nd)");
let out = Command::new("nixos-container")
.args(["update", MANAGER_NAME])
.output()
.await
.context("invoke nixos-container update hm1nd")?;
if !out.status.success() {
bail!(
"nixos-container update {MANAGER_NAME} failed ({}): {}",
out.status,
String::from_utf8_lossy(&out.stderr).trim()
);
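/// Auto-create the manager container on startup if it isn't already there.
/// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no
/// longer declare `containers.hm1nd` in their host NixOS config. Bypasses
/// the approval queue because the manager is required infrastructure.
/// Idempotent: returns early when `MANAGER_NAME` is already in the container
/// list. If that list cannot be obtained it is treated as empty, so a spawn
/// is attempted; the rev-marker write afterwards is best-effort (its error
/// is ignored).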
pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
let existing = lifecycle::list().await.unwrap_or_default();
if existing.iter().any(|c| c == MANAGER_NAME) {
tracing::debug!("manager container already present");
return Ok(());
}
tracing::info!("manager container missing — spawning");
let runtime = coord.ensure_runtime(MANAGER_NAME)?;
let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME);
let applied = Coordinator::agent_applied_dir(MANAGER_NAME);
let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME);
lifecycle::spawn(
MANAGER_NAME,
&coord.hyperhive_flake,
&runtime,
&proposed,
&applied,
&claude_dir,
)
.await?;
if let Some(rev) = current_flake_rev(&coord.hyperhive_flake) {
let _ = std::fs::write(rev_marker_path(MANAGER_NAME), rev);
}
std::fs::write(rev_marker_path(MANAGER_NAME), current_rev)
.with_context(|| format!("write rev marker for {MANAGER_NAME}"))?;
Ok(())
}
@ -117,16 +124,17 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
};
let mut tasks = Vec::new();
let mut manager_present = false;
for container in containers {
if container == MANAGER_NAME {
manager_present = true;
continue;
}
let Some(name) = container.strip_prefix(AGENT_PREFIX) else {
// Manager and sub-agents share the same lifecycle now; both go
// through rebuild_agent with name-derived paths.
let logical = if container == MANAGER_NAME {
Some(MANAGER_NAME.to_owned())
} else {
container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
};
let Some(name) = logical else {
continue;
};
let name = name.to_owned();
if !agent_needs_update(&name, &current_rev) {
tracing::debug!(%name, "auto-update: up-to-date");
continue;
@ -140,19 +148,6 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
}));
}
// Manager runs unconditionally when its marker differs: even if the host
// hasn't been rebuilt yet, `nixos-container update hm1nd` is a no-op, so
// there's no harm. The host's own activation already updates declarative
// containers — this is belt-and-braces for hive-c0re restarts.
if manager_present && agent_needs_update(MANAGER_NAME, &current_rev) {
let current_rev = current_rev.clone();
tasks.push(tokio::spawn(async move {
if let Err(e) = rebuild_manager(&current_rev).await {
tracing::warn!(error = ?e, "auto-update: manager rebuild failed");
}
}));
}
for t in tasks {
let _ = t.await;
}