From f99ed3fe7a802e23614c05549177964e9e2e16a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 13:43:32 +0200 Subject: [PATCH] manager: same lifecycle as agents; auto-spawn on hive-c0re start --- CLAUDE.md | 28 ++++++++++++ hive-c0re/src/actions.rs | 2 +- hive-c0re/src/auto_update.rs | 81 +++++++++++++++++------------------ hive-c0re/src/coordinator.rs | 15 +++++++ hive-c0re/src/dashboard.rs | 6 +-- hive-c0re/src/lifecycle.rs | 82 +++++++++++++++++++++++++++++++----- hive-c0re/src/main.rs | 15 +++++-- hive-c0re/src/server.rs | 4 +- 8 files changed, 168 insertions(+), 65 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ef10b1f..a2a0875 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,6 +156,34 @@ docs/damocles-migration.md options for moving damocles onto hyperhive marks them `failed` with note `"agent state dir missing"` so they fall out of `pending`. They stay in sqlite for audit. +## Manager (hm1nd) is hive-c0re-managed + +The manager container runs through the **same lifecycle as sub-agents** — +no separate code path. On `hive-c0re serve` startup, if `nixos-container +list` doesn't include `hm1nd`, hive-c0re creates it. The manager's flake +lives at `/var/lib/hyperhive/applied/hm1nd/`; its proposed (manager-editable) +config at `/var/lib/hyperhive/agents/hm1nd/config/`. Manager can edit its +own `agent.nix` (visible inside the container at `/agents/hm1nd/config/`), +commit, and submit `request-apply-commit hm1nd <commit>` for operator +approval — same flow as for sub-agents. + +Differences from sub-agents: +- `flake.nix` extends `hyperhive.nixosConfigurations.manager` (vs + `agent-base`). +- Container name is `hm1nd` (no `h-` prefix). +- Fixed web UI port (`MANAGER_PORT = 8000`). +- `set_nspawn_flags` adds an extra bind: `/var/lib/hyperhive/agents` → + `/agents` (RW), so the manager can edit per-agent proposed repos. +- First-deploy spawn bypasses the approval queue (manager is required infrastructure). 
+- Per-agent socket is the manager socket at `/run/hyperhive/manager/`, owned + by `manager_server::start`. `coordinator::ensure_runtime` returns that + path for manager and the usual `/run/hyperhive/agents/<name>/` for the + rest. + +**Migration note:** drop any `containers.hm1nd = { ... }` block from your +host NixOS config. hyperhive creates and updates the manager itself now. + ## Auto-update on startup `hive-c0re serve` runs `auto_update::run` in a background task right after diff --git a/hive-c0re/src/actions.rs b/hive-c0re/src/actions.rs index c365557..212e254 100644 --- a/hive-c0re/src/actions.rs +++ b/hive-c0re/src/actions.rs @@ -33,7 +33,7 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> { "approval: running action", ); - let agent_dir = coord.register_agent(&approval.agent)?; + let agent_dir = coord.ensure_runtime(&approval.agent)?; let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent); let applied_dir = Coordinator::agent_applied_dir(&approval.agent); let claude_dir = Coordinator::agent_claude_dir(&approval.agent); diff --git a/hive-c0re/src/auto_update.rs b/hive-c0re/src/auto_update.rs index 235004d..e970fda 100644 --- a/hive-c0re/src/auto_update.rs +++ b/hive-c0re/src/auto_update.rs @@ -11,8 +11,7 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; -use anyhow::{Context, Result, bail}; -use tokio::process::Command; +use anyhow::{Context, Result}; use crate::coordinator::Coordinator; use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; @@ -55,8 +54,8 @@ pub fn agent_needs_update(name: &str, current_rev: &str) -> bool { pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> { tracing::info!(%name, rev = %current_rev, "rebuild agent"); let agent_dir = coord - .register_agent(name) - .with_context(|| format!("register_agent {name}"))?; + .ensure_runtime(name) + .with_context(|| format!("ensure_runtime {name}"))?; let applied_dir = Coordinator::agent_applied_dir(name); let claude_dir = 
Coordinator::agent_claude_dir(name); lifecycle::rebuild( @@ -72,26 +71,34 @@ pub async fn rebuild_agent(coord: &Arc, name: &str, current_rev: &s Ok(()) } -/// Apply the manager's host-declared config: `nixos-container update hm1nd` -/// (no `--flake`) re-reads `/etc/nixos-containers/hm1nd.conf`, which the -/// host's `nixos-rebuild switch` rewrites to point at the new `SYSTEM_PATH`. -/// Idempotent when nothing has changed. -pub async fn rebuild_manager(current_rev: &str) -> Result<()> { - tracing::info!(rev = %current_rev, "rebuild manager (nixos-container update hm1nd)"); - let out = Command::new("nixos-container") - .args(["update", MANAGER_NAME]) - .output() - .await - .context("invoke nixos-container update hm1nd")?; - if !out.status.success() { - bail!( - "nixos-container update {MANAGER_NAME} failed ({}): {}", - out.status, - String::from_utf8_lossy(&out.stderr).trim() - ); + +/// Auto-create the manager container on startup if it isn't already there. +/// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no +/// longer declare `containers.hm1nd` in their host NixOS config. Bypasses +/// the approval queue — manager is required infrastructure. Idempotent. 
+pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> { + let existing = lifecycle::list().await.unwrap_or_default(); + if existing.iter().any(|c| c == MANAGER_NAME) { + tracing::debug!("manager container already present"); + return Ok(()); + } + tracing::info!("manager container missing — spawning"); + let runtime = coord.ensure_runtime(MANAGER_NAME)?; + let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME); + let applied = Coordinator::agent_applied_dir(MANAGER_NAME); + let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME); + lifecycle::spawn( + MANAGER_NAME, + &coord.hyperhive_flake, + &runtime, + &proposed, + &applied, + &claude_dir, + ) + .await?; + if let Some(rev) = current_flake_rev(&coord.hyperhive_flake) { + let _ = std::fs::write(rev_marker_path(MANAGER_NAME), rev); } - std::fs::write(rev_marker_path(MANAGER_NAME), current_rev) - .with_context(|| format!("write rev marker for {MANAGER_NAME}"))?; Ok(()) } @@ -117,16 +124,17 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> { }; let mut tasks = Vec::new(); - let mut manager_present = false; for container in containers { - if container == MANAGER_NAME { - manager_present = true; - continue; - } - let Some(name) = container.strip_prefix(AGENT_PREFIX) else { + // Manager and sub-agents share the same lifecycle now; both go + // through rebuild_agent with name-derived paths. + let logical = if container == MANAGER_NAME { + Some(MANAGER_NAME.to_owned()) + } else { + container.strip_prefix(AGENT_PREFIX).map(str::to_owned) + }; + let Some(name) = logical else { continue; }; - let name = name.to_owned(); if !agent_needs_update(&name, &current_rev) { tracing::debug!(%name, "auto-update: up-to-date"); continue; @@ -140,19 +148,6 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> { })); } - // Manager runs unconditionally when its marker differs: even if the host - // hasn't been rebuilt yet, `nixos-container update hm1nd` is a no-op, so - // there's no harm. 
The host's own activation already updates declarative - // containers — this is belt-and-braces for hive-c0re restarts. - if manager_present && agent_needs_update(MANAGER_NAME, &current_rev) { - let current_rev = current_rev.clone(); - tasks.push(tokio::spawn(async move { - if let Err(e) = rebuild_manager(&current_rev).await { - tracing::warn!(error = ?e, "auto-update: manager rebuild failed"); - } - })); - } - for t in tasks { let _ = t.await; } diff --git a/hive-c0re/src/coordinator.rs b/hive-c0re/src/coordinator.rs index 6bc2e06..8267ac5 100644 --- a/hive-c0re/src/coordinator.rs +++ b/hive-c0re/src/coordinator.rs @@ -118,6 +118,21 @@ impl Coordinator { Self::manager_dir().join("mcp.sock") } + /// Ensure a runtime dir + (for sub-agents) per-agent socket exists. For + /// the manager, `manager_server::start` owns the socket — just return + /// the dir. For sub-agents this is `register_agent` (creates a fresh + /// listener bound to `socket_path(name)`). Source directory of the + /// `/run/hive/mcp.sock` bind that ends up in `set_nspawn_flags`. + pub fn ensure_runtime(&self, name: &str) -> Result<PathBuf> { + if name == crate::lifecycle::MANAGER_NAME { + let dir = Self::manager_dir(); + std::fs::create_dir_all(&dir) + .with_context(|| format!("create manager dir {}", dir.display()))?; + return Ok(dir); + } + self.register_agent(name) + } + /// Per-agent state root (parent of `config/`, future `prompts/`, etc.). 
pub fn agent_state_root(name: &str) -> PathBuf { PathBuf::from(format!("{AGENT_STATE_ROOT}/{name}")) diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index 1450d34..1dc2971 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -172,11 +172,7 @@ async fn post_rebuild(State(state): State, AxumPath(name): AxumPath Redirect::to("/").into_response(), Err(e) => error_response(&format!("rebuild {name} failed: {e:#}")), diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index cc80066..110079e 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -10,9 +10,15 @@ use tokio::process::Command; /// name itself can be at most `MAX_AGENT_NAME` chars. pub const AGENT_PREFIX: &str = "h-"; pub const MAX_AGENT_NAME: usize = 9; -/// Container name of the manager (a separate slot from sub-agents). +/// Container name of the manager. Lives in the same path scheme as sub-agents +/// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`), +/// but its container has no `h-` prefix and extends a different +/// nixosConfiguration (`manager`, not `agent-base`). pub const MANAGER_NAME: &str = "hm1nd"; +/// Web UI port reserved for the manager (sub-agents hash into 8100..8999). +pub const MANAGER_PORT: u16 = 8000; + /// Mount point of the per-agent runtime directory inside the container. pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive"; @@ -35,9 +41,13 @@ const DEFAULT_MEMORY_MAX: &str = "2G"; const DEFAULT_CPU_QUOTA: &str = "50%"; /// Returns the per-agent web UI port. Same hash on both sides — manager, -/// dashboard, and agent harness all agree. +/// dashboard, and agent harness all agree. Manager is fixed at +/// `MANAGER_PORT`. 
#[must_use] pub fn agent_web_port(name: &str) -> u16 { + if name == MANAGER_NAME { + return MANAGER_PORT; + } let mut hash: u32 = 2_166_136_261; for b in name.bytes() { hash ^= u32::from(b); @@ -47,14 +57,34 @@ pub fn agent_web_port(name: &str) -> u16 { WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0) } +#[must_use] pub fn container_name(name: &str) -> String { - format!("{AGENT_PREFIX}{name}") + if name == MANAGER_NAME { + MANAGER_NAME.to_owned() + } else { + format!("{AGENT_PREFIX}{name}") + } +} + +#[must_use] +pub fn is_manager(name: &str) -> bool { + name == MANAGER_NAME +} + +/// The nixosConfiguration in the hyperhive flake the agent's `flake.nix` +/// extends. Manager → `manager`; everyone else → `agent-base`. +#[must_use] +pub fn flake_base(name: &str) -> &'static str { + if is_manager(name) { "manager" } else { "agent-base" } } fn validate(name: &str) -> Result<()> { if name.is_empty() { bail!("agent name must not be empty"); } + if is_manager(name) { + return Ok(()); + } if name.len() > MAX_AGENT_NAME { bail!( "agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}", @@ -180,14 +210,25 @@ pub async fn setup_applied(applied_dir: &Path, name: &str, hyperhive_flake: &str .with_context(|| format!("create {}", applied_dir.display()))?; let port = agent_web_port(name); + let base = flake_base(name); + let service = if is_manager(name) { + "hive-m1nd" + } else { + "hive-ag3nt" + }; + let description = if is_manager(name) { + format!("hyperhive manager {name}") + } else { + format!("hyperhive sub-agent {name}") + }; let flake_body = format!( r#"{{ - description = "hyperhive sub-agent {name}"; + description = "{description}"; inputs.hyperhive.url = "{hyperhive_flake}"; outputs = {{ hyperhive, ... 
}}: {{ - nixosConfigurations.default = hyperhive.nixosConfigurations.agent-base.extendModules {{ + nixosConfigurations.default = hyperhive.nixosConfigurations.{base}.extendModules {{ modules = [ ./agent.nix {{ @@ -198,7 +239,7 @@ pub async fn setup_applied(applied_dir: &Path, name: &str, hyperhive_flake: &str [init] defaultBranch = main ''; - systemd.services.hive-ag3nt.environment = {{ + systemd.services.{service}.environment = {{ HIVE_PORT = "{port}"; HIVE_LABEL = "{name}"; }}; @@ -372,14 +413,35 @@ async fn systemd_daemon_reload() -> Result<()> { /// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind). /// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the /// `systemd-nspawn` command. -fn set_nspawn_flags(container: &str, agent_dir: &Path, claude_dir: &Path) -> Result<()> { +/// Where in the container's filesystem the manager sees its agents tree. +/// Matches the `/agents` path that pre-Phase-8 hosts declared via +/// `containers.hm1nd.bindMounts."/agents"`. +pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents"; + +/// The on-host root that gets bind-mounted to `/agents` inside the manager. +/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (kept duplicated +/// here so lifecycle stays usable as a leaf module). 
+const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents"; + +fn set_nspawn_flags(container: &str, runtime_dir: &Path, claude_dir: &Path) -> Result<()> { let path = format!("/etc/nixos-containers/{container}.conf"); let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?; - let bind_flag = format!( - "EXTRA_NSPAWN_FLAGS=\"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}\"", - runtime = agent_dir.display(), + let mut binds = format!( + "--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}", + runtime = runtime_dir.display(), claude = claude_dir.display(), ); + if container == MANAGER_NAME { + // Manager edits sub-agent proposed/ repos and its own. RW so it can + // git-commit. Sub-agents see only their own /run/hive socket and + // /root/.claude (no /agents). + use std::fmt::Write as _; + let _ = write!( + binds, + " --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}" + ); + } + let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\""); let mut lines: Vec = original .lines() .filter(|line| { diff --git a/hive-c0re/src/main.rs b/hive-c0re/src/main.rs index d47f424..d27ac4c 100644 --- a/hive-c0re/src/main.rs +++ b/hive-c0re/src/main.rs @@ -86,16 +86,23 @@ async fn main() -> Result<()> { dashboard_port, } => { let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?); - // Run auto-update in the background — don't block service start. - // Operators sometimes need the admin socket up to debug a stuck - // agent, and the rebuild loop can take tens of seconds. + manager_server::start(coord.clone())?; + // Auto-create the manager container if it isn't there yet. Block + // on this — without hm1nd the system has no manager harness. + // Failures are logged but allowed: a broken auto-spawn shouldn't + // make the dashboard unreachable for debugging. 
+ if let Err(e) = auto_update::ensure_manager(&coord).await { + tracing::warn!(error = ?e, "auto-spawn manager failed"); + } + // Auto-update in the background — don't block service start. + // Sub-agent rebuilds can take tens of seconds; we want the admin + // socket up immediately. let update_coord = coord.clone(); tokio::spawn(async move { if let Err(e) = auto_update::run(update_coord).await { tracing::warn!(error = ?e, "auto-update task failed"); } }); - manager_server::start(coord.clone())?; let dash_coord = coord.clone(); tokio::spawn(async move { if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await { diff --git a/hive-c0re/src/server.rs b/hive-c0re/src/server.rs index 9a96030..afbc2f1 100644 --- a/hive-c0re/src/server.rs +++ b/hive-c0re/src/server.rs @@ -61,7 +61,7 @@ async fn dispatch(req: &HostRequest, coord: Arc) -> HostResponse { Ok(match req { HostRequest::Spawn { name } => { tracing::info!(%name, "spawn"); - let agent_dir = coord.register_agent(name)?; + let agent_dir = coord.ensure_runtime(name)?; let proposed_dir = Coordinator::agent_proposed_dir(name); let applied_dir = Coordinator::agent_applied_dir(name); let claude_dir = Coordinator::agent_claude_dir(name); @@ -101,7 +101,7 @@ async fn dispatch(req: &HostRequest, coord: Arc) -> HostResponse { } HostRequest::Rebuild { name } => { tracing::info!(%name, "rebuild"); - let agent_dir = coord.register_agent(name)?; + let agent_dir = coord.ensure_runtime(name)?; let applied_dir = Coordinator::agent_applied_dir(name); let claude_dir = Coordinator::agent_claude_dir(name); lifecycle::rebuild(