manager: same lifecycle as agents; auto-spawn on hive-c0re start

This commit is contained in:
müde 2026-05-15 13:43:32 +02:00
parent d81a845dbe
commit f99ed3fe7a
8 changed files with 168 additions and 65 deletions

View file

@ -156,6 +156,34 @@ docs/damocles-migration.md options for moving damocles onto hyperhive
marks them `failed` with note `"agent state dir missing"` so they fall out marks them `failed` with note `"agent state dir missing"` so they fall out
of `pending`. They stay in sqlite for audit. of `pending`. They stay in sqlite for audit.
## Manager (hm1nd) is hive-c0re-managed
The manager container runs through the **same lifecycle as sub-agents** —
no separate code path. On `hive-c0re serve` startup, if `nixos-container
list` doesn't include `hm1nd`, hive-c0re creates it. The manager's flake
lives at `/var/lib/hyperhive/applied/hm1nd/`; its proposed (manager-editable)
config at `/var/lib/hyperhive/agents/hm1nd/config/`. Manager can edit its
own `agent.nix` (visible inside the container at `/agents/hm1nd/config/`),
commit, and submit `request-apply-commit hm1nd <sha>` for operator
approval — same flow as for sub-agents.
Differences from sub-agents:
- `flake.nix` extends `hyperhive.nixosConfigurations.manager` (vs
`agent-base`).
- Container name is `hm1nd` (no `h-` prefix).
- Fixed web UI port (`MANAGER_PORT = 8000`).
- `set_nspawn_flags` adds an extra bind: `/var/lib/hyperhive/agents` →
`/agents` (RW), so the manager can edit per-agent proposed repos.
- First-deploy spawn bypasses the approval queue (manager is required
infrastructure).
- Per-agent socket is the manager socket at `/run/hyperhive/manager/`, owned
by `manager_server::start`. `coordinator::ensure_runtime` returns that
path for manager and the usual `/run/hyperhive/agents/<name>/` for the
rest.
**Migration note:** drop any `containers.hm1nd = { ... }` block from your
host NixOS config. hyperhive creates and updates the manager itself now.
## Auto-update on startup ## Auto-update on startup
`hive-c0re serve` runs `auto_update::run` in a background task right after `hive-c0re serve` runs `auto_update::run` in a background task right after

View file

@ -33,7 +33,7 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
"approval: running action", "approval: running action",
); );
let agent_dir = coord.register_agent(&approval.agent)?; let agent_dir = coord.ensure_runtime(&approval.agent)?;
let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent); let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent);
let applied_dir = Coordinator::agent_applied_dir(&approval.agent); let applied_dir = Coordinator::agent_applied_dir(&approval.agent);
let claude_dir = Coordinator::agent_claude_dir(&approval.agent); let claude_dir = Coordinator::agent_claude_dir(&approval.agent);

View file

@ -11,8 +11,7 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::Arc;
use anyhow::{Context, Result, bail}; use anyhow::{Context, Result};
use tokio::process::Command;
use crate::coordinator::Coordinator; use crate::coordinator::Coordinator;
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
@ -55,8 +54,8 @@ pub fn agent_needs_update(name: &str, current_rev: &str) -> bool {
pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> { pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> {
tracing::info!(%name, rev = %current_rev, "rebuild agent"); tracing::info!(%name, rev = %current_rev, "rebuild agent");
let agent_dir = coord let agent_dir = coord
.register_agent(name) .ensure_runtime(name)
.with_context(|| format!("register_agent {name}"))?; .with_context(|| format!("ensure_runtime {name}"))?;
let applied_dir = Coordinator::agent_applied_dir(name); let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name); let claude_dir = Coordinator::agent_claude_dir(name);
lifecycle::rebuild( lifecycle::rebuild(
@ -72,26 +71,34 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
Ok(()) Ok(())
} }
/// Apply the manager's host-declared config: `nixos-container update hm1nd`
/// (no `--flake`) re-reads `/etc/nixos-containers/hm1nd.conf`, which the /// Auto-create the manager container on startup if it isn't already there.
/// host's `nixos-rebuild switch` rewrites to point at the new `SYSTEM_PATH`. /// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no
/// Idempotent when nothing has changed. /// longer declare `containers.hm1nd` in their host NixOS config. Bypasses
pub async fn rebuild_manager(current_rev: &str) -> Result<()> { /// the approval queue — manager is required infrastructure. Idempotent.
tracing::info!(rev = %current_rev, "rebuild manager (nixos-container update hm1nd)"); pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
let out = Command::new("nixos-container") let existing = lifecycle::list().await.unwrap_or_default();
.args(["update", MANAGER_NAME]) if existing.iter().any(|c| c == MANAGER_NAME) {
.output() tracing::debug!("manager container already present");
.await return Ok(());
.context("invoke nixos-container update hm1nd")?; }
if !out.status.success() { tracing::info!("manager container missing — spawning");
bail!( let runtime = coord.ensure_runtime(MANAGER_NAME)?;
"nixos-container update {MANAGER_NAME} failed ({}): {}", let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME);
out.status, let applied = Coordinator::agent_applied_dir(MANAGER_NAME);
String::from_utf8_lossy(&out.stderr).trim() let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME);
); lifecycle::spawn(
MANAGER_NAME,
&coord.hyperhive_flake,
&runtime,
&proposed,
&applied,
&claude_dir,
)
.await?;
if let Some(rev) = current_flake_rev(&coord.hyperhive_flake) {
let _ = std::fs::write(rev_marker_path(MANAGER_NAME), rev);
} }
std::fs::write(rev_marker_path(MANAGER_NAME), current_rev)
.with_context(|| format!("write rev marker for {MANAGER_NAME}"))?;
Ok(()) Ok(())
} }
@ -117,16 +124,17 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
}; };
let mut tasks = Vec::new(); let mut tasks = Vec::new();
let mut manager_present = false;
for container in containers { for container in containers {
if container == MANAGER_NAME { // Manager and sub-agents share the same lifecycle now; both go
manager_present = true; // through rebuild_agent with name-derived paths.
continue; let logical = if container == MANAGER_NAME {
} Some(MANAGER_NAME.to_owned())
let Some(name) = container.strip_prefix(AGENT_PREFIX) else { } else {
container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
};
let Some(name) = logical else {
continue; continue;
}; };
let name = name.to_owned();
if !agent_needs_update(&name, &current_rev) { if !agent_needs_update(&name, &current_rev) {
tracing::debug!(%name, "auto-update: up-to-date"); tracing::debug!(%name, "auto-update: up-to-date");
continue; continue;
@ -140,19 +148,6 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
})); }));
} }
// Manager runs unconditionally when its marker differs: even if the host
// hasn't been rebuilt yet, `nixos-container update hm1nd` is a no-op, so
// there's no harm. The host's own activation already updates declarative
// containers — this is belt-and-braces for hive-c0re restarts.
if manager_present && agent_needs_update(MANAGER_NAME, &current_rev) {
let current_rev = current_rev.clone();
tasks.push(tokio::spawn(async move {
if let Err(e) = rebuild_manager(&current_rev).await {
tracing::warn!(error = ?e, "auto-update: manager rebuild failed");
}
}));
}
for t in tasks { for t in tasks {
let _ = t.await; let _ = t.await;
} }

View file

@ -118,6 +118,21 @@ impl Coordinator {
Self::manager_dir().join("mcp.sock") Self::manager_dir().join("mcp.sock")
} }
/// Resolve (and create if needed) the runtime directory for `name`.
///
/// * Manager (`MANAGER_NAME`): the manager directory is created if it does
///   not exist yet and returned. No listener is bound here — per the
///   original note, `manager_server::start` owns the manager socket.
/// * Any other name: delegates to `register_agent`, which is responsible
///   for the per-agent socket at `socket_path(name)`.
///
/// The returned directory is the source side of the `/run/hive` bind that
/// `set_nspawn_flags` writes into the container config.
///
/// # Errors
///
/// Fails when the manager directory cannot be created, or when
/// `register_agent` fails for a sub-agent.
pub fn ensure_runtime(&self, name: &str) -> Result<PathBuf> {
    // Sub-agents keep their existing registration path untouched.
    if name != crate::lifecycle::MANAGER_NAME {
        return self.register_agent(name);
    }

    let manager_dir = Self::manager_dir();
    std::fs::create_dir_all(&manager_dir)
        .with_context(|| format!("create manager dir {}", manager_dir.display()))?;
    Ok(manager_dir)
}
/// Per-agent state root (parent of `config/`, future `prompts/`, etc.). /// Per-agent state root (parent of `config/`, future `prompts/`, etc.).
pub fn agent_state_root(name: &str) -> PathBuf { pub fn agent_state_root(name: &str) -> PathBuf {
PathBuf::from(format!("{AGENT_STATE_ROOT}/{name}")) PathBuf::from(format!("{AGENT_STATE_ROOT}/{name}"))

View file

@ -172,11 +172,7 @@ async fn post_rebuild(State(state): State<AppState>, AxumPath(name): AxumPath<St
"rebuild: hyperhive_flake has no canonical path; manual rebuild only via `hive-c0re rebuild`", "rebuild: hyperhive_flake has no canonical path; manual rebuild only via `hive-c0re rebuild`",
); );
}; };
let result = if name == lifecycle::MANAGER_NAME { let result = crate::auto_update::rebuild_agent(&state.coord, &name, &current_rev).await;
crate::auto_update::rebuild_manager(&current_rev).await
} else {
crate::auto_update::rebuild_agent(&state.coord, &name, &current_rev).await
};
match result { match result {
Ok(()) => Redirect::to("/").into_response(), Ok(()) => Redirect::to("/").into_response(),
Err(e) => error_response(&format!("rebuild {name} failed: {e:#}")), Err(e) => error_response(&format!("rebuild {name} failed: {e:#}")),

View file

@ -10,9 +10,15 @@ use tokio::process::Command;
/// name itself can be at most `MAX_AGENT_NAME` chars. /// name itself can be at most `MAX_AGENT_NAME` chars.
pub const AGENT_PREFIX: &str = "h-"; pub const AGENT_PREFIX: &str = "h-";
pub const MAX_AGENT_NAME: usize = 9; pub const MAX_AGENT_NAME: usize = 9;
/// Container name of the manager (a separate slot from sub-agents). /// Container name of the manager. Lives in the same path scheme as sub-agents
/// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`),
/// but its container has no `h-` prefix and extends a different
/// nixosConfiguration (`manager`, not `agent-base`).
pub const MANAGER_NAME: &str = "hm1nd"; pub const MANAGER_NAME: &str = "hm1nd";
/// Web UI port reserved for the manager (sub-agents hash into 8100..8999).
pub const MANAGER_PORT: u16 = 8000;
/// Mount point of the per-agent runtime directory inside the container. /// Mount point of the per-agent runtime directory inside the container.
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive"; pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
@ -35,9 +41,13 @@ const DEFAULT_MEMORY_MAX: &str = "2G";
const DEFAULT_CPU_QUOTA: &str = "50%"; const DEFAULT_CPU_QUOTA: &str = "50%";
/// Returns the per-agent web UI port. Same hash on both sides — manager, /// Returns the per-agent web UI port. Same hash on both sides — manager,
/// dashboard, and agent harness all agree. /// dashboard, and agent harness all agree. Manager is fixed at
/// `MANAGER_PORT`.
#[must_use] #[must_use]
pub fn agent_web_port(name: &str) -> u16 { pub fn agent_web_port(name: &str) -> u16 {
if name == MANAGER_NAME {
return MANAGER_PORT;
}
let mut hash: u32 = 2_166_136_261; let mut hash: u32 = 2_166_136_261;
for b in name.bytes() { for b in name.bytes() {
hash ^= u32::from(b); hash ^= u32::from(b);
@ -47,14 +57,34 @@ pub fn agent_web_port(name: &str) -> u16 {
WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0) WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
} }
#[must_use]
pub fn container_name(name: &str) -> String { pub fn container_name(name: &str) -> String {
format!("{AGENT_PREFIX}{name}") if name == MANAGER_NAME {
MANAGER_NAME.to_owned()
} else {
format!("{AGENT_PREFIX}{name}")
}
}
/// Reports whether `name` is the manager's name (`MANAGER_NAME`) rather
/// than a sub-agent's. The comparison is exact and case-sensitive.
#[must_use]
pub fn is_manager(name: &str) -> bool {
    MANAGER_NAME == name
}
/// The nixosConfiguration in the hyperhive flake the agent's `flake.nix`
/// extends. Manager → `manager`; everyone else → `agent-base`.
#[must_use]
pub fn flake_base(name: &str) -> &'static str {
if is_manager(name) { "manager" } else { "agent-base" }
} }
fn validate(name: &str) -> Result<()> { fn validate(name: &str) -> Result<()> {
if name.is_empty() { if name.is_empty() {
bail!("agent name must not be empty"); bail!("agent name must not be empty");
} }
if is_manager(name) {
return Ok(());
}
if name.len() > MAX_AGENT_NAME { if name.len() > MAX_AGENT_NAME {
bail!( bail!(
"agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}", "agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}",
@ -180,14 +210,25 @@ pub async fn setup_applied(applied_dir: &Path, name: &str, hyperhive_flake: &str
.with_context(|| format!("create {}", applied_dir.display()))?; .with_context(|| format!("create {}", applied_dir.display()))?;
let port = agent_web_port(name); let port = agent_web_port(name);
let base = flake_base(name);
let service = if is_manager(name) {
"hive-m1nd"
} else {
"hive-ag3nt"
};
let description = if is_manager(name) {
format!("hyperhive manager {name}")
} else {
format!("hyperhive sub-agent {name}")
};
let flake_body = format!( let flake_body = format!(
r#"{{ r#"{{
description = "hyperhive sub-agent {name}"; description = "{description}";
inputs.hyperhive.url = "{hyperhive_flake}"; inputs.hyperhive.url = "{hyperhive_flake}";
outputs = outputs =
{{ hyperhive, ... }}: {{ hyperhive, ... }}:
{{ {{
nixosConfigurations.default = hyperhive.nixosConfigurations.agent-base.extendModules {{ nixosConfigurations.default = hyperhive.nixosConfigurations.{base}.extendModules {{
modules = [ modules = [
./agent.nix ./agent.nix
{{ {{
@ -198,7 +239,7 @@ pub async fn setup_applied(applied_dir: &Path, name: &str, hyperhive_flake: &str
[init] [init]
defaultBranch = main defaultBranch = main
''; '';
systemd.services.hive-ag3nt.environment = {{ systemd.services.{service}.environment = {{
HIVE_PORT = "{port}"; HIVE_PORT = "{port}";
HIVE_LABEL = "{name}"; HIVE_LABEL = "{name}";
}}; }};
@ -372,14 +413,35 @@ async fn systemd_daemon_reload() -> Result<()> {
/// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind). /// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind).
/// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the /// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
/// `systemd-nspawn` command. /// `systemd-nspawn` command.
fn set_nspawn_flags(container: &str, agent_dir: &Path, claude_dir: &Path) -> Result<()> { /// Where in the container's filesystem the manager sees its agents tree.
/// Matches the `/agents` path that pre-Phase-8 hosts declared via
/// `containers.hm1nd.bindMounts."/agents"`.
pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents";
/// The on-host root that gets bind-mounted to `/agents` inside the manager.
/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (kept duplicated
/// here so lifecycle stays usable as a leaf module).
const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents";
fn set_nspawn_flags(container: &str, runtime_dir: &Path, claude_dir: &Path) -> Result<()> {
let path = format!("/etc/nixos-containers/{container}.conf"); let path = format!("/etc/nixos-containers/{container}.conf");
let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?; let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;
let bind_flag = format!( let mut binds = format!(
"EXTRA_NSPAWN_FLAGS=\"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}\"", "--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}",
runtime = agent_dir.display(), runtime = runtime_dir.display(),
claude = claude_dir.display(), claude = claude_dir.display(),
); );
if container == MANAGER_NAME {
// Manager edits sub-agent proposed/ repos and its own. RW so it can
// git-commit. Sub-agents see only their own /run/hive socket and
// /root/.claude (no /agents).
use std::fmt::Write as _;
let _ = write!(
binds,
" --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}"
);
}
let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\"");
let mut lines: Vec<String> = original let mut lines: Vec<String> = original
.lines() .lines()
.filter(|line| { .filter(|line| {

View file

@ -86,16 +86,23 @@ async fn main() -> Result<()> {
dashboard_port, dashboard_port,
} => { } => {
let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?); let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?);
// Run auto-update in the background — don't block service start. manager_server::start(coord.clone())?;
// Operators sometimes need the admin socket up to debug a stuck // Auto-create the manager container if it isn't there yet. Block
// agent, and the rebuild loop can take tens of seconds. // on this — without hm1nd the system has no manager harness.
// Failures are logged but allowed: a broken auto-spawn shouldn't
// make the dashboard unreachable for debugging.
if let Err(e) = auto_update::ensure_manager(&coord).await {
tracing::warn!(error = ?e, "auto-spawn manager failed");
}
// Auto-update in the background — don't block service start.
// Sub-agent rebuilds can take tens of seconds; we want the admin
// socket up immediately.
let update_coord = coord.clone(); let update_coord = coord.clone();
tokio::spawn(async move { tokio::spawn(async move {
if let Err(e) = auto_update::run(update_coord).await { if let Err(e) = auto_update::run(update_coord).await {
tracing::warn!(error = ?e, "auto-update task failed"); tracing::warn!(error = ?e, "auto-update task failed");
} }
}); });
manager_server::start(coord.clone())?;
let dash_coord = coord.clone(); let dash_coord = coord.clone();
tokio::spawn(async move { tokio::spawn(async move {
if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await { if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await {

View file

@ -61,7 +61,7 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
Ok(match req { Ok(match req {
HostRequest::Spawn { name } => { HostRequest::Spawn { name } => {
tracing::info!(%name, "spawn"); tracing::info!(%name, "spawn");
let agent_dir = coord.register_agent(name)?; let agent_dir = coord.ensure_runtime(name)?;
let proposed_dir = Coordinator::agent_proposed_dir(name); let proposed_dir = Coordinator::agent_proposed_dir(name);
let applied_dir = Coordinator::agent_applied_dir(name); let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name); let claude_dir = Coordinator::agent_claude_dir(name);
@ -101,7 +101,7 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
} }
HostRequest::Rebuild { name } => { HostRequest::Rebuild { name } => {
tracing::info!(%name, "rebuild"); tracing::info!(%name, "rebuild");
let agent_dir = coord.register_agent(name)?; let agent_dir = coord.ensure_runtime(name)?;
let applied_dir = Coordinator::agent_applied_dir(name); let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name); let claude_dir = Coordinator::agent_claude_dir(name);
lifecycle::rebuild( lifecycle::rebuild(