//! `nixos-container` lifecycle + per-agent config flake generation. use std::path::Path; use anyhow::{Context, Result, bail}; use tokio::process::Command; /// Sub-agent container prefix. `nixos-container` caps the total container name /// at 11 chars (it gets encoded into network interface names), so the agent /// name itself can be at most `MAX_AGENT_NAME` chars. pub const AGENT_PREFIX: &str = "h-"; pub const MAX_AGENT_NAME: usize = 9; /// Container name of the manager. Lives in the same path scheme as sub-agents /// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`), /// but its container has no `h-` prefix and extends a different /// nixosConfiguration (`manager`, not `agent-base`). pub const MANAGER_NAME: &str = "hm1nd"; /// Web UI port reserved for the manager (sub-agents hash into 8100..8999). pub const MANAGER_PORT: u16 = 8000; /// Mount point of the per-agent runtime directory inside the container. pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive"; /// Mount point of the per-agent Claude credentials dir inside the container. /// Persistent across destroy/recreate so OAuth login survives. pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude"; /// Mount point of the per-agent durable knowledge dir inside the container. /// Agents are told (system prompt) to keep `notes.md` and any other scratch /// state here; persists across destroy/recreate. pub const CONTAINER_NOTES_MOUNT: &str = "/state"; const GIT_NAME: &str = "hive-c0re"; const GIT_EMAIL: &str = "hive-c0re@hyperhive"; /// Sub-agent web UI port range. Deterministic from the agent's name (FNV-1a /// hash mod range size), so the dashboard can compute the same port without /// asking hive-c0re. const WEB_PORT_BASE: u16 = 8100; const WEB_PORT_RANGE: u16 = 900; /// Default resource caps applied to every managed container via a systemd /// drop-in under `/run/systemd/system/container@.service.d/`. 
const DEFAULT_MEMORY_MAX: &str = "2G";
/// Default CPU cap, written as `CPUQuota=` next to `DEFAULT_MEMORY_MAX`.
const DEFAULT_CPU_QUOTA: &str = "50%";

/// Per-agent web UI port. Manager is fixed at `MANAGER_PORT`; every
/// sub-agent is `WEB_PORT_BASE + FNV-1a(name) % WEB_PORT_RANGE`,
/// pure and reproducible from just the name. Collisions are
/// possible (birthday paradox at ~30 agents); the operator resolves
/// them by renaming an agent (different hash → different port).
/// Stable across hosts, restarts, and dashboard renders — no
/// state-file dance.
#[must_use]
pub fn agent_web_port(name: &str) -> u16 {
    // 32-bit FNV-1a parameters.
    const FNV_OFFSET_BASIS: u32 = 2_166_136_261;
    const FNV_PRIME: u32 = 16_777_619;

    if name == MANAGER_NAME {
        return MANAGER_PORT;
    }
    let hash = name.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
    });
    // The remainder is strictly below WEB_PORT_RANGE (a u16), so the
    // conversion cannot fail; `unwrap_or(0)` only avoids a panic path.
    WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
}

/// Name of the `nixos-container` backing agent `name`: the manager keeps its
/// bare name, every sub-agent gets `AGENT_PREFIX` prepended.
#[must_use]
pub fn container_name(name: &str) -> String {
    if name == MANAGER_NAME {
        MANAGER_NAME.to_owned()
    } else {
        format!("{AGENT_PREFIX}{name}")
    }
}

/// True when `name` is the manager's reserved name.
#[must_use]
pub fn is_manager(name: &str) -> bool {
    name == MANAGER_NAME
}

/// The nixosConfiguration in the hyperhive flake the agent's `flake.nix`
/// extends. Manager → `manager`; everyone else → `agent-base`.
#[must_use]
pub fn flake_base(name: &str) -> &'static str {
    if is_manager(name) {
        "manager"
    } else {
        "agent-base"
    }
}

/// Reject agent names that cannot safely become a container name.
///
/// The name ends up inside generated Nix source, systemd unit names and
/// filesystem paths, so besides the length cap it is restricted to ASCII
/// letters, digits, `-` and `_`. The manager's fixed name is always accepted.
///
/// # Errors
/// Fails when `name` is empty, contains a character outside the allowed set,
/// or is longer than `MAX_AGENT_NAME`.
fn validate(name: &str) -> Result<()> {
    if name.is_empty() {
        bail!("agent name must not be empty");
    }
    if is_manager(name) {
        return Ok(());
    }
    // Checked before the length so that, below, byte length == char count
    // and the "chars" in the error message is accurate.
    if let Some(bad) = name
        .chars()
        .find(|c| !(c.is_ascii_alphanumeric() || matches!(c, '-' | '_')))
    {
        bail!(
            "agent name '{name}' contains invalid character {bad:?}; \
             allowed: ASCII letters, digits, '-' and '_'"
        );
    }
    if name.len() > MAX_AGENT_NAME {
        bail!(
            "agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}",
            name.len()
        );
    }
    Ok(())
}

/// First name (≠ `self_name`) currently running whose hashed port
/// matches this agent's. The harness inside the colliding container
/// would otherwise loop on `AddrInUse` forever; we surface the
/// conflict here so spawn / rebuild fails loudly with an actionable
/// message instead.
async fn port_collision(self_name: &str) -> Option<String> {
    let port = agent_web_port(self_name);
    // Best-effort: if the container list is unavailable the check is skipped,
    // but the failure is logged rather than silently discarded.
    let containers = match list().await {
        Ok(found) => found,
        Err(e) => {
            tracing::warn!(error = ?e, "could not list containers; skipping port collision check");
            return None;
        }
    };
    for c in containers {
        // Map the container name back to the agent name it belongs to.
        let other: &str = if c == MANAGER_NAME {
            MANAGER_NAME
        } else if let Some(stripped) = c.strip_prefix(AGENT_PREFIX) {
            stripped
        } else {
            continue;
        };
        if other == self_name {
            continue;
        }
        // Only a *running* container actually holds the port.
        if agent_web_port(other) == port && is_running(other).await {
            return Some(other.to_owned());
        }
    }
    None
}

/// Create and start the container for agent `name`.
///
/// Validates the name, refuses to proceed on a web-port collision, seeds the
/// proposed and applied repos, makes sure the credentials and notes dirs
/// exist, then runs `nixos-container create`, writes the nspawn flags and
/// resource-limit drop-in, reloads systemd and starts the container.
///
/// # Errors
/// Fails on an invalid name, a port collision, or any failing setup step or
/// `nixos-container` / `systemctl` invocation.
#[allow(clippy::too_many_arguments)]
pub async fn spawn(
    name: &str,
    hyperhive_flake: &str,
    agent_dir: &Path,
    proposed_dir: &Path,
    applied_dir: &Path,
    claude_dir: &Path,
    notes_dir: &Path,
    dashboard_port: u16,
) -> Result<()> {
    validate(name)?;
    if let Some(other) = port_collision(name).await {
        bail!(
            "port {} is already taken by '{other}' — rename one of them and retry",
            agent_web_port(name)
        );
    }
    setup_proposed(proposed_dir, name).await?;
    setup_applied(
        applied_dir,
        Some(proposed_dir),
        name,
        hyperhive_flake,
        dashboard_port,
    )
    .await?;
    ensure_claude_dir(claude_dir)?;
    ensure_state_dir(notes_dir)?;
    let container = container_name(name);
    let flake_ref = format!("{}#default", applied_dir.display());
    run(&["create", &container, "--flake", &flake_ref]).await?;
    // The container's conf file only exists after `create`, so the flags are
    // patched in afterwards and before the first start.
    set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
    set_resource_limits(&container)?;
    systemd_daemon_reload().await?;
    run(&["start", &container]).await
}

/// Stop the agent's container (`nixos-container stop`).
pub async fn kill(name: &str) -> Result<()> {
    validate(name)?;
    let container = container_name(name);
    run(&["stop", &container]).await
}

/// Start the agent's container (`nixos-container start`).
pub async fn start(name: &str) -> Result<()> {
    validate(name)?;
    let container = container_name(name);
    run(&["start", &container]).await
}

/// Stop + start without regenerating any config. For "kick the container"
/// without touching the flake or nspawn flags.
pub async fn restart(name: &str) -> Result<()> {
    kill(name).await?;
    start(name).await
}

/// True when the container's systemd unit is active. Used by the dashboard
/// to gate stop/restart buttons.
pub async fn is_running(name: &str) -> bool { let container = container_name(name); let unit = format!("container@{container}.service"); Command::new("systemctl") .args(["is-active", "--quiet", &unit]) .status() .await .map(|s| s.success()) .unwrap_or(false) } /// Fully tear down a sub-agent's container: stop + remove via `nixos-container /// destroy`, then clean our own systemd drop-in. Leaves it to the caller to /// wipe `/var/lib/hyperhive/...` state and the per-agent runtime dir. pub async fn destroy(name: &str) -> Result<()> { validate(name)?; let container = container_name(name); // nixos-container destroy handles stop + removal of /var/lib/nixos-containers/ // and /etc/nixos-containers/.conf. Tolerate "no such container". if let Err(e) = run(&["destroy", &container]).await { tracing::warn!(error = ?e, "nixos-container destroy returned an error; continuing cleanup"); } let dropin_dir = format!("/run/systemd/system/container@{container}.service.d"); if std::path::Path::new(&dropin_dir).exists() { std::fs::remove_dir_all(&dropin_dir).with_context(|| format!("remove {dropin_dir}"))?; } Ok(()) } pub async fn rebuild( name: &str, hyperhive_flake: &str, agent_dir: &Path, applied_dir: &Path, claude_dir: &Path, notes_dir: &Path, dashboard_port: u16, ) -> Result<()> { validate(name)?; if let Some(other) = port_collision(name).await { bail!( "port {} is already taken by '{other}' — rename one of them and retry", agent_web_port(name) ); } setup_applied(applied_dir, None, name, hyperhive_flake, dashboard_port).await?; ensure_claude_dir(claude_dir)?; ensure_state_dir(notes_dir)?; let container = container_name(name); let flake_ref = format!("{}#default", applied_dir.display()); set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?; set_resource_limits(&container)?; systemd_daemon_reload().await?; run(&["update", &container, "--flake", &flake_ref]).await?; // Restart so any nspawn-level changes (bind mounts, networking, etc.) apply. 
run(&["stop", &container]).await?; run(&["start", &container]).await } pub async fn list() -> Result> { let out = Command::new("nixos-container") .arg("list") .output() .await .context("invoke nixos-container list")?; if !out.status.success() { bail!( "nixos-container list exited with status {}: {}", out.status, String::from_utf8_lossy(&out.stderr).trim() ); } Ok(String::from_utf8_lossy(&out.stdout) .lines() .map(str::trim) .filter(|line| line.starts_with(AGENT_PREFIX) || *line == MANAGER_NAME) .map(str::to_owned) .collect()) } /// Initialize the manager-editable proposed repo. Contains only `agent.nix` /// (the file the manager edits). Touched by hive-c0re only on first spawn — /// never again — so the manager can't be surprised by hive-c0re commits or /// working-tree resets. pub async fn setup_proposed(proposed_dir: &Path, name: &str) -> Result<()> { if proposed_dir.join(".git").exists() { return Ok(()); } std::fs::create_dir_all(proposed_dir) .with_context(|| format!("create {}", proposed_dir.display()))?; let agent_path = proposed_dir.join("agent.nix"); if !agent_path.exists() { std::fs::write(&agent_path, initial_agent_nix(name)) .with_context(|| format!("write {}", agent_path.display()))?; } git(proposed_dir, &["init", "--initial-branch=main"]).await?; git(proposed_dir, &["add", "agent.nix"]).await?; git_commit(proposed_dir, "hive-c0re init").await?; Ok(()) } /// Set up the applied repo. Two responsibilities: /// - First-spawn only: init the repo, pull proposed's initial commit /// in via `git fetch`, tag it `deployed/0`. This is the *only* time /// hive-c0re reads from `proposed` for an agent — subsequent /// proposals are fetched at `request_apply_commit` time and tagged /// `proposal/` (see `actions::approve` for the tag state /// machine). /// - Every call: regenerate the untracked `flake.nix` so flake-url / /// dashboard-port changes pick up on rebuild without churning the /// git log. 
/// /// `proposed_dir` is `None` on rebuild paths that just want the flake /// refreshed. pub async fn setup_applied( applied_dir: &Path, proposed_dir: Option<&Path>, name: &str, hyperhive_flake: &str, dashboard_port: u16, ) -> Result<()> { std::fs::create_dir_all(applied_dir) .with_context(|| format!("create {}", applied_dir.display()))?; // 1. First-spawn git init from proposed (or pre-overhaul detection). if !applied_dir.join(".git").exists() { let Some(proposed) = proposed_dir else { bail!( "applied repo at {} is missing its .git directory; \ cannot rebuild without a proposed source to seed from. \ destroy --purge and re-spawn this agent.", applied_dir.display() ); }; git(applied_dir, &["init", "--initial-branch=main"]).await?; let proposed_str = proposed.display().to_string(); git( applied_dir, &["fetch", "--no-tags", &proposed_str, "main:refs/heads/main"], ) .await?; git_read_tree_reset(applied_dir, "refs/heads/main").await?; git_tag(applied_dir, "deployed/0", "refs/heads/main").await?; } else if git_rev_parse(applied_dir, "refs/tags/deployed/0") .await .is_err() { // Pre-overhaul applied repo — agent.nix is tracked directly, // commits authored by hive-c0re, no deployed/* tag scheme. // No in-place migration; fail loudly so the operator purges. bail!( "applied repo at {} predates the tag-driven config flow. \ Run `hive-c0re destroy --purge {name}` and re-spawn.", applied_dir.display() ); } // 2. (Re)write the untracked wrapper flake. Tracked files in the // working tree (agent.nix and anything the manager committed) are // untouched. let port = agent_web_port(name); let base = flake_base(name); let service = if is_manager(name) { "hive-m1nd" } else { "hive-ag3nt" }; let description = if is_manager(name) { format!("hyperhive manager {name}") } else { format!("hyperhive sub-agent {name}") }; let flake_body = format!( r#"{{ description = "{description}"; inputs.hyperhive.url = "{hyperhive_flake}"; outputs = {{ hyperhive, ... 
}}: {{ nixosConfigurations.default = hyperhive.nixosConfigurations.{base}.extendModules {{ modules = [ ./agent.nix {{ programs.git.config.user = {{ name = "{name}"; email = "{name}@hyperhive"; }}; systemd.services.{service}.environment = {{ HIVE_PORT = "{port}"; HIVE_LABEL = "{name}"; HIVE_DASHBOARD_PORT = "{dashboard_port}"; }}; }} ]; }}; }}; }} "#, ); std::fs::write(applied_dir.join("flake.nix"), flake_body) .with_context(|| format!("write {}/flake.nix", applied_dir.display()))?; Ok(()) } /// Create the per-agent Claude credentials dir if missing. Mode 0700 — only /// root inside the container reads/writes it. Idempotent: existing dirs are /// left untouched (an agent's OAuth tokens survive `destroy`/recreate). fn ensure_claude_dir(claude_dir: &Path) -> Result<()> { if !claude_dir.exists() { std::fs::create_dir_all(claude_dir) .with_context(|| format!("create {}", claude_dir.display()))?; #[cfg(unix)] { use std::os::unix::fs::PermissionsExt; std::fs::set_permissions(claude_dir, std::fs::Permissions::from_mode(0o700)) .with_context(|| format!("chmod {}", claude_dir.display()))?; } } Ok(()) } fn ensure_state_dir(notes_dir: &Path) -> Result<()> { if !notes_dir.exists() { std::fs::create_dir_all(notes_dir) .with_context(|| format!("create {}", notes_dir.display()))?; } Ok(()) } fn initial_agent_nix(name: &str) -> String { format!( "{{ ... }}:\n{{\n # Per-agent overrides for {name}. The manager edits this\n # file (and commits) to customise the agent's NixOS config.\n}}\n", ) } async fn git_commit(dir: &Path, message: &str) -> Result<()> { git( dir, &[ "-c", &format!("user.name={GIT_NAME}"), "-c", &format!("user.email={GIT_EMAIL}"), "commit", "-m", message, ], ) .await } /// Spawn `git` honoring the `HYPERHIVE_GIT` env var (absolute path baked in /// by the NixOS module), falling back to bare `git` (PATH lookup) otherwise. 
#[must_use]
pub fn git_command() -> Command {
    // `var_os` keeps non-UTF-8 paths usable; an empty value counts as unset
    // so it cannot turn into an un-spawnable empty program name.
    let exe = std::env::var_os("HYPERHIVE_GIT")
        .filter(|path| !path.is_empty())
        .unwrap_or_else(|| "git".into());
    Command::new(exe)
}

/// Run `git <args>` with `dir` as the working directory, discarding stdout.
///
/// # Errors
/// Fails when git cannot be spawned or exits non-zero; the error carries the
/// argument list, exit status and trimmed stderr.
async fn git(dir: &Path, args: &[&str]) -> Result<()> {
    let out = git_command()
        .current_dir(dir)
        .args(args)
        .output()
        .await
        .with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
    if !out.status.success() {
        bail!(
            "git {} failed ({}): {}",
            args.join(" "),
            out.status,
            String::from_utf8_lossy(&out.stderr).trim()
        );
    }
    Ok(())
}

/// Fetch `sha` from the `src` git repo into `dst` and pin it as
/// `refs/tags/<tag>`. Used at request_apply_commit time so hive-c0re
/// captures an immutable handle on the manager's commit; subsequent
/// amendments / force-pushes in `src` no longer affect what gets
/// built. Returns the resolved sha (which equals `sha` on success
/// but normalised — short shas get expanded).
#[allow(dead_code)] // wired up by manager_server in a later commit
pub async fn git_fetch_to_tag(dst: &Path, src: &Path, sha: &str, tag: &str) -> Result<String> {
    let src_str = src.display().to_string();
    let refspec = format!("{sha}:refs/tags/{tag}");
    git(dst, &["fetch", "--no-tags", &src_str, &refspec]).await?;
    git_rev_parse(dst, &format!("refs/tags/{tag}")).await
}

/// Resolve `refname` (a tag, branch, or sha) in `dir` to its full sha.
///
/// # Errors
/// Fails when git cannot be spawned or `rev-parse` exits non-zero.
#[allow(dead_code)]
pub async fn git_rev_parse(dir: &Path, refname: &str) -> Result<String> {
    let out = git_command()
        .current_dir(dir)
        .args(["rev-parse", refname])
        .output()
        .await
        .with_context(|| format!("git rev-parse {refname} in {}", dir.display()))?;
    if !out.status.success() {
        bail!(
            "git rev-parse {refname} failed ({}): {}",
            out.status,
            String::from_utf8_lossy(&out.stderr).trim()
        );
    }
    Ok(String::from_utf8_lossy(&out.stdout).trim().to_owned())
}

/// Plant a lightweight tag at `target`. Errors if the tag already
/// exists — we want loud failures on id reuse, not silent
/// overwrites.
#[allow(dead_code)]
pub async fn git_tag(dir: &Path, name: &str, target: &str) -> Result<()> {
    git(dir, &["tag", name, target]).await
}

/// Plant an annotated tag with `body` as the message. Used for
/// `failed/<id>` (body = build error) and `denied/<id>` (body =
/// operator note). Multi-line bodies handled via stdin so we don't
/// have to escape anything.
///
/// # Errors
/// Fails when git cannot be spawned, exits non-zero, or the body could not be
/// written to its stdin. A non-zero exit takes precedence so git's own
/// stderr is what gets reported.
#[allow(dead_code)]
pub async fn git_tag_annotated(dir: &Path, name: &str, target: &str, body: &str) -> Result<()> {
    use tokio::io::AsyncWriteExt;
    let mut child = git_command()
        .current_dir(dir)
        .args(["tag", "-a", name, target, "-F", "-"])
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .with_context(|| format!("spawn git tag -a {name} in {}", dir.display()))?;
    // Hold on to a write failure instead of returning right away: the child
    // must still be waited on, and if git bailed out early (closing the pipe)
    // its stderr explains why far better than a broken-pipe error.
    let write_result = if let Some(mut stdin) = child.stdin.take() {
        let written = stdin.write_all(body.as_bytes()).await;
        // Drop closes stdin so git can finish reading.
        drop(stdin);
        written
    } else {
        Ok(())
    };
    let out = child.wait_with_output().await.context("wait git tag -a")?;
    if !out.status.success() {
        bail!(
            "git tag -a {name} failed ({}): {}",
            out.status,
            String::from_utf8_lossy(&out.stderr).trim()
        );
    }
    write_result.context("write tag body to git stdin")?;
    Ok(())
}

/// Replace working tree + index with the tree at `target` without
/// moving HEAD. `applied/main` stays pointing at the last known-good
/// `deployed/*` while we let `nixos-container update` evaluate the
/// candidate. On build failure callers reset back to HEAD; on
/// success they fast-forward main to `target`.
#[allow(dead_code)]
pub async fn git_read_tree_reset(dir: &Path, target: &str) -> Result<()> {
    git(dir, &["read-tree", "--reset", "-u", target]).await
}

/// Hard-set a ref to `target`. Used to fast-forward `refs/heads/main`
/// to the just-deployed proposal commit. Uses `update-ref`, not
/// `branch -f`, so it works regardless of where HEAD currently sits.
#[allow(dead_code)] pub async fn git_update_ref(dir: &Path, refname: &str, target: &str) -> Result<()> { git(dir, &["update-ref", refname, target]).await } /// Write a systemd drop-in for `container@.service` that applies /// our default resource caps. Goes under `/run/systemd/system/...` so it's /// ephemeral (regenerated on every spawn / rebuild). fn set_resource_limits(container: &str) -> Result<()> { let dir = format!("/run/systemd/system/container@{container}.service.d"); std::fs::create_dir_all(&dir).with_context(|| format!("create {dir}"))?; let path = format!("{dir}/hyperhive-limits.conf"); let content = format!("[Service]\nMemoryMax={DEFAULT_MEMORY_MAX}\nCPUQuota={DEFAULT_CPU_QUOTA}\n",); std::fs::write(&path, content).with_context(|| format!("write {path}"))?; tracing::info!( %path, memory_max = DEFAULT_MEMORY_MAX, cpu_quota = DEFAULT_CPU_QUOTA, "wrote resource limits drop-in" ); Ok(()) } async fn systemd_daemon_reload() -> Result<()> { let out = Command::new("systemctl") .arg("daemon-reload") .output() .await .context("invoke systemctl daemon-reload")?; if !out.status.success() { bail!( "systemctl daemon-reload failed ({}): {}", out.status, String::from_utf8_lossy(&out.stderr).trim() ); } Ok(()) } /// Idempotently rewrite the lines in `/etc/nixos-containers/.conf` /// that hive-c0re owns: `PRIVATE_NETWORK` (forced 0 so the agent's web UI port /// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind). /// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the /// `systemd-nspawn` command. /// Where in the container's filesystem the manager sees its agents tree. /// Matches the `/agents` path that pre-Phase-8 hosts declared via /// `containers.hm1nd.bindMounts."/agents"`. pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents"; /// The on-host root that gets bind-mounted to `/agents` inside the manager. 
/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (kept duplicated /// here so lifecycle stays usable as a leaf module). const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents"; fn set_nspawn_flags( container: &str, runtime_dir: &Path, claude_dir: &Path, notes_dir: &Path, ) -> Result<()> { let path = format!("/etc/nixos-containers/{container}.conf"); let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?; let mut binds = format!( "--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT} --bind={notes}:{CONTAINER_NOTES_MOUNT}", runtime = runtime_dir.display(), claude = claude_dir.display(), notes = notes_dir.display(), ); if container == MANAGER_NAME { // Manager edits sub-agent proposed/ repos and its own. RW so it can // git-commit. Sub-agents see only their own /run/hive socket and // /root/.claude (no /agents). use std::fmt::Write as _; let _ = write!( binds, " --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}" ); } let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\""); let mut lines: Vec = original .lines() .filter(|line| { let trimmed = line.trim_start(); // Strip any network-namespace knobs nixos-container's create // might have populated. The start script adds `--network-veth` // whenever HOST_ADDRESS / LOCAL_ADDRESS (or their IPv6 cousins) // are non-empty — and veth implies a private netns, hiding our // web-UI port from the host. Force host netns. 
!trimmed.starts_with("EXTRA_NSPAWN_FLAGS=") && !trimmed.starts_with("PRIVATE_NETWORK=") && !trimmed.starts_with("HOST_ADDRESS=") && !trimmed.starts_with("LOCAL_ADDRESS=") && !trimmed.starts_with("HOST_ADDRESS6=") && !trimmed.starts_with("LOCAL_ADDRESS6=") && !trimmed.starts_with("HOST_BRIDGE=") }) .map(str::to_owned) .collect(); lines.push("PRIVATE_NETWORK=0".to_owned()); lines.push("HOST_ADDRESS=".to_owned()); lines.push("LOCAL_ADDRESS=".to_owned()); lines.push("HOST_ADDRESS6=".to_owned()); lines.push("LOCAL_ADDRESS6=".to_owned()); lines.push("HOST_BRIDGE=".to_owned()); lines.push(bind_flag); let mut content = lines.join("\n"); content.push('\n'); std::fs::write(&path, content).with_context(|| format!("write {path}"))?; tracing::info!(%path, "set PRIVATE_NETWORK=0 + EXTRA_NSPAWN_FLAGS"); Ok(()) } async fn run(args: &[&str]) -> Result<()> { let out = Command::new("nixos-container") .args(args) .output() .await .with_context(|| format!("invoke nixos-container {}", args.join(" ")))?; let stdout = String::from_utf8_lossy(&out.stdout); let stderr = String::from_utf8_lossy(&out.stderr); if !stdout.trim().is_empty() { tracing::info!(target: "nixos-container", "{}", stdout.trim()); } if !stderr.trim().is_empty() { tracing::warn!(target: "nixos-container", "{}", stderr.trim()); } if !out.status.success() { bail!( "nixos-container {} failed ({}): {}", args.join(" "), out.status, stderr.trim() ); } Ok(()) }