hyperhive/hive-c0re/src/lifecycle.rs
2026-05-17 01:40:28 +02:00

910 lines
36 KiB
Rust

//! `nixos-container` lifecycle + per-agent config flake generation.
use std::path::Path;
use anyhow::{Context, Result, bail};
use tokio::process::Command;
/// Sub-agent container prefix. `nixos-container` caps the total container name
/// at 11 chars (it gets encoded into network interface names), so the agent
/// name itself can be at most `MAX_AGENT_NAME` chars.
pub const AGENT_PREFIX: &str = "h-";
/// Longest allowed sub-agent name: the 11-char container-name cap minus the
/// two characters of `AGENT_PREFIX`.
pub const MAX_AGENT_NAME: usize = 9;
/// Container name of the manager. Lives in the same path scheme as sub-agents
/// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`),
/// but its container has no `h-` prefix and extends a different
/// nixosConfiguration (`manager`, not `agent-base`).
pub const MANAGER_NAME: &str = "hm1nd";
/// Web UI port reserved for the manager (sub-agents hash into 8100..8999).
pub const MANAGER_PORT: u16 = 8000;
/// Mount point of the per-agent runtime directory inside the container.
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
/// Mount point of the per-agent Claude credentials dir inside the container.
/// Persistent across destroy/recreate so OAuth login survives.
pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude";
/// Mount point of the per-agent durable knowledge dir inside the container.
/// Agents are told (system prompt) to keep `notes.md` and any other scratch
/// state here; persists across destroy/recreate.
pub const CONTAINER_NOTES_MOUNT: &str = "/state";
/// Mount point of the shared directory accessible to all agents.
/// All agents can read/write here; agents should only put things they're
/// willing to lose (other agents may delete them).
pub const CONTAINER_SHARED_MOUNT: &str = "/shared";
/// `user.name` passed inline (`git -c ...`) for commits and annotated tags
/// that hive-c0re itself creates.
const GIT_NAME: &str = "c0re";
/// `user.email` counterpart of `GIT_NAME`.
const GIT_EMAIL: &str = "c0re@hyperhive";
/// Sub-agent web UI port range. Deterministic from the agent's name (FNV-1a
/// hash mod range size), so the dashboard can compute the same port without
/// asking hive-c0re.
const WEB_PORT_BASE: u16 = 8100;
/// Number of ports in the sub-agent range; used as the modulus of the name
/// hash, so sub-agent ports span `WEB_PORT_BASE..WEB_PORT_BASE + WEB_PORT_RANGE`.
const WEB_PORT_RANGE: u16 = 900;
/// Default resource caps applied to every managed container via a systemd
/// drop-in under `/run/systemd/system/container@<NAME>.service.d/`.
/// Written as the drop-in's `MemoryMax=` value.
const DEFAULT_MEMORY_MAX: &str = "2G";
/// Written as the drop-in's `CPUQuota=` value.
const DEFAULT_CPU_QUOTA: &str = "50%";
/// Web UI port for the agent called `name`.
///
/// The manager is pinned to `MANAGER_PORT`. Every other name is hashed with
/// 32-bit FNV-1a and folded into the sub-agent range:
/// `WEB_PORT_BASE + hash % WEB_PORT_RANGE`. The result depends on nothing but
/// the name, so it is identical across hosts, restarts and dashboard renders
/// and needs no state file.
///
/// Two names can hash to the same port (birthday paradox at ~30 agents); the
/// operator resolves that by renaming one of the agents.
#[must_use]
pub fn agent_web_port(name: &str) -> u16 {
    const FNV_OFFSET_BASIS: u32 = 2_166_136_261;
    const FNV_PRIME: u32 = 16_777_619;

    if name == MANAGER_NAME {
        return MANAGER_PORT;
    }
    let digest = name.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
    });
    let slot = digest % u32::from(WEB_PORT_RANGE);
    // `slot` is strictly below WEB_PORT_RANGE (itself a u16), so the
    // conversion cannot fail; the fallback only keeps the code panic-free.
    WEB_PORT_BASE + u16::try_from(slot).unwrap_or(0)
}
#[must_use]
pub fn container_name(name: &str) -> String {
if name == MANAGER_NAME {
MANAGER_NAME.to_owned()
} else {
format!("{AGENT_PREFIX}{name}")
}
}
/// Whether `name` is the manager's agent name (`MANAGER_NAME`).
#[must_use]
pub fn is_manager(name: &str) -> bool {
    MANAGER_NAME == name
}
/// Check that `name` is usable as an agent name.
///
/// Rules, in order:
/// - the empty string is rejected;
/// - `MANAGER_NAME` is always accepted;
/// - every character must be an ASCII letter, digit, `-` or `_`. The name is
///   interpolated into filesystem paths (the systemd drop-in directory that
///   `destroy` removes recursively, the in-container `/agents/<name>/state`
///   mount) and into the unquoted `EXTRA_NSPAWN_FLAGS`, so separators, dots,
///   whitespace and quotes must never get that far;
/// - the name may be at most `MAX_AGENT_NAME` characters, so that the
///   prefixed container name stays within `nixos-container`'s limit.
///
/// # Errors
/// Returns an error describing the first rule the name violates.
fn validate(name: &str) -> Result<()> {
    if name.is_empty() {
        bail!("agent name must not be empty");
    }
    if is_manager(name) {
        return Ok(());
    }
    if let Some(bad) = name
        .chars()
        .find(|c| !(c.is_ascii_alphanumeric() || matches!(c, '-' | '_')))
    {
        bail!(
            "agent name '{name}' contains invalid character {bad:?}; \
             only ASCII letters, digits, '-' and '_' are allowed"
        );
    }
    // The name is pure ASCII past this point, so byte length == char count.
    if name.len() > MAX_AGENT_NAME {
        bail!(
            "agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}",
            name.len()
        );
    }
    Ok(())
}
/// Find a running agent, other than `self_name`, whose hashed web port equals
/// the one `self_name` would get.
///
/// Without this check the harness inside the colliding container would loop
/// on `AddrInUse` forever; callers use the returned name to fail spawn /
/// rebuild loudly with an actionable message instead.
///
/// Best effort: when the container list cannot be obtained it is treated as
/// empty and `None` is returned.
async fn port_collision(self_name: &str) -> Option<String> {
    let wanted = agent_web_port(self_name);
    for container in list().await.unwrap_or_default() {
        // Translate the container name back into an agent name; anything
        // that is neither the manager nor `h-`-prefixed is not ours.
        let candidate: &str = if container == MANAGER_NAME {
            MANAGER_NAME
        } else {
            match container.strip_prefix(AGENT_PREFIX) {
                Some(rest) => rest,
                None => continue,
            }
        };
        if candidate == self_name || agent_web_port(candidate) != wanted {
            continue;
        }
        // Only a live container actually holds the port.
        if is_running(candidate).await {
            return Some(candidate.to_owned());
        }
    }
    None
}
#[allow(clippy::too_many_arguments)]
pub async fn spawn(
name: &str,
hyperhive_flake: &str,
agent_dir: &Path,
proposed_dir: &Path,
applied_dir: &Path,
claude_dir: &Path,
notes_dir: &Path,
dashboard_port: u16,
operator_pronouns: &str,
) -> Result<()> {
validate(name)?;
if let Some(other) = port_collision(name).await {
bail!(
"port {} is already taken by '{other}' — rename one of them and retry",
agent_web_port(name)
);
}
setup_proposed(proposed_dir, name).await?;
setup_applied(applied_dir, Some(proposed_dir), name).await?;
ensure_claude_dir(claude_dir)?;
ensure_state_dir(notes_dir)?;
// Meta flake gets the new agent's input + nixosConfiguration
// before `nixos-container create` so the `--flake meta#<name>`
// ref resolves.
let agents = agents_after_spawn(name).await?;
crate::meta::sync_agents(hyperhive_flake, dashboard_port, operator_pronouns, &agents).await?;
let container = container_name(name);
let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display());
run(&["create", &container, "--flake", &flake_ref]).await?;
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
set_resource_limits(&container)?;
systemd_daemon_reload().await?;
run(&["start", &container]).await
}
/// Build the `AgentSpec` list for the meta flake from `nixos-container list`,
/// optionally plus one extra name that is not in the list yet.
///
/// Pass `Some(name)` from spawn, where the new agent's container does not
/// exist yet; pass `None` from rebuild paths, where the agent is already in
/// the container list. An extra name that is already listed is not
/// duplicated. The result is sorted by agent name.
///
/// # Errors
/// Fails when the container list cannot be obtained. The error is propagated
/// rather than treated as "no containers": the meta flake is synced from this
/// list, and an empty result would misrepresent every existing agent as gone.
async fn agents_for_meta(name_to_add: Option<&str>) -> Result<Vec<crate::meta::AgentSpec>> {
    let containers = list()
        .await
        .context("list containers while building the meta flake agent set")?;
    let mut out: Vec<crate::meta::AgentSpec> = containers
        .into_iter()
        .filter_map(|c| {
            // Container name -> (agent name, is_manager); skip foreign names.
            let (name, is_manager) = if c == MANAGER_NAME {
                (MANAGER_NAME.to_owned(), true)
            } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
                (n.to_owned(), false)
            } else {
                return None;
            };
            Some(crate::meta::AgentSpec {
                // `port` is computed before `name` is moved into the struct.
                port: agent_web_port(&name),
                name,
                is_manager,
            })
        })
        .collect();
    if let Some(extra) = name_to_add
        && !out.iter().any(|a| a.name == extra)
    {
        out.push(crate::meta::AgentSpec {
            name: extra.to_owned(),
            is_manager: is_manager(extra),
            port: agent_web_port(extra),
        });
    }
    out.sort_by(|a, b| a.name.cmp(&b.name));
    Ok(out)
}
/// Agent list for the meta flake as it should look once `name` has been
/// spawned: everything `agents_for_meta` finds, plus `name` if it is not
/// already present.
async fn agents_after_spawn(name: &str) -> Result<Vec<crate::meta::AgentSpec>> {
agents_for_meta(Some(name)).await
}
/// Public enumeration of currently-existing agents (whatever
/// `nixos-container list` says), sorted, no extras. For callers
/// outside this module that need to reseed meta after lifecycle
/// changes — destroy, startup reconciliation, etc.
///
/// Delegates to `agents_for_meta(None)`.
pub async fn agents_for_meta_listing() -> Result<Vec<crate::meta::AgentSpec>> {
agents_for_meta(None).await
}
/// Stop the agent's container via `nixos-container stop`.
///
/// # Errors
/// Fails when `name` does not pass validation or the command fails.
pub async fn kill(name: &str) -> Result<()> {
    validate(name)?;
    run(&["stop", container_name(name).as_str()]).await
}
/// Start the agent's container via `nixos-container start`.
///
/// # Errors
/// Fails when `name` does not pass validation or the command fails.
pub async fn start(name: &str) -> Result<()> {
    validate(name)?;
    run(&["start", container_name(name).as_str()]).await
}
/// Stop + start without regenerating any config. For "kick the container"
/// without touching the flake or nspawn flags.
///
/// # Errors
/// Returns the error from `kill` without attempting `start` if stopping
/// fails; otherwise returns whatever `start` returns.
pub async fn restart(name: &str) -> Result<()> {
kill(name).await?;
start(name).await
}
/// Report whether the agent's `container@<container>.service` unit is active,
/// according to `systemctl is-active --quiet`. The dashboard uses this to
/// gate its stop/restart buttons.
///
/// Returns `false` both when the unit is not active and when `systemctl`
/// itself could not be run.
pub async fn is_running(name: &str) -> bool {
    let unit = format!("container@{}.service", container_name(name));
    let probe = Command::new("systemctl")
        .args(["is-active", "--quiet", unit.as_str()])
        .status()
        .await;
    matches!(probe, Ok(status) if status.success())
}
/// Fully tear down a sub-agent's container: stop + remove via `nixos-container
/// destroy`, then clean our own systemd drop-in. Leaves it to the caller to
/// wipe `/var/lib/hyperhive/...` state and the per-agent runtime dir.
pub async fn destroy(name: &str) -> Result<()> {
validate(name)?;
let container = container_name(name);
// nixos-container destroy handles stop + removal of /var/lib/nixos-containers/<C>
// and /etc/nixos-containers/<C>.conf. Tolerate "no such container".
if let Err(e) = run(&["destroy", &container]).await {
tracing::warn!(error = ?e, "nixos-container destroy returned an error; continuing cleanup");
}
let dropin_dir = format!("/run/systemd/system/container@{container}.service.d");
if std::path::Path::new(&dropin_dir).exists() {
std::fs::remove_dir_all(&dropin_dir).with_context(|| format!("remove {dropin_dir}"))?;
}
Ok(())
}
/// Rebuild an existing agent's container, bringing the meta flake up to date
/// first.
///
/// Runs three steps in order: sync the meta flake against the current
/// container list, bump this agent's input in the meta lock, then hand over
/// to `rebuild_no_meta` for the container-level work.
///
/// # Errors
/// Returns the error of the first step that fails; later steps are skipped.
// The positional arguments are independent host paths/settings supplied by
// the caller, hence the lint exemption.
#[allow(clippy::too_many_arguments)]
pub async fn rebuild(
name: &str,
hyperhive_flake: &str,
agent_dir: &Path,
applied_dir: &Path,
claude_dir: &Path,
notes_dir: &Path,
dashboard_port: u16,
operator_pronouns: &str,
) -> Result<()> {
// Sync the meta flake (idempotent — no-op when the rendered
// flake matches disk) so a manual rebuild from the dashboard
// can also recover from a divergent meta repo (e.g. an agent
// got added directly via `nixos-container create` outside
// hive-c0re).
let agents = agents_for_meta(None).await?;
crate::meta::sync_agents(hyperhive_flake, dashboard_port, operator_pronouns, &agents).await?;
// Then bump just this agent's input — picks up whatever
// `applied/<n>/main` currently points at (deployed/<latest>).
// Commits the lock if it changed.
crate::meta::lock_update_for_rebuild(name).await?;
rebuild_no_meta(name, agent_dir, applied_dir, claude_dir, notes_dir).await
}
/// Container-level rebuild without touching the meta repo. Callers
/// that own the meta side themselves (`actions::run_apply_commit`
/// drives meta through the two-phase prepare/finalize/abort flow)
/// use this directly. Public `rebuild` wraps it with idempotent meta
/// sync + lock-bump-and-commit.
pub async fn rebuild_no_meta(
name: &str,
agent_dir: &Path,
applied_dir: &Path,
claude_dir: &Path,
notes_dir: &Path,
) -> Result<()> {
validate(name)?;
if let Some(other) = port_collision(name).await {
bail!(
"port {} is already taken by '{other}' — rename one of them and retry",
agent_web_port(name)
);
}
setup_applied(applied_dir, None, name).await?;
ensure_claude_dir(claude_dir)?;
ensure_state_dir(notes_dir)?;
let container = container_name(name);
let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display());
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
set_resource_limits(&container)?;
systemd_daemon_reload().await?;
run(&["update", &container, "--flake", &flake_ref]).await?;
// Restart so any nspawn-level changes (bind mounts, networking, etc.) apply.
run(&["stop", &container]).await?;
run(&["start", &container]).await
}
/// Names of the containers hyperhive considers its own, as reported by
/// `nixos-container list`: every (trimmed) output line that is exactly
/// `MANAGER_NAME` or starts with `AGENT_PREFIX`. Names are returned as
/// container names, prefix included, in the order the tool printed them.
///
/// # Errors
/// Fails when the command cannot be invoked or exits unsuccessfully; the
/// error carries the exit status and trimmed stderr.
pub async fn list() -> Result<Vec<String>> {
    let output = Command::new("nixos-container")
        .arg("list")
        .output()
        .await
        .context("invoke nixos-container list")?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        bail!(
            "nixos-container list exited with status {}: {}",
            output.status,
            stderr.trim()
        );
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let mut names = Vec::new();
    for raw in stdout.lines() {
        let entry = raw.trim();
        if entry == MANAGER_NAME || entry.starts_with(AGENT_PREFIX) {
            names.push(entry.to_owned());
        }
    }
    Ok(names)
}
/// Initialise the manager-editable proposed repo for agent `name`.
///
/// When `proposed_dir` has no `.git` yet, the directory is created, the two
/// tracked seed files are written if absent, and an initial commit is made
/// on `main`:
/// - `agent.nix` — the module the manager edits;
/// - `flake.nix` — boilerplate that lets the meta flake import this repo as
///   an input. Meta locks at a specific sha and reads
///   `nixosModules.default`, so `flake.nix` has to be part of the commit.
///   The manager is not supposed to edit it (the prompt says so), but it is
///   visible for introspection.
///
/// hive-c0re seeds these files on first spawn only — never again — so the
/// manager is not surprised by hive-c0re commits or working-tree resets.
///
/// The `applied` remote is (re)wired on every call, fresh repo or not.
///
/// # Errors
/// Fails when a filesystem operation or a git command fails.
pub async fn setup_proposed(proposed_dir: &Path, name: &str) -> Result<()> {
    if !proposed_dir.join(".git").exists() {
        std::fs::create_dir_all(proposed_dir)
            .with_context(|| format!("create {}", proposed_dir.display()))?;

        // Seed files, in the order they are written; existing files win.
        let seeds = [
            ("agent.nix", initial_agent_nix(name)),
            ("flake.nix", initial_flake_nix().to_owned()),
        ];
        for (file_name, body) in seeds {
            let target = proposed_dir.join(file_name);
            if !target.exists() {
                std::fs::write(&target, body)
                    .with_context(|| format!("write {}", target.display()))?;
            }
        }

        git(proposed_dir, &["init", "--initial-branch=main"]).await?;
        git(proposed_dir, &["add", "agent.nix", "flake.nix"]).await?;
        git_commit(proposed_dir, "hive-c0re init").await?;
    }

    // Idempotently wire the `applied` remote — purely for the manager's
    // ergonomics. Its URL is the path inside the manager container
    // (`/applied/<n>/.git`), made real by the RO bind in
    // `set_nspawn_flags`. hive-c0re itself never dereferences this remote;
    // the host-side fetch in `request_apply_commit` uses absolute host paths.
    ensure_applied_remote(proposed_dir, name).await
}
/// Make sure the proposed repo has a remote called `applied` pointing at
/// `/applied/<name>/.git`.
///
/// - remote missing (`git remote get-url` fails) → `git remote add`;
/// - remote present with the expected URL → nothing to do;
/// - remote present with another URL → `git remote set-url`.
///
/// # Errors
/// Fails when git cannot be invoked or the add / set-url command fails.
async fn ensure_applied_remote(proposed_dir: &Path, name: &str) -> Result<()> {
    let want = format!("/applied/{name}/.git");
    let probe = git_command()
        .current_dir(proposed_dir)
        .args(["remote", "get-url", "applied"])
        .output()
        .await
        .with_context(|| format!("git remote get-url applied in {}", proposed_dir.display()))?;

    if !probe.status.success() {
        return git(proposed_dir, &["remote", "add", "applied", &want]).await;
    }

    let current = String::from_utf8_lossy(&probe.stdout);
    if current.trim() == want {
        Ok(())
    } else {
        // URL drifted (path scheme changed, etc.) — re-point it.
        git(proposed_dir, &["remote", "set-url", "applied", &want]).await
    }
}
/// Set up the applied repo. First-spawn only: init the repo, pull
/// proposed's initial commit in via `git fetch`, tag it `deployed/0`.
/// This is the *only* time hive-c0re reads from `proposed` for an
/// agent — subsequent proposals are fetched at `request_apply_commit`
/// time and tagged `proposal/<id>` (see `actions::approve` for the
/// tag state machine).
///
/// `proposed_dir` is `None` on rebuild paths where the repo already
/// exists — we just verify it's the right shape and bail otherwise.
/// Unlike the pre-overhaul code path, `flake.nix` is no longer
/// regenerated at the host level: it's tracked in proposed (seeded by
/// `setup_proposed`) and rides along on every fetch.
pub async fn setup_applied(
applied_dir: &Path,
proposed_dir: Option<&Path>,
name: &str,
) -> Result<()> {
std::fs::create_dir_all(applied_dir)
.with_context(|| format!("create {}", applied_dir.display()))?;
if !applied_dir.join(".git").exists() {
let Some(proposed) = proposed_dir else {
bail!(
"applied repo at {} is missing its .git directory; \
cannot rebuild without a proposed source to seed from. \
destroy --purge and re-spawn this agent.",
applied_dir.display()
);
};
git(applied_dir, &["init", "--initial-branch=main"]).await?;
let proposed_str = proposed.display().to_string();
git(
applied_dir,
// --update-head-ok lets us fetch into refs/heads/main while
// HEAD still points there. git's default safeguard refuses
// to avoid index/working-tree desync, but the working tree
// is empty (we just `init`'d) and we read-tree-reset right
// after, so the safeguard is moot here.
&[
"fetch",
"--no-tags",
"--update-head-ok",
&proposed_str,
"main:refs/heads/main",
],
)
.await?;
git_read_tree_reset(applied_dir, "refs/heads/main").await?;
git_tag(applied_dir, "deployed/0", "refs/heads/main").await?;
} else if git_rev_parse(applied_dir, "refs/tags/deployed/0")
.await
.is_err()
{
// Pre-overhaul applied repo — no deployed/* tag scheme,
// flake.nix may be untracked, agent.nix possibly authored by
// hive-c0re directly. The startup auto-migration fixes this
// in place; if it didn't run (or got skipped), surface a
// clear error.
bail!(
"applied repo at {} predates the meta-flake layout. \
Restart hive-c0re to let the auto-migration run, or \
destroy --purge {name} and re-spawn.",
applied_dir.display()
);
}
Ok(())
}
/// Create the per-agent Claude credentials directory when it does not exist
/// yet and, on Unix, restrict it to mode 0700 so only root inside the
/// container reads/writes it.
///
/// An existing directory is left completely untouched — neither contents nor
/// permissions change — which is what lets an agent's OAuth tokens survive
/// `destroy`/recreate.
///
/// # Errors
/// Fails when the directory cannot be created or its mode cannot be set.
fn ensure_claude_dir(claude_dir: &Path) -> Result<()> {
    if claude_dir.exists() {
        return Ok(());
    }
    std::fs::create_dir_all(claude_dir)
        .with_context(|| format!("create {}", claude_dir.display()))?;
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let owner_only = std::fs::Permissions::from_mode(0o700);
        std::fs::set_permissions(claude_dir, owner_only)
            .with_context(|| format!("chmod {}", claude_dir.display()))?;
    }
    Ok(())
}
fn ensure_state_dir(notes_dir: &Path) -> Result<()> {
if !notes_dir.exists() {
std::fs::create_dir_all(notes_dir)
.with_context(|| format!("create {}", notes_dir.display()))?;
}
Ok(())
}
/// Contents of the `agent.nix` seeded into a fresh proposed repo: an
/// otherwise empty NixOS module whose comments mention the agent's name and
/// show commented-out `imports` / `environment.systemPackages` examples.
fn initial_agent_nix(name: &str) -> String {
    let banner = format!(" # Per-agent overrides for {name}. This is a regular NixOS module");
    let rows = [
        "{ config, pkgs, lib, ... }:",
        "{",
        banner.as_str(),
        " # — add packages, services, modules, imports as needed.",
        " #",
        " # imports = [ ./extra-module.nix ];",
        " # environment.systemPackages = with pkgs; [ ];",
        "}",
    ];
    // One row per line, with a trailing newline after the closing brace.
    let mut module = rows.join("\n");
    module.push('\n');
    module
}
/// Text of the module-only `flake.nix` seeded into every agent's repo. The
/// hive-c0re-owned meta flake at `/var/lib/hyperhive/meta/` consumes it as a
/// flake input. The wrapper is deliberately permissive:
///
/// - The manager edits `inputs.* = …` to pull in other flakes (e.g. an MCP
///   server's own flake); the lock for those lands in the agent's own
///   `flake.lock` and rolls up into meta's lock transitively.
/// - The outputs block forwards every input except `self` into `agent.nix`
///   as the `flakeInputs` module argument, so the manager can reference
///   `flakeInputs.<name>.packages.${pkgs.system}.default` without further
///   plumbing.
///
/// Identity injection (`HIVE_PORT` / `HIVE_LABEL` / dashboard port / git
/// committer) still lives in the meta flake's wrapper.
pub fn initial_flake_nix() -> &'static str {
    // One literal per line of the generated file.
    concat!(
        "{\n",
        " description = \"hyperhive agent\";\n",
        " inputs = { };\n",
        " outputs =\n",
        " { self, ... }@inputs:\n",
        " {\n",
        " nixosModules.default = {\n",
        " imports = [ ./agent.nix ];\n",
        " _module.args.flakeInputs = builtins.removeAttrs inputs [ \"self\" ];\n",
        " };\n",
        " };\n",
        "}\n",
    )
}
/// Run `git commit -m <message>` in `dir` with the hive-c0re identity
/// (`GIT_NAME` / `GIT_EMAIL`) supplied inline through `-c`, so the commit
/// does not depend on any git configuration.
///
/// # Errors
/// Fails when the underlying `git` invocation fails.
async fn git_commit(dir: &Path, message: &str) -> Result<()> {
    let user_name = format!("user.name={GIT_NAME}");
    let user_email = format!("user.email={GIT_EMAIL}");
    let argv = [
        "-c",
        user_name.as_str(),
        "-c",
        user_email.as_str(),
        "commit",
        "-m",
        message,
    ];
    git(dir, &argv).await
}
/// Start building a `git` command.
///
/// The executable is taken from the `HYPERHIVE_GIT` environment variable (an
/// absolute path baked in by the NixOS module). When the variable cannot be
/// read, plain `git` is used and resolved through `PATH`.
#[must_use]
pub fn git_command() -> Command {
    let executable = match std::env::var("HYPERHIVE_GIT") {
        Ok(configured) => configured,
        Err(_) => String::from("git"),
    };
    Command::new(executable)
}
/// Run `git <args>` with `dir` as the working directory, capturing output.
///
/// # Errors
/// Fails when the process cannot be spawned, or when git exits
/// unsuccessfully — in which case the error contains the arguments, the exit
/// status and git's trimmed stderr.
async fn git(dir: &Path, args: &[&str]) -> Result<()> {
    // Rendered lazily: only needed on an error path.
    let rendered = || args.join(" ");

    let output = git_command()
        .current_dir(dir)
        .args(args)
        .output()
        .await
        .with_context(|| format!("git {} in {}", rendered(), dir.display()))?;

    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    bail!(
        "git {} failed ({}): {}",
        rendered(),
        output.status,
        stderr.trim()
    )
}
/// Fetch `sha` from the repo at `src` into the repo at `dst`, pinning it as
/// `refs/tags/<tag>`.
///
/// Used at `request_apply_commit` time so hive-c0re holds an immutable
/// handle on the manager's commit; later amendments or force-pushes in `src`
/// no longer affect what gets built.
///
/// Returns the sha the new tag resolves to — equal to `sha` on success, but
/// normalised (a short sha comes back expanded).
///
/// # Errors
/// Fails when the fetch or the follow-up `rev-parse` fails.
pub async fn git_fetch_to_tag(dst: &Path, src: &Path, sha: &str, tag: &str) -> Result<String> {
    let tag_ref = format!("refs/tags/{tag}");
    let refspec = format!("{sha}:{tag_ref}");
    let source = src.display().to_string();
    git(
        dst,
        &["fetch", "--no-tags", source.as_str(), refspec.as_str()],
    )
    .await?;
    git_rev_parse(dst, &tag_ref).await
}
/// Resolve `refname` (a tag, branch or sha) inside `dir` by running
/// `git rev-parse` and returning its trimmed stdout.
///
/// # Errors
/// Fails when git cannot be spawned or exits unsuccessfully (for example
/// because the ref does not exist); the error includes git's trimmed stderr.
pub async fn git_rev_parse(dir: &Path, refname: &str) -> Result<String> {
    let output = git_command()
        .current_dir(dir)
        .args(["rev-parse", refname])
        .output()
        .await
        .with_context(|| format!("git rev-parse {refname} in {}", dir.display()))?;

    if output.status.success() {
        let stdout = String::from_utf8_lossy(&output.stdout);
        return Ok(stdout.trim().to_owned());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    bail!(
        "git rev-parse {refname} failed ({}): {}",
        output.status,
        stderr.trim()
    )
}
/// Plant a lightweight tag at `target`. Errors if the tag already
/// exists — we want loud failures on id reuse, not silent
/// overwrites.
///
/// Runs `git tag <name> <target>` in `dir`; no force flag is passed.
pub async fn git_tag(dir: &Path, name: &str, target: &str) -> Result<()> {
git(dir, &["tag", name, target]).await
}
/// Create the annotated tag `name` at `target` in `dir`, with `body` as the
/// tag message.
///
/// Used for `failed/<id>` (body = build error) and `denied/<id>` (body =
/// operator note). The message is fed to `git tag -a -F -` on stdin, so
/// multi-line bodies need no escaping.
///
/// # Errors
/// Fails when git cannot be spawned, when the body cannot be written to its
/// stdin, or when git exits unsuccessfully (stderr is included).
pub async fn git_tag_annotated(dir: &Path, name: &str, target: &str, body: &str) -> Result<()> {
    use std::process::Stdio;
    use tokio::io::AsyncWriteExt;

    // An annotated tag is a git object and therefore needs a tagger identity,
    // just like a commit. The hive-c0re identity is passed inline instead of
    // relying on global git config — applied repos are hive-c0re-owned and
    // the host's user might not have user.email set.
    let user_name = format!("user.name={GIT_NAME}");
    let user_email = format!("user.email={GIT_EMAIL}");
    let argv = [
        "-c",
        user_name.as_str(),
        "-c",
        user_email.as_str(),
        "tag",
        "-a",
        name,
        target,
        "-F",
        "-",
    ];

    let mut child = git_command()
        .current_dir(dir)
        .args(argv)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .with_context(|| format!("spawn git tag -a {name} in {}", dir.display()))?;

    if let Some(mut pipe) = child.stdin.take() {
        pipe.write_all(body.as_bytes())
            .await
            .context("write tag body to git stdin")?;
        // `pipe` is dropped at the end of this block, closing stdin so git
        // sees EOF and can finish reading the message.
    }

    let output = child.wait_with_output().await.context("wait git tag -a")?;
    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    bail!(
        "git tag -a {name} failed ({}): {}",
        output.status,
        stderr.trim()
    )
}
/// Replace working tree + index with the tree at `target` without
/// moving HEAD. `applied/main` stays pointing at the last known-good
/// `deployed/*` while we let `nixos-container update` evaluate the
/// candidate. On build failure callers reset back to HEAD; on
/// success they fast-forward main to `target`.
///
/// Runs `git read-tree --reset -u <target>` in `dir`.
pub async fn git_read_tree_reset(dir: &Path, target: &str) -> Result<()> {
git(dir, &["read-tree", "--reset", "-u", target]).await
}
/// Hard-set a ref to `target`. Used to fast-forward `refs/heads/main`
/// to the just-deployed proposal commit. Uses `update-ref`, not
/// `branch -f`, so it works regardless of where HEAD currently sits.
///
/// Runs `git update-ref <refname> <target>` in `dir`.
pub async fn git_update_ref(dir: &Path, refname: &str, target: &str) -> Result<()> {
git(dir, &["update-ref", refname, target]).await
}
/// Write the `hyperhive-limits.conf` systemd drop-in for
/// `container@<container>.service`, carrying the default `MemoryMax` and
/// `CPUQuota` caps.
///
/// The drop-in lives under `/run/systemd/system/...`, so it is ephemeral and
/// gets regenerated on every spawn / rebuild. This function only writes the
/// file; reloading systemd is a separate step.
///
/// # Errors
/// Fails when the drop-in directory or file cannot be written.
fn set_resource_limits(container: &str) -> Result<()> {
    let dir = format!("/run/systemd/system/container@{container}.service.d");
    std::fs::create_dir_all(&dir).with_context(|| format!("create {dir}"))?;

    let path = format!("{dir}/hyperhive-limits.conf");
    let mut content = String::from("[Service]\n");
    content.push_str(&format!("MemoryMax={DEFAULT_MEMORY_MAX}\n"));
    content.push_str(&format!("CPUQuota={DEFAULT_CPU_QUOTA}\n"));
    std::fs::write(&path, content).with_context(|| format!("write {path}"))?;

    tracing::info!(
        %path,
        memory_max = DEFAULT_MEMORY_MAX,
        cpu_quota = DEFAULT_CPU_QUOTA,
        "wrote resource limits drop-in"
    );
    Ok(())
}
/// Run `systemctl daemon-reload`; spawn and rebuild call it after rewriting
/// the container's drop-in and nspawn flags.
///
/// # Errors
/// Fails when `systemctl` cannot be invoked or exits unsuccessfully (the
/// error includes its exit status and trimmed stderr).
async fn systemd_daemon_reload() -> Result<()> {
    let output = Command::new("systemctl")
        .arg("daemon-reload")
        .output()
        .await
        .context("invoke systemctl daemon-reload")?;

    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    bail!(
        "systemctl daemon-reload failed ({}): {}",
        output.status,
        stderr.trim()
    )
}
/// Where in the container's filesystem the manager sees its agents tree.
/// Matches the `/agents` path that pre-Phase-8 hosts declared via
/// `containers.hm1nd.bindMounts."/agents"`.
pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents";
/// Where the manager sees the applied trees of every agent, read-only.
/// Manager runs `git fetch /applied/<n>/.git refs/tags/*:refs/tags/applied/*`
/// to learn what hive-c0re deployed (or rejected, or failed to
/// build); the RO bind makes accidental writes impossible from
/// inside the container.
pub const CONTAINER_MANAGER_APPLIED_MOUNT: &str = "/applied";
/// The on-host root that gets bind-mounted to `/agents` inside the manager.
/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (kept duplicated
/// here so lifecycle stays usable as a leaf module).
const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents";
/// On-host applied repo root, mirrored RO into the manager. Matches
/// `APPLIED_STATE_ROOT` in coordinator.rs.
const HOST_APPLIED_ROOT: &str = "/var/lib/hyperhive/applied";
/// On-host meta repo root, mirrored RO into the manager. Matches
/// `meta::meta_dir()` but duplicated here so lifecycle stays a leaf.
const HOST_META_ROOT: &str = "/var/lib/hyperhive/meta";
/// Shared directory accessible to all agents. All agents bind-mount this RW.
const HOST_SHARED_ROOT: &str = "/var/lib/hyperhive/shared";

/// Idempotently rewrite the lines in `/etc/nixos-containers/<container>.conf`
/// that hive-c0re owns: `PRIVATE_NETWORK` (forced 0 so the agent's web UI port
/// is reachable on the host), the host/local address and bridge knobs
/// (blanked for the same reason) and `EXTRA_NSPAWN_FLAGS` (the bind mounts).
/// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
/// `systemd-nspawn` command.
///
/// Every container gets its runtime dir, Claude credentials dir, state dir
/// and the shared dir bound read-write. The manager additionally gets
/// `/agents` (RW), `/applied` (RO) and the meta mount (RO).
///
/// All other lines of the existing file are preserved in order; the owned
/// lines are dropped wherever they were and re-appended at the end.
///
/// # Errors
/// Fails when a bind source directory cannot be created, or when the
/// container's `.conf` file cannot be read or written.
fn set_nspawn_flags(
    container: &str,
    runtime_dir: &Path,
    claude_dir: &Path,
    notes_dir: &Path,
) -> Result<()> {
    /// Settings this function owns; existing lines that set them are dropped.
    const OWNED_KEYS: [&str; 7] = [
        "EXTRA_NSPAWN_FLAGS=",
        "PRIVATE_NETWORK=",
        "HOST_ADDRESS=",
        "LOCAL_ADDRESS=",
        "HOST_ADDRESS6=",
        "LOCAL_ADDRESS6=",
        "HOST_BRIDGE=",
    ];

    // Ensure /shared directory exists before binding. systemd-nspawn requires
    // the bind source to exist.
    std::fs::create_dir_all(HOST_SHARED_ROOT)
        .with_context(|| format!("create {HOST_SHARED_ROOT}"))?;
    let path = format!("/etc/nixos-containers/{container}.conf");
    let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;

    // Compute the in-container state mount point. Sub-agents get
    // /agents/<name>/state; the manager keeps the legacy /state path.
    // Claude credentials always land at /root/.claude for all agents so
    // the `claude` CLI (which reads $HOME/.claude) finds them without any
    // HOME override.
    let notes_mount = if container == MANAGER_NAME {
        CONTAINER_NOTES_MOUNT.to_owned()
    } else {
        let agent_name = container.strip_prefix(AGENT_PREFIX).unwrap_or(container);
        format!("/agents/{agent_name}/state")
    };
    let claude_mount = CONTAINER_CLAUDE_MOUNT;
    let mut binds = format!(
        "--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{claude_mount} --bind={notes}:{notes_mount} --bind={shared}:{CONTAINER_SHARED_MOUNT}",
        runtime = runtime_dir.display(),
        claude = claude_dir.display(),
        notes = notes_dir.display(),
        shared = HOST_SHARED_ROOT,
    );
    if container == MANAGER_NAME {
        use std::fmt::Write as _;
        // systemd-nspawn refuses to start a container whose bind
        // source doesn't exist. The meta repo is created by the
        // startup migration, but make sure the directory is there
        // before the manager comes up in case set_nspawn_flags fires
        // first (e.g. cold start with no agents).
        std::fs::create_dir_all(HOST_META_ROOT)
            .with_context(|| format!("create {HOST_META_ROOT}"))?;
        // Manager edits sub-agent proposed/ repos and its own. RW so it can
        // git-commit. Sub-agents see only their own /run/hive socket and
        // /root/.claude (no /agents or /applied).
        //
        // /applied is a separate RO mount of the hive-c0re-only applied
        // repos so the manager can `git fetch /applied/<n>/.git
        // refs/tags/*:refs/tags/applied/*` to mirror deployed/failed/
        // denied tags into its proposed clones and diff against
        // what's actually deployed. RO bind makes destructive git
        // plumbing inside the container unable to corrupt applied.
        //
        // /meta is a third RO mount exposing the system-wide deploy
        // flake (`git log /meta --oneline` shows every deploy across
        // every agent; `cat /meta/flake.lock` resolves which sha each
        // agent is pinned at right now).
        //
        // Formatting into a `String` cannot fail, so the results are ignored.
        let _ = write!(
            binds,
            " --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}",
        );
        let _ = write!(
            binds,
            " --bind-ro={HOST_APPLIED_ROOT}:{CONTAINER_MANAGER_APPLIED_MOUNT}",
        );
        let _ = write!(
            binds,
            " --bind-ro={HOST_META_ROOT}:{mount}",
            mount = crate::meta::CONTAINER_MANAGER_META_MOUNT,
        );
    }
    let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\"");

    // Strip any network-namespace knobs nixos-container's create might have
    // populated. The start script adds `--network-veth` whenever
    // HOST_ADDRESS / LOCAL_ADDRESS (or their IPv6 cousins) are non-empty —
    // and veth implies a private netns, hiding our web-UI port from the
    // host. Force host netns.
    let mut lines: Vec<String> = original
        .lines()
        .filter(|line| {
            let trimmed = line.trim_start();
            !OWNED_KEYS.iter().any(|key| trimmed.starts_with(*key))
        })
        .map(str::to_owned)
        .collect();
    lines.extend(
        [
            "PRIVATE_NETWORK=0",
            "HOST_ADDRESS=",
            "LOCAL_ADDRESS=",
            "HOST_ADDRESS6=",
            "LOCAL_ADDRESS6=",
            "HOST_BRIDGE=",
        ]
        .map(str::to_owned),
    );
    lines.push(bind_flag);

    let mut content = lines.join("\n");
    content.push('\n');
    std::fs::write(&path, content).with_context(|| format!("write {path}"))?;
    tracing::info!(%path, "set PRIVATE_NETWORK=0 + EXTRA_NSPAWN_FLAGS");
    Ok(())
}
/// Spawn `nixos-container <args>` and pipe its stdout + stderr into
/// `tracing` one line at a time so a long-running command (most
/// notably `update`, which kicks off a full nix build that can run
/// for minutes on a stale flake) shows progress in journald as it
/// happens. A buffered `.output()` would only flush the summary at exit,
/// making "slow" and "stuck" look identical to the operator watching
/// `journalctl -u hive-c0re -f`.
///
/// stdout lines log at INFO, stderr at WARN. The last 32 stderr lines are
/// also kept so the bailout message can include the actual failure reason
/// (nix dumps eval errors to stderr).
///
/// Output is decoded lossily: a line containing invalid UTF-8 is logged with
/// replacement characters instead of ending the pump, which would close the
/// pipe while the child is still writing to it.
///
/// # Errors
/// Fails when the process cannot be spawned or waited on, or when it exits
/// unsuccessfully (the error carries the exit status and the stderr tail).
async fn run(args: &[&str]) -> Result<()> {
    use std::collections::VecDeque;
    use tokio::io::BufReader;

    /// How many trailing stderr lines are kept for the error message.
    const STDERR_TAIL_LINES: usize = 32;

    let cmdline = args.join(" ");
    let mut child = Command::new("nixos-container")
        .args(args)
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .with_context(|| format!("invoke nixos-container {cmdline}"))?;
    // Both handles were requested as piped just above, so they are present.
    let stdout = child.stdout.take().expect("piped stdout");
    let stderr = child.stderr.take().expect("piped stderr");

    let stdout_cmdline = cmdline.clone();
    let pump_stdout = tokio::spawn(async move {
        let mut reader = BufReader::new(stdout);
        while let Some(line) = next_lossy_line(&mut reader).await {
            tracing::info!(target: "nixos-container", cmdline = %stdout_cmdline, "{line}");
        }
    });

    // The stderr pump owns the tail and hands it back when the stream ends.
    // Newer lines push older ones out; nix's actual error usually lands in
    // the last few lines.
    let stderr_cmdline = cmdline.clone();
    let pump_stderr = tokio::spawn(async move {
        let mut tail: VecDeque<String> = VecDeque::with_capacity(STDERR_TAIL_LINES);
        let mut reader = BufReader::new(stderr);
        while let Some(line) = next_lossy_line(&mut reader).await {
            tracing::warn!(target: "nixos-container", cmdline = %stderr_cmdline, "{line}");
            if tail.len() == STDERR_TAIL_LINES {
                tail.pop_front();
            }
            tail.push_back(line);
        }
        tail
    });

    let status = child
        .wait()
        .await
        .with_context(|| format!("wait nixos-container {cmdline}"))?;
    let _ = pump_stdout.await;
    // A panicked pump only costs us the tail, never the exit status.
    let stderr_tail = pump_stderr.await.unwrap_or_default();

    if !status.success() {
        let tail = Vec::from(stderr_tail).join("\n");
        bail!("nixos-container {cmdline} failed ({status}): {tail}");
    }
    Ok(())
}

/// Read the next line from `reader` as raw bytes and decode it lossily.
///
/// Returns `None` at end of stream or on a read error. The trailing `\n`
/// (and a `\r` directly before it) is stripped.
async fn next_lossy_line<R>(reader: &mut R) -> Option<String>
where
    R: tokio::io::AsyncBufRead + Unpin,
{
    use tokio::io::AsyncBufReadExt;

    let mut raw = Vec::new();
    match reader.read_until(b'\n', &mut raw).await {
        Ok(0) | Err(_) => None,
        Ok(_) => {
            if raw.last() == Some(&b'\n') {
                raw.pop();
                if raw.last() == Some(&b'\r') {
                    raw.pop();
                }
            }
            Some(String::from_utf8_lossy(&raw).into_owned())
        }
    }
}