910 lines
36 KiB
Rust
910 lines
36 KiB
Rust
//! `nixos-container` lifecycle + per-agent config flake generation.
|
|
|
|
use std::path::Path;
|
|
|
|
use anyhow::{Context, Result, bail};
|
|
use tokio::process::Command;
|
|
|
|
/// Prefix prepended to every sub-agent's container name.
///
/// `nixos-container` caps the whole container name at 11 chars (the name is
/// encoded into network interface names), so only `MAX_AGENT_NAME` chars are
/// left for the agent name itself.
pub const AGENT_PREFIX: &str = "h-";

/// Longest permitted sub-agent name: the 11-char container-name cap minus
/// the length of `AGENT_PREFIX`.
pub const MAX_AGENT_NAME: usize = 11 - AGENT_PREFIX.len();

/// Container name of the manager.
///
/// The manager shares the sub-agents' path scheme
/// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`),
/// but its container carries no `h-` prefix and it extends a different
/// nixosConfiguration (`manager`, not `agent-base`).
pub const MANAGER_NAME: &str = "hm1nd";

/// Web UI port reserved for the manager (sub-agents hash into 8100..8999).
pub const MANAGER_PORT: u16 = 8000;

/// In-container mount point of the per-agent runtime directory.
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";

/// In-container mount point of the per-agent Claude credentials dir.
/// The host side persists across destroy/recreate so OAuth login survives.
pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude";

/// In-container mount point of the per-agent durable knowledge dir.
/// The system prompt tells agents to keep `notes.md` and other scratch state
/// here; it persists across destroy/recreate.
pub const CONTAINER_NOTES_MOUNT: &str = "/state";

/// In-container mount point of the directory shared by all agents.
/// Every agent can read and write here, so agents should only store things
/// they are willing to lose (other agents may delete them).
pub const CONTAINER_SHARED_MOUNT: &str = "/shared";

/// Git identity used for commits and annotated tags created by hive-c0re.
const GIT_NAME: &str = "c0re";
const GIT_EMAIL: &str = "c0re@hyperhive";

/// Sub-agent web UI port range. The port is derived deterministically from
/// the agent's name (FNV-1a hash mod range size), so the dashboard can
/// compute the same port without asking hive-c0re.
const WEB_PORT_BASE: u16 = 8100;
const WEB_PORT_RANGE: u16 = 900;

/// Default resource caps applied to every managed container via a systemd
/// drop-in under `/run/systemd/system/container@<NAME>.service.d/`.
const DEFAULT_MEMORY_MAX: &str = "2G";
const DEFAULT_CPU_QUOTA: &str = "50%";
|
|
|
|
/// Per-agent web UI port. Manager is fixed at `MANAGER_PORT`; every
|
|
/// sub-agent is `WEB_PORT_BASE + FNV-1a(name) % WEB_PORT_RANGE`,
|
|
/// pure and reproducible from just the name. Collisions are
|
|
/// possible (birthday paradox at ~30 agents); the operator resolves
|
|
/// them by renaming an agent (different hash → different port).
|
|
/// Stable across hosts, restarts, and dashboard renders — no
|
|
/// state-file dance.
|
|
#[must_use]
|
|
pub fn agent_web_port(name: &str) -> u16 {
|
|
if name == MANAGER_NAME {
|
|
return MANAGER_PORT;
|
|
}
|
|
let mut hash: u32 = 2_166_136_261;
|
|
for b in name.bytes() {
|
|
hash ^= u32::from(b);
|
|
hash = hash.wrapping_mul(16_777_619);
|
|
}
|
|
// Modulo of a u32 by a u16's value is guaranteed < u16::MAX, so try_from never fails.
|
|
WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
|
|
}
|
|
|
|
#[must_use]
|
|
pub fn container_name(name: &str) -> String {
|
|
if name == MANAGER_NAME {
|
|
MANAGER_NAME.to_owned()
|
|
} else {
|
|
format!("{AGENT_PREFIX}{name}")
|
|
}
|
|
}
|
|
|
|
#[must_use]
|
|
pub fn is_manager(name: &str) -> bool {
|
|
name == MANAGER_NAME
|
|
}
|
|
|
|
fn validate(name: &str) -> Result<()> {
|
|
if name.is_empty() {
|
|
bail!("agent name must not be empty");
|
|
}
|
|
if is_manager(name) {
|
|
return Ok(());
|
|
}
|
|
if name.len() > MAX_AGENT_NAME {
|
|
bail!(
|
|
"agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}",
|
|
name.len()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// First name (≠ `self_name`) currently running whose hashed port
|
|
/// matches this agent's. The harness inside the colliding container
|
|
/// would otherwise loop on `AddrInUse` forever; we surface the
|
|
/// conflict here so spawn / rebuild fails loudly with an actionable
|
|
/// message instead.
|
|
async fn port_collision(self_name: &str) -> Option<String> {
|
|
let port = agent_web_port(self_name);
|
|
let raw = list().await.unwrap_or_default();
|
|
for c in raw {
|
|
let other = if c == MANAGER_NAME {
|
|
MANAGER_NAME.to_owned()
|
|
} else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
|
|
n.to_owned()
|
|
} else {
|
|
continue;
|
|
};
|
|
if other == self_name {
|
|
continue;
|
|
}
|
|
if agent_web_port(&other) == port && is_running(&other).await {
|
|
return Some(other);
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub async fn spawn(
|
|
name: &str,
|
|
hyperhive_flake: &str,
|
|
agent_dir: &Path,
|
|
proposed_dir: &Path,
|
|
applied_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
dashboard_port: u16,
|
|
operator_pronouns: &str,
|
|
) -> Result<()> {
|
|
validate(name)?;
|
|
if let Some(other) = port_collision(name).await {
|
|
bail!(
|
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
|
agent_web_port(name)
|
|
);
|
|
}
|
|
setup_proposed(proposed_dir, name).await?;
|
|
setup_applied(applied_dir, Some(proposed_dir), name).await?;
|
|
ensure_claude_dir(claude_dir)?;
|
|
ensure_state_dir(notes_dir)?;
|
|
// Meta flake gets the new agent's input + nixosConfiguration
|
|
// before `nixos-container create` so the `--flake meta#<name>`
|
|
// ref resolves.
|
|
let agents = agents_after_spawn(name).await?;
|
|
crate::meta::sync_agents(hyperhive_flake, dashboard_port, operator_pronouns, &agents).await?;
|
|
let container = container_name(name);
|
|
let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display());
|
|
run(&["create", &container, "--flake", &flake_ref]).await?;
|
|
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
|
|
set_resource_limits(&container)?;
|
|
systemd_daemon_reload().await?;
|
|
run(&["start", &container]).await
|
|
}
|
|
|
|
/// Build the `AgentSpec` list for the meta flake from `nixos-container
|
|
/// list` + a hypothetical extra name not yet in the list (for spawn
|
|
/// where the new agent's container doesn't exist yet). Pass empty
|
|
/// `name_to_add` from rebuild paths where the agent is already in the
|
|
/// container list.
|
|
async fn agents_for_meta(name_to_add: Option<&str>) -> Result<Vec<crate::meta::AgentSpec>> {
|
|
let containers = list().await.unwrap_or_default();
|
|
let mut out: Vec<crate::meta::AgentSpec> = containers
|
|
.into_iter()
|
|
.filter_map(|c| {
|
|
let (name, is_manager) = if c == MANAGER_NAME {
|
|
(MANAGER_NAME.to_owned(), true)
|
|
} else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
|
|
(n.to_owned(), false)
|
|
} else {
|
|
return None;
|
|
};
|
|
Some(crate::meta::AgentSpec {
|
|
port: agent_web_port(&name),
|
|
name,
|
|
is_manager,
|
|
})
|
|
})
|
|
.collect();
|
|
if let Some(extra) = name_to_add
|
|
&& !out.iter().any(|a| a.name == extra)
|
|
{
|
|
out.push(crate::meta::AgentSpec {
|
|
name: extra.to_owned(),
|
|
is_manager: is_manager(extra),
|
|
port: agent_web_port(extra),
|
|
});
|
|
}
|
|
out.sort_by(|a, b| a.name.cmp(&b.name));
|
|
Ok(out)
|
|
}
|
|
|
|
async fn agents_after_spawn(name: &str) -> Result<Vec<crate::meta::AgentSpec>> {
|
|
agents_for_meta(Some(name)).await
|
|
}
|
|
|
|
/// Public enumeration of currently-existing agents (whatever
|
|
/// `nixos-container list` says), sorted, no extras. For callers
|
|
/// outside this module that need to reseed meta after lifecycle
|
|
/// changes — destroy, startup reconciliation, etc.
|
|
pub async fn agents_for_meta_listing() -> Result<Vec<crate::meta::AgentSpec>> {
|
|
agents_for_meta(None).await
|
|
}
|
|
|
|
pub async fn kill(name: &str) -> Result<()> {
|
|
validate(name)?;
|
|
let container = container_name(name);
|
|
run(&["stop", &container]).await
|
|
}
|
|
|
|
pub async fn start(name: &str) -> Result<()> {
|
|
validate(name)?;
|
|
let container = container_name(name);
|
|
run(&["start", &container]).await
|
|
}
|
|
|
|
/// Stop + start without regenerating any config. For "kick the container"
|
|
/// without touching the flake or nspawn flags.
|
|
pub async fn restart(name: &str) -> Result<()> {
|
|
kill(name).await?;
|
|
start(name).await
|
|
}
|
|
|
|
/// True when the container's systemd unit is active. Used by the dashboard
|
|
/// to gate stop/restart buttons.
|
|
pub async fn is_running(name: &str) -> bool {
|
|
let container = container_name(name);
|
|
let unit = format!("container@{container}.service");
|
|
Command::new("systemctl")
|
|
.args(["is-active", "--quiet", &unit])
|
|
.status()
|
|
.await
|
|
.map(|s| s.success())
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
/// Fully tear down a sub-agent's container: stop + remove via `nixos-container
|
|
/// destroy`, then clean our own systemd drop-in. Leaves it to the caller to
|
|
/// wipe `/var/lib/hyperhive/...` state and the per-agent runtime dir.
|
|
pub async fn destroy(name: &str) -> Result<()> {
|
|
validate(name)?;
|
|
let container = container_name(name);
|
|
// nixos-container destroy handles stop + removal of /var/lib/nixos-containers/<C>
|
|
// and /etc/nixos-containers/<C>.conf. Tolerate "no such container".
|
|
if let Err(e) = run(&["destroy", &container]).await {
|
|
tracing::warn!(error = ?e, "nixos-container destroy returned an error; continuing cleanup");
|
|
}
|
|
let dropin_dir = format!("/run/systemd/system/container@{container}.service.d");
|
|
if std::path::Path::new(&dropin_dir).exists() {
|
|
std::fs::remove_dir_all(&dropin_dir).with_context(|| format!("remove {dropin_dir}"))?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub async fn rebuild(
|
|
name: &str,
|
|
hyperhive_flake: &str,
|
|
agent_dir: &Path,
|
|
applied_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
dashboard_port: u16,
|
|
operator_pronouns: &str,
|
|
) -> Result<()> {
|
|
// Sync the meta flake (idempotent — no-op when the rendered
|
|
// flake matches disk) so a manual rebuild from the dashboard
|
|
// can also recover from a divergent meta repo (e.g. an agent
|
|
// got added directly via `nixos-container create` outside
|
|
// hive-c0re).
|
|
let agents = agents_for_meta(None).await?;
|
|
crate::meta::sync_agents(hyperhive_flake, dashboard_port, operator_pronouns, &agents).await?;
|
|
// Then bump just this agent's input — picks up whatever
|
|
// `applied/<n>/main` currently points at (deployed/<latest>).
|
|
// Commits the lock if it changed.
|
|
crate::meta::lock_update_for_rebuild(name).await?;
|
|
rebuild_no_meta(name, agent_dir, applied_dir, claude_dir, notes_dir).await
|
|
}
|
|
|
|
/// Container-level rebuild without touching the meta repo. Callers
|
|
/// that own the meta side themselves (`actions::run_apply_commit`
|
|
/// drives meta through the two-phase prepare/finalize/abort flow)
|
|
/// use this directly. Public `rebuild` wraps it with idempotent meta
|
|
/// sync + lock-bump-and-commit.
|
|
pub async fn rebuild_no_meta(
|
|
name: &str,
|
|
agent_dir: &Path,
|
|
applied_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
) -> Result<()> {
|
|
validate(name)?;
|
|
if let Some(other) = port_collision(name).await {
|
|
bail!(
|
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
|
agent_web_port(name)
|
|
);
|
|
}
|
|
setup_applied(applied_dir, None, name).await?;
|
|
ensure_claude_dir(claude_dir)?;
|
|
ensure_state_dir(notes_dir)?;
|
|
let container = container_name(name);
|
|
let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display());
|
|
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
|
|
set_resource_limits(&container)?;
|
|
systemd_daemon_reload().await?;
|
|
run(&["update", &container, "--flake", &flake_ref]).await?;
|
|
// Restart so any nspawn-level changes (bind mounts, networking, etc.) apply.
|
|
run(&["stop", &container]).await?;
|
|
run(&["start", &container]).await
|
|
}
|
|
|
|
pub async fn list() -> Result<Vec<String>> {
|
|
let out = Command::new("nixos-container")
|
|
.arg("list")
|
|
.output()
|
|
.await
|
|
.context("invoke nixos-container list")?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"nixos-container list exited with status {}: {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(String::from_utf8_lossy(&out.stdout)
|
|
.lines()
|
|
.map(str::trim)
|
|
.filter(|line| line.starts_with(AGENT_PREFIX) || *line == MANAGER_NAME)
|
|
.map(str::to_owned)
|
|
.collect())
|
|
}
|
|
|
|
/// Initialize the manager-editable proposed repo. Seeds two tracked
|
|
/// files: `agent.nix` (the module the manager edits) and `flake.nix`
|
|
/// (the boilerplate that lets the meta flake import this repo as an
|
|
/// input — meta locks at a specific sha and reads
|
|
/// `nixosModules.default`, so `flake.nix` must be in the commit). The
|
|
/// manager shouldn't edit `flake.nix` (the prompt says so) but it's
|
|
/// visible so they can introspect.
|
|
///
|
|
/// Touched by hive-c0re only on first spawn — never again — so the
|
|
/// manager can't be surprised by hive-c0re commits or working-tree
|
|
/// resets.
|
|
pub async fn setup_proposed(proposed_dir: &Path, name: &str) -> Result<()> {
|
|
let fresh = !proposed_dir.join(".git").exists();
|
|
if fresh {
|
|
std::fs::create_dir_all(proposed_dir)
|
|
.with_context(|| format!("create {}", proposed_dir.display()))?;
|
|
let agent_path = proposed_dir.join("agent.nix");
|
|
if !agent_path.exists() {
|
|
std::fs::write(&agent_path, initial_agent_nix(name))
|
|
.with_context(|| format!("write {}", agent_path.display()))?;
|
|
}
|
|
let flake_path = proposed_dir.join("flake.nix");
|
|
if !flake_path.exists() {
|
|
std::fs::write(&flake_path, initial_flake_nix())
|
|
.with_context(|| format!("write {}", flake_path.display()))?;
|
|
}
|
|
git(proposed_dir, &["init", "--initial-branch=main"]).await?;
|
|
git(proposed_dir, &["add", "agent.nix", "flake.nix"]).await?;
|
|
git_commit(proposed_dir, "hive-c0re init").await?;
|
|
}
|
|
// Idempotently wire the `applied` remote — purely for the
|
|
// manager's ergonomics. The URL is the path inside the manager
|
|
// container (`/applied/<n>/.git`), where the RO bind in
|
|
// `set_nspawn_flags` makes it real. hive-c0re itself never
|
|
// dereferences this remote; the host-side fetch in
|
|
// `request_apply_commit` uses absolute host paths.
|
|
ensure_applied_remote(proposed_dir, name).await
|
|
}
|
|
|
|
async fn ensure_applied_remote(proposed_dir: &Path, name: &str) -> Result<()> {
|
|
let want = format!("/applied/{name}/.git");
|
|
let existing = git_command()
|
|
.current_dir(proposed_dir)
|
|
.args(["remote", "get-url", "applied"])
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("git remote get-url applied in {}", proposed_dir.display()))?;
|
|
if existing.status.success() {
|
|
let current = String::from_utf8_lossy(&existing.stdout).trim().to_owned();
|
|
if current == want {
|
|
return Ok(());
|
|
}
|
|
// URL drifted (path scheme changed, etc.) — re-point it.
|
|
return git(proposed_dir, &["remote", "set-url", "applied", &want]).await;
|
|
}
|
|
git(proposed_dir, &["remote", "add", "applied", &want]).await
|
|
}
|
|
|
|
/// Set up the applied repo. First-spawn only: init the repo, pull
|
|
/// proposed's initial commit in via `git fetch`, tag it `deployed/0`.
|
|
/// This is the *only* time hive-c0re reads from `proposed` for an
|
|
/// agent — subsequent proposals are fetched at `request_apply_commit`
|
|
/// time and tagged `proposal/<id>` (see `actions::approve` for the
|
|
/// tag state machine).
|
|
///
|
|
/// `proposed_dir` is `None` on rebuild paths where the repo already
|
|
/// exists — we just verify it's the right shape and bail otherwise.
|
|
/// Unlike the pre-overhaul code path, `flake.nix` is no longer
|
|
/// regenerated at the host level: it's tracked in proposed (seeded by
|
|
/// `setup_proposed`) and rides along on every fetch.
|
|
pub async fn setup_applied(
|
|
applied_dir: &Path,
|
|
proposed_dir: Option<&Path>,
|
|
name: &str,
|
|
) -> Result<()> {
|
|
std::fs::create_dir_all(applied_dir)
|
|
.with_context(|| format!("create {}", applied_dir.display()))?;
|
|
|
|
if !applied_dir.join(".git").exists() {
|
|
let Some(proposed) = proposed_dir else {
|
|
bail!(
|
|
"applied repo at {} is missing its .git directory; \
|
|
cannot rebuild without a proposed source to seed from. \
|
|
destroy --purge and re-spawn this agent.",
|
|
applied_dir.display()
|
|
);
|
|
};
|
|
git(applied_dir, &["init", "--initial-branch=main"]).await?;
|
|
let proposed_str = proposed.display().to_string();
|
|
git(
|
|
applied_dir,
|
|
// --update-head-ok lets us fetch into refs/heads/main while
|
|
// HEAD still points there. git's default safeguard refuses
|
|
// to avoid index/working-tree desync, but the working tree
|
|
// is empty (we just `init`'d) and we read-tree-reset right
|
|
// after, so the safeguard is moot here.
|
|
&[
|
|
"fetch",
|
|
"--no-tags",
|
|
"--update-head-ok",
|
|
&proposed_str,
|
|
"main:refs/heads/main",
|
|
],
|
|
)
|
|
.await?;
|
|
git_read_tree_reset(applied_dir, "refs/heads/main").await?;
|
|
git_tag(applied_dir, "deployed/0", "refs/heads/main").await?;
|
|
} else if git_rev_parse(applied_dir, "refs/tags/deployed/0")
|
|
.await
|
|
.is_err()
|
|
{
|
|
// Pre-overhaul applied repo — no deployed/* tag scheme,
|
|
// flake.nix may be untracked, agent.nix possibly authored by
|
|
// hive-c0re directly. The startup auto-migration fixes this
|
|
// in place; if it didn't run (or got skipped), surface a
|
|
// clear error.
|
|
bail!(
|
|
"applied repo at {} predates the meta-flake layout. \
|
|
Restart hive-c0re to let the auto-migration run, or \
|
|
destroy --purge {name} and re-spawn.",
|
|
applied_dir.display()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Create the per-agent Claude credentials dir if missing. Mode 0700 — only
|
|
/// root inside the container reads/writes it. Idempotent: existing dirs are
|
|
/// left untouched (an agent's OAuth tokens survive `destroy`/recreate).
|
|
fn ensure_claude_dir(claude_dir: &Path) -> Result<()> {
|
|
if !claude_dir.exists() {
|
|
std::fs::create_dir_all(claude_dir)
|
|
.with_context(|| format!("create {}", claude_dir.display()))?;
|
|
#[cfg(unix)]
|
|
{
|
|
use std::os::unix::fs::PermissionsExt;
|
|
std::fs::set_permissions(claude_dir, std::fs::Permissions::from_mode(0o700))
|
|
.with_context(|| format!("chmod {}", claude_dir.display()))?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn ensure_state_dir(notes_dir: &Path) -> Result<()> {
|
|
if !notes_dir.exists() {
|
|
std::fs::create_dir_all(notes_dir)
|
|
.with_context(|| format!("create {}", notes_dir.display()))?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Contents of the `agent.nix` seeded into a fresh proposed repo: an empty
/// NixOS module whose comments mention the agent `name` and show example
/// options.
fn initial_agent_nix(name: &str) -> String {
    let banner = format!(" # Per-agent overrides for {name}. This is a regular NixOS module");
    // The trailing empty entry makes `join` end the file with a newline.
    [
        "{ config, pkgs, lib, ... }:",
        "{",
        banner.as_str(),
        " # — add packages, services, modules, imports as needed.",
        " #",
        " # imports = [ ./extra-module.nix ];",
        " # environment.systemPackages = with pkgs; [ ];",
        "}",
        "",
    ]
    .join("\n")
}
|
|
|
|
/// Module-only flake seeded into every agent's repo. The hive-c0re-owned
/// meta flake at `/var/lib/hyperhive/meta/` consumes it as a flake input.
/// The wrapper is deliberately permissive:
///
/// - The manager edits `inputs.* = …` to pull in other flakes (e.g. an MCP
///   server's own flake); the lock for those lands in the agent's own
///   `flake.lock` and rolls up into meta's lock transitively.
/// - The outputs block passes every input except `self` to `agent.nix` as
///   the `flakeInputs` module argument, so the manager can simply reference
///   `flakeInputs.<name>.packages.${pkgs.system}.default`.
///
/// Identity injection (`HIVE_PORT` / `HIVE_LABEL` / dashboard port / git
/// committer) still lives in the meta flake's wrapper.
pub fn initial_flake_nix() -> &'static str {
    concat!(
        "{\n",
        " description = \"hyperhive agent\";\n",
        " inputs = { };\n",
        " outputs =\n",
        " { self, ... }@inputs:\n",
        " {\n",
        " nixosModules.default = {\n",
        " imports = [ ./agent.nix ];\n",
        " _module.args.flakeInputs = builtins.removeAttrs inputs [ \"self\" ];\n",
        " };\n",
        " };\n",
        "}\n",
    )
}
|
|
|
|
async fn git_commit(dir: &Path, message: &str) -> Result<()> {
|
|
git(
|
|
dir,
|
|
&[
|
|
"-c",
|
|
&format!("user.name={GIT_NAME}"),
|
|
"-c",
|
|
&format!("user.email={GIT_EMAIL}"),
|
|
"commit",
|
|
"-m",
|
|
message,
|
|
],
|
|
)
|
|
.await
|
|
}
|
|
|
|
/// Spawn `git` honoring the `HYPERHIVE_GIT` env var (absolute path baked in
|
|
/// by the NixOS module), falling back to bare `git` (PATH lookup) otherwise.
|
|
#[must_use]
|
|
pub fn git_command() -> Command {
|
|
let exe = std::env::var("HYPERHIVE_GIT").unwrap_or_else(|_| "git".into());
|
|
Command::new(exe)
|
|
}
|
|
|
|
async fn git(dir: &Path, args: &[&str]) -> Result<()> {
|
|
let out = git_command()
|
|
.current_dir(dir)
|
|
.args(args)
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"git {} failed ({}): {}",
|
|
args.join(" "),
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Fetch `sha` from the `src` git repo into `dst` and pin it as
|
|
/// `refs/tags/<tag>`. Used at `request_apply_commit` time so hive-c0re
|
|
/// captures an immutable handle on the manager's commit; subsequent
|
|
/// amendments / force-pushes in `src` no longer affect what gets
|
|
/// built. Returns the resolved sha (which equals `sha` on success
|
|
/// but normalised — short shas get expanded).
|
|
pub async fn git_fetch_to_tag(dst: &Path, src: &Path, sha: &str, tag: &str) -> Result<String> {
|
|
let src_str = src.display().to_string();
|
|
let refspec = format!("{sha}:refs/tags/{tag}");
|
|
git(dst, &["fetch", "--no-tags", &src_str, &refspec]).await?;
|
|
git_rev_parse(dst, &format!("refs/tags/{tag}")).await
|
|
}
|
|
|
|
/// Resolve `refname` (a tag, branch, or sha) in `dir` to its full sha.
|
|
pub async fn git_rev_parse(dir: &Path, refname: &str) -> Result<String> {
|
|
let out = git_command()
|
|
.current_dir(dir)
|
|
.args(["rev-parse", refname])
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("git rev-parse {refname} in {}", dir.display()))?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"git rev-parse {refname} failed ({}): {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(String::from_utf8_lossy(&out.stdout).trim().to_owned())
|
|
}
|
|
|
|
/// Plant a lightweight tag at `target`. Errors if the tag already
|
|
/// exists — we want loud failures on id reuse, not silent
|
|
/// overwrites.
|
|
pub async fn git_tag(dir: &Path, name: &str, target: &str) -> Result<()> {
|
|
git(dir, &["tag", name, target]).await
|
|
}
|
|
|
|
/// Plant an annotated tag with `body` as the message. Used for
|
|
/// `failed/<id>` (body = build error) and `denied/<id>` (body =
|
|
/// operator note). Multi-line bodies handled via stdin so we don't
|
|
/// have to escape anything.
|
|
pub async fn git_tag_annotated(dir: &Path, name: &str, target: &str, body: &str) -> Result<()> {
|
|
use tokio::io::AsyncWriteExt;
|
|
// Annotated tags are git objects, so they need a tagger identity
|
|
// (same constraint as a commit). Pass the hive-c0re identity
|
|
// inline rather than relying on a global git config — applied
|
|
// repos are hive-c0re-owned and the host's user might not have
|
|
// user.email set.
|
|
let mut child = git_command()
|
|
.current_dir(dir)
|
|
.args([
|
|
"-c",
|
|
&format!("user.name={GIT_NAME}"),
|
|
"-c",
|
|
&format!("user.email={GIT_EMAIL}"),
|
|
"tag",
|
|
"-a",
|
|
name,
|
|
target,
|
|
"-F",
|
|
"-",
|
|
])
|
|
.stdin(std::process::Stdio::piped())
|
|
.stdout(std::process::Stdio::piped())
|
|
.stderr(std::process::Stdio::piped())
|
|
.spawn()
|
|
.with_context(|| format!("spawn git tag -a {name} in {}", dir.display()))?;
|
|
if let Some(mut stdin) = child.stdin.take() {
|
|
stdin
|
|
.write_all(body.as_bytes())
|
|
.await
|
|
.context("write tag body to git stdin")?;
|
|
// Drop closes stdin so git can finish reading.
|
|
drop(stdin);
|
|
}
|
|
let out = child.wait_with_output().await.context("wait git tag -a")?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"git tag -a {name} failed ({}): {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Replace working tree + index with the tree at `target` without
|
|
/// moving HEAD. `applied/main` stays pointing at the last known-good
|
|
/// `deployed/*` while we let `nixos-container update` evaluate the
|
|
/// candidate. On build failure callers reset back to HEAD; on
|
|
/// success they fast-forward main to `target`.
|
|
pub async fn git_read_tree_reset(dir: &Path, target: &str) -> Result<()> {
|
|
git(dir, &["read-tree", "--reset", "-u", target]).await
|
|
}
|
|
|
|
/// Hard-set a ref to `target`. Used to fast-forward `refs/heads/main`
|
|
/// to the just-deployed proposal commit. Uses `update-ref`, not
|
|
/// `branch -f`, so it works regardless of where HEAD currently sits.
|
|
pub async fn git_update_ref(dir: &Path, refname: &str, target: &str) -> Result<()> {
|
|
git(dir, &["update-ref", refname, target]).await
|
|
}
|
|
|
|
/// Write a systemd drop-in for `container@<container>.service` that applies
|
|
/// our default resource caps. Goes under `/run/systemd/system/...` so it's
|
|
/// ephemeral (regenerated on every spawn / rebuild).
|
|
fn set_resource_limits(container: &str) -> Result<()> {
|
|
let dir = format!("/run/systemd/system/container@{container}.service.d");
|
|
std::fs::create_dir_all(&dir).with_context(|| format!("create {dir}"))?;
|
|
let path = format!("{dir}/hyperhive-limits.conf");
|
|
let content =
|
|
format!("[Service]\nMemoryMax={DEFAULT_MEMORY_MAX}\nCPUQuota={DEFAULT_CPU_QUOTA}\n",);
|
|
std::fs::write(&path, content).with_context(|| format!("write {path}"))?;
|
|
tracing::info!(
|
|
%path,
|
|
memory_max = DEFAULT_MEMORY_MAX,
|
|
cpu_quota = DEFAULT_CPU_QUOTA,
|
|
"wrote resource limits drop-in"
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
async fn systemd_daemon_reload() -> Result<()> {
|
|
let out = Command::new("systemctl")
|
|
.arg("daemon-reload")
|
|
.output()
|
|
.await
|
|
.context("invoke systemctl daemon-reload")?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"systemctl daemon-reload failed ({}): {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
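// NOTE: the following paragraph documents `set_nspawn_flags` (defined after
// the mount-point constants), not the constant right below. It is kept as a
// plain comment so rustdoc does not attach it to that constant.
//
// Idempotently rewrite the lines in `/etc/nixos-containers/<container>.conf`
// that hive-c0re owns: `PRIVATE_NETWORK` (forced to 0 so the agent's web UI
// port is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the bind mounts).
// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
// `systemd-nspawn` command.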
|
|
/// Path inside the manager container where it sees its agents tree.
/// Matches the `/agents` path that pre-Phase-8 hosts declared via
/// `containers.hm1nd.bindMounts."/agents"`.
pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents";

/// Path inside the manager container where the applied trees of every agent
/// are visible, read-only. The manager runs
/// `git fetch /applied/<n>/.git refs/tags/*:refs/tags/applied/*` to learn
/// what hive-c0re deployed (or rejected, or failed to build); the RO bind
/// makes accidental writes from inside the container impossible.
pub const CONTAINER_MANAGER_APPLIED_MOUNT: &str = "/applied";

/// Host directory that is bind-mounted to `/agents` inside the manager.
/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (duplicated here
/// so lifecycle stays usable as a leaf module).
const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents";

/// Host root of the applied repos, mirrored read-only into the manager.
/// Matches `APPLIED_STATE_ROOT` in coordinator.rs.
const HOST_APPLIED_ROOT: &str = "/var/lib/hyperhive/applied";

/// Host root of the meta repo, mirrored read-only into the manager. Matches
/// `meta::meta_dir()`, duplicated here so lifecycle stays a leaf.
const HOST_META_ROOT: &str = "/var/lib/hyperhive/meta";

/// Host directory shared by all agents; every agent bind-mounts it
/// read-write.
const HOST_SHARED_ROOT: &str = "/var/lib/hyperhive/shared";
|
|
|
|
fn set_nspawn_flags(
|
|
container: &str,
|
|
runtime_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
) -> Result<()> {
|
|
// Ensure /shared directory exists before binding. systemd-nspawn requires the bind source to exist.
|
|
std::fs::create_dir_all(HOST_SHARED_ROOT)
|
|
.with_context(|| format!("create {HOST_SHARED_ROOT}"))?;
|
|
|
|
let path = format!("/etc/nixos-containers/{container}.conf");
|
|
let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;
|
|
|
|
// Compute the in-container state mount point. Sub-agents get
|
|
// /agents/<name>/state; the manager keeps the legacy /state path.
|
|
// Claude credentials always land at /root/.claude for all agents so
|
|
// the `claude` CLI (which reads $HOME/.claude) finds them without any
|
|
// HOME override.
|
|
let notes_mount = if container == MANAGER_NAME {
|
|
CONTAINER_NOTES_MOUNT.to_owned()
|
|
} else {
|
|
let agent_name = container.strip_prefix(AGENT_PREFIX).unwrap_or(container);
|
|
format!("/agents/{agent_name}/state")
|
|
};
|
|
let claude_mount = CONTAINER_CLAUDE_MOUNT;
|
|
|
|
let mut binds = format!(
|
|
"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{claude_mount} --bind={notes}:{notes_mount} --bind={shared}:{CONTAINER_SHARED_MOUNT}",
|
|
runtime = runtime_dir.display(),
|
|
claude = claude_dir.display(),
|
|
notes = notes_dir.display(),
|
|
shared = HOST_SHARED_ROOT,
|
|
);
|
|
if container == MANAGER_NAME {
|
|
use std::fmt::Write as _;
|
|
// systemd-nspawn refuses to start a container whose bind
|
|
// source doesn't exist. The meta repo is created by the
|
|
// startup migration, but make sure the directory is there
|
|
// before the manager comes up in case set_nspawn_flags fires
|
|
// first (e.g. cold start with no agents).
|
|
std::fs::create_dir_all(HOST_META_ROOT)
|
|
.with_context(|| format!("create {HOST_META_ROOT}"))?;
|
|
// Manager edits sub-agent proposed/ repos and its own. RW so it can
|
|
// git-commit. Sub-agents see only their own /run/hive socket and
|
|
// /root/.claude (no /agents or /applied).
|
|
//
|
|
// /applied is a separate RO mount of the hive-c0re-only applied
|
|
// repos so the manager can `git fetch /applied/<n>/.git
|
|
// refs/tags/*:refs/tags/applied/*` to mirror deployed/failed/
|
|
// denied tags into its proposed clones and diff against
|
|
// what's actually deployed. RO bind makes destructive git
|
|
// plumbing inside the container unable to corrupt applied.
|
|
//
|
|
// /meta is a third RO mount exposing the system-wide deploy
|
|
// flake (`git log /meta --oneline` shows every deploy across
|
|
// every agent; `cat /meta/flake.lock` resolves which sha each
|
|
// agent is pinned at right now).
|
|
let _ = write!(
|
|
binds,
|
|
" --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}",
|
|
);
|
|
let _ = write!(
|
|
binds,
|
|
" --bind-ro={HOST_APPLIED_ROOT}:{CONTAINER_MANAGER_APPLIED_MOUNT}",
|
|
);
|
|
let _ = write!(
|
|
binds,
|
|
" --bind-ro={HOST_META_ROOT}:{mount}",
|
|
mount = crate::meta::CONTAINER_MANAGER_META_MOUNT,
|
|
);
|
|
}
|
|
let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\"");
|
|
let mut lines: Vec<String> = original
|
|
.lines()
|
|
.filter(|line| {
|
|
let trimmed = line.trim_start();
|
|
// Strip any network-namespace knobs nixos-container's create
|
|
// might have populated. The start script adds `--network-veth`
|
|
// whenever HOST_ADDRESS / LOCAL_ADDRESS (or their IPv6 cousins)
|
|
// are non-empty — and veth implies a private netns, hiding our
|
|
// web-UI port from the host. Force host netns.
|
|
!trimmed.starts_with("EXTRA_NSPAWN_FLAGS=")
|
|
&& !trimmed.starts_with("PRIVATE_NETWORK=")
|
|
&& !trimmed.starts_with("HOST_ADDRESS=")
|
|
&& !trimmed.starts_with("LOCAL_ADDRESS=")
|
|
&& !trimmed.starts_with("HOST_ADDRESS6=")
|
|
&& !trimmed.starts_with("LOCAL_ADDRESS6=")
|
|
&& !trimmed.starts_with("HOST_BRIDGE=")
|
|
})
|
|
.map(str::to_owned)
|
|
.collect();
|
|
lines.push("PRIVATE_NETWORK=0".to_owned());
|
|
lines.push("HOST_ADDRESS=".to_owned());
|
|
lines.push("LOCAL_ADDRESS=".to_owned());
|
|
lines.push("HOST_ADDRESS6=".to_owned());
|
|
lines.push("LOCAL_ADDRESS6=".to_owned());
|
|
lines.push("HOST_BRIDGE=".to_owned());
|
|
lines.push(bind_flag);
|
|
let mut content = lines.join("\n");
|
|
content.push('\n');
|
|
std::fs::write(&path, content).with_context(|| format!("write {path}"))?;
|
|
tracing::info!(%path, "set PRIVATE_NETWORK=0 + EXTRA_NSPAWN_FLAGS");
|
|
Ok(())
|
|
}
|
|
|
|
/// Spawn `nixos-container <args>` and pipe its stdout + stderr into
|
|
/// `tracing` one line at a time so a long-running command (most
|
|
/// notably `update`, which kicks off a full nix build that can run
|
|
/// for minutes on a stale flake) shows progress in journald as it
|
|
/// happens. The buffered `.output()` we used before only flushed the
|
|
/// summary at exit, which made "slow" and "stuck" look identical to
|
|
/// the operator watching `journalctl -u hive-c0re -f`.
|
|
///
|
|
/// stdout lines log at INFO, stderr at WARN. Stderr lines are also
|
|
/// collected into a single string so the bailout message at the end
|
|
/// can include the actual failure reason (nix dumps eval errors to
|
|
/// stderr).
|
|
async fn run(args: &[&str]) -> Result<()> {
|
|
use tokio::io::{AsyncBufReadExt, BufReader};
|
|
let cmdline = args.join(" ");
|
|
let mut child = Command::new("nixos-container")
|
|
.args(args)
|
|
.stdout(std::process::Stdio::piped())
|
|
.stderr(std::process::Stdio::piped())
|
|
.spawn()
|
|
.with_context(|| format!("invoke nixos-container {cmdline}"))?;
|
|
|
|
let stdout = child.stdout.take().expect("piped stdout");
|
|
let stderr = child.stderr.take().expect("piped stderr");
|
|
|
|
let stdout_cmdline = cmdline.clone();
|
|
let pump_stdout = tokio::spawn(async move {
|
|
let mut lines = BufReader::new(stdout).lines();
|
|
while let Ok(Some(line)) = lines.next_line().await {
|
|
tracing::info!(target: "nixos-container", cmdline = %stdout_cmdline, "{line}");
|
|
}
|
|
});
|
|
|
|
// Tail of stderr lines (last 32) for the bailout message. Newer
|
|
// lines push older ones out; nix's actual error usually lands
|
|
// in the last few lines.
|
|
let stderr_cmdline = cmdline.clone();
|
|
let stderr_tail: std::sync::Arc<std::sync::Mutex<std::collections::VecDeque<String>>> =
|
|
std::sync::Arc::new(std::sync::Mutex::new(
|
|
std::collections::VecDeque::with_capacity(32),
|
|
));
|
|
let stderr_tail_pump = stderr_tail.clone();
|
|
let pump_stderr = tokio::spawn(async move {
|
|
let mut lines = BufReader::new(stderr).lines();
|
|
while let Ok(Some(line)) = lines.next_line().await {
|
|
tracing::warn!(target: "nixos-container", cmdline = %stderr_cmdline, "{line}");
|
|
let mut tail = stderr_tail_pump.lock().unwrap();
|
|
if tail.len() == 32 {
|
|
tail.pop_front();
|
|
}
|
|
tail.push_back(line);
|
|
}
|
|
});
|
|
|
|
let status = child
|
|
.wait()
|
|
.await
|
|
.with_context(|| format!("wait nixos-container {cmdline}"))?;
|
|
let _ = pump_stdout.await;
|
|
let _ = pump_stderr.await;
|
|
|
|
if !status.success() {
|
|
let tail = stderr_tail
|
|
.lock()
|
|
.unwrap()
|
|
.iter()
|
|
.cloned()
|
|
.collect::<Vec<_>>()
|
|
.join("\n");
|
|
bail!("nixos-container {cmdline} failed ({status}): {tail}");
|
|
}
|
|
Ok(())
|
|
}
|