set_nspawn_flags now adds --bind-ro=/var/lib/hyperhive/applied:/applied for the manager container alongside the existing /agents RW mount. The manager can git-fetch deployed/failed/denied tags out of /applied/<n>/.git to mirror them into its proposed clones; the read-only bind means git plumbing inside the container cannot corrupt the authoritative repos. Picked up by the next rebuild of hm1nd (no spawn-time change needed, since set_nspawn_flags runs on every spawn + rebuild).
718 lines
26 KiB
Rust
718 lines
26 KiB
Rust
//! `nixos-container` lifecycle + per-agent config flake generation.
|
|
|
|
use std::path::Path;
|
|
|
|
use anyhow::{Context, Result, bail};
|
|
use tokio::process::Command;
|
|
|
|
/// Sub-agent container prefix. Prepended to the agent name to form the
/// container name (the manager is the one exception, see `MANAGER_NAME`).
pub const AGENT_PREFIX: &str = "h-";

/// Longest container name `nixos-container` accepts: the name gets encoded
/// into network interface names, which caps it at 11 chars.
const MAX_CONTAINER_NAME: usize = 11;

/// Longest allowed sub-agent name: whatever is left of the container-name
/// budget once `AGENT_PREFIX` has been prepended. Derived rather than
/// hard-coded so it cannot drift if the prefix changes.
pub const MAX_AGENT_NAME: usize = MAX_CONTAINER_NAME - AGENT_PREFIX.len();

/// Container name of the manager. Lives in the same path scheme as sub-agents
/// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`),
/// but its container has no `h-` prefix and extends a different
/// nixosConfiguration (`manager`, not `agent-base`).
pub const MANAGER_NAME: &str = "hm1nd";

/// Web UI port reserved for the manager (sub-agents hash into 8100..8999).
pub const MANAGER_PORT: u16 = 8000;

/// Mount point of the per-agent runtime directory inside the container.
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";

/// Mount point of the per-agent Claude credentials dir inside the container.
/// Persistent across destroy/recreate so OAuth login survives.
pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude";

/// Mount point of the per-agent durable knowledge dir inside the container.
/// Agents are told (system prompt) to keep `notes.md` and any other scratch
/// state here; persists across destroy/recreate.
pub const CONTAINER_NOTES_MOUNT: &str = "/state";

/// Author name used for commits hive-c0re itself creates.
const GIT_NAME: &str = "hive-c0re";
/// Author e-mail used for commits hive-c0re itself creates.
const GIT_EMAIL: &str = "hive-c0re@hyperhive";

/// Sub-agent web UI port range. Deterministic from the agent's name (FNV-1a
/// hash mod range size), so the dashboard can compute the same port without
/// asking hive-c0re.
const WEB_PORT_BASE: u16 = 8100;
/// Number of ports in the sub-agent range starting at `WEB_PORT_BASE`.
const WEB_PORT_RANGE: u16 = 900;

/// Default resource caps applied to every managed container via a systemd
/// drop-in under `/run/systemd/system/container@<NAME>.service.d/`.
const DEFAULT_MEMORY_MAX: &str = "2G";
/// Default CPU quota written to the same drop-in as `DEFAULT_MEMORY_MAX`.
const DEFAULT_CPU_QUOTA: &str = "50%";
|
|
|
|
/// Per-agent web UI port. Manager is fixed at `MANAGER_PORT`; every
|
|
/// sub-agent is `WEB_PORT_BASE + FNV-1a(name) % WEB_PORT_RANGE`,
|
|
/// pure and reproducible from just the name. Collisions are
|
|
/// possible (birthday paradox at ~30 agents); the operator resolves
|
|
/// them by renaming an agent (different hash → different port).
|
|
/// Stable across hosts, restarts, and dashboard renders — no
|
|
/// state-file dance.
|
|
#[must_use]
|
|
pub fn agent_web_port(name: &str) -> u16 {
|
|
if name == MANAGER_NAME {
|
|
return MANAGER_PORT;
|
|
}
|
|
let mut hash: u32 = 2_166_136_261;
|
|
for b in name.bytes() {
|
|
hash ^= u32::from(b);
|
|
hash = hash.wrapping_mul(16_777_619);
|
|
}
|
|
// Modulo of a u32 by a u16's value is guaranteed < u16::MAX, so try_from never fails.
|
|
WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
|
|
}
|
|
|
|
#[must_use]
|
|
pub fn container_name(name: &str) -> String {
|
|
if name == MANAGER_NAME {
|
|
MANAGER_NAME.to_owned()
|
|
} else {
|
|
format!("{AGENT_PREFIX}{name}")
|
|
}
|
|
}
|
|
|
|
/// True when `name` is exactly the manager's name (`MANAGER_NAME`).
#[must_use]
pub fn is_manager(name: &str) -> bool {
    matches!(name, MANAGER_NAME)
}
|
|
|
|
/// The nixosConfiguration in the hyperhive flake the agent's `flake.nix`
|
|
/// extends. Manager → `manager`; everyone else → `agent-base`.
|
|
#[must_use]
|
|
pub fn flake_base(name: &str) -> &'static str {
|
|
if is_manager(name) {
|
|
"manager"
|
|
} else {
|
|
"agent-base"
|
|
}
|
|
}
|
|
|
|
fn validate(name: &str) -> Result<()> {
|
|
if name.is_empty() {
|
|
bail!("agent name must not be empty");
|
|
}
|
|
if is_manager(name) {
|
|
return Ok(());
|
|
}
|
|
if name.len() > MAX_AGENT_NAME {
|
|
bail!(
|
|
"agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}",
|
|
name.len()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// First name (≠ `self_name`) currently running whose hashed port
|
|
/// matches this agent's. The harness inside the colliding container
|
|
/// would otherwise loop on `AddrInUse` forever; we surface the
|
|
/// conflict here so spawn / rebuild fails loudly with an actionable
|
|
/// message instead.
|
|
async fn port_collision(self_name: &str) -> Option<String> {
|
|
let port = agent_web_port(self_name);
|
|
let raw = list().await.unwrap_or_default();
|
|
for c in raw {
|
|
let other = if c == MANAGER_NAME {
|
|
MANAGER_NAME.to_owned()
|
|
} else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
|
|
n.to_owned()
|
|
} else {
|
|
continue;
|
|
};
|
|
if other == self_name {
|
|
continue;
|
|
}
|
|
if agent_web_port(&other) == port && is_running(&other).await {
|
|
return Some(other);
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub async fn spawn(
|
|
name: &str,
|
|
hyperhive_flake: &str,
|
|
agent_dir: &Path,
|
|
proposed_dir: &Path,
|
|
applied_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
dashboard_port: u16,
|
|
) -> Result<()> {
|
|
validate(name)?;
|
|
if let Some(other) = port_collision(name).await {
|
|
bail!(
|
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
|
agent_web_port(name)
|
|
);
|
|
}
|
|
setup_proposed(proposed_dir, name).await?;
|
|
setup_applied(
|
|
applied_dir,
|
|
Some(proposed_dir),
|
|
name,
|
|
hyperhive_flake,
|
|
dashboard_port,
|
|
)
|
|
.await?;
|
|
ensure_claude_dir(claude_dir)?;
|
|
ensure_state_dir(notes_dir)?;
|
|
let container = container_name(name);
|
|
let flake_ref = format!("{}#default", applied_dir.display());
|
|
run(&["create", &container, "--flake", &flake_ref]).await?;
|
|
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
|
|
set_resource_limits(&container)?;
|
|
systemd_daemon_reload().await?;
|
|
run(&["start", &container]).await
|
|
}
|
|
|
|
pub async fn kill(name: &str) -> Result<()> {
|
|
validate(name)?;
|
|
let container = container_name(name);
|
|
run(&["stop", &container]).await
|
|
}
|
|
|
|
pub async fn start(name: &str) -> Result<()> {
|
|
validate(name)?;
|
|
let container = container_name(name);
|
|
run(&["start", &container]).await
|
|
}
|
|
|
|
/// Stop + start without regenerating any config. For "kick the container"
|
|
/// without touching the flake or nspawn flags.
|
|
pub async fn restart(name: &str) -> Result<()> {
|
|
kill(name).await?;
|
|
start(name).await
|
|
}
|
|
|
|
/// True when the container's systemd unit is active. Used by the dashboard
|
|
/// to gate stop/restart buttons.
|
|
pub async fn is_running(name: &str) -> bool {
|
|
let container = container_name(name);
|
|
let unit = format!("container@{container}.service");
|
|
Command::new("systemctl")
|
|
.args(["is-active", "--quiet", &unit])
|
|
.status()
|
|
.await
|
|
.map(|s| s.success())
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
/// Fully tear down a sub-agent's container: stop + remove via `nixos-container
|
|
/// destroy`, then clean our own systemd drop-in. Leaves it to the caller to
|
|
/// wipe `/var/lib/hyperhive/...` state and the per-agent runtime dir.
|
|
pub async fn destroy(name: &str) -> Result<()> {
|
|
validate(name)?;
|
|
let container = container_name(name);
|
|
// nixos-container destroy handles stop + removal of /var/lib/nixos-containers/<C>
|
|
// and /etc/nixos-containers/<C>.conf. Tolerate "no such container".
|
|
if let Err(e) = run(&["destroy", &container]).await {
|
|
tracing::warn!(error = ?e, "nixos-container destroy returned an error; continuing cleanup");
|
|
}
|
|
let dropin_dir = format!("/run/systemd/system/container@{container}.service.d");
|
|
if std::path::Path::new(&dropin_dir).exists() {
|
|
std::fs::remove_dir_all(&dropin_dir).with_context(|| format!("remove {dropin_dir}"))?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
pub async fn rebuild(
|
|
name: &str,
|
|
hyperhive_flake: &str,
|
|
agent_dir: &Path,
|
|
applied_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
dashboard_port: u16,
|
|
) -> Result<()> {
|
|
validate(name)?;
|
|
if let Some(other) = port_collision(name).await {
|
|
bail!(
|
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
|
agent_web_port(name)
|
|
);
|
|
}
|
|
setup_applied(applied_dir, None, name, hyperhive_flake, dashboard_port).await?;
|
|
ensure_claude_dir(claude_dir)?;
|
|
ensure_state_dir(notes_dir)?;
|
|
let container = container_name(name);
|
|
let flake_ref = format!("{}#default", applied_dir.display());
|
|
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
|
|
set_resource_limits(&container)?;
|
|
systemd_daemon_reload().await?;
|
|
run(&["update", &container, "--flake", &flake_ref]).await?;
|
|
// Restart so any nspawn-level changes (bind mounts, networking, etc.) apply.
|
|
run(&["stop", &container]).await?;
|
|
run(&["start", &container]).await
|
|
}
|
|
|
|
pub async fn list() -> Result<Vec<String>> {
|
|
let out = Command::new("nixos-container")
|
|
.arg("list")
|
|
.output()
|
|
.await
|
|
.context("invoke nixos-container list")?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"nixos-container list exited with status {}: {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(String::from_utf8_lossy(&out.stdout)
|
|
.lines()
|
|
.map(str::trim)
|
|
.filter(|line| line.starts_with(AGENT_PREFIX) || *line == MANAGER_NAME)
|
|
.map(str::to_owned)
|
|
.collect())
|
|
}
|
|
|
|
/// Initialize the manager-editable proposed repo. Contains only `agent.nix`
|
|
/// (the file the manager edits). Touched by hive-c0re only on first spawn —
|
|
/// never again — so the manager can't be surprised by hive-c0re commits or
|
|
/// working-tree resets.
|
|
pub async fn setup_proposed(proposed_dir: &Path, name: &str) -> Result<()> {
|
|
if proposed_dir.join(".git").exists() {
|
|
return Ok(());
|
|
}
|
|
std::fs::create_dir_all(proposed_dir)
|
|
.with_context(|| format!("create {}", proposed_dir.display()))?;
|
|
let agent_path = proposed_dir.join("agent.nix");
|
|
if !agent_path.exists() {
|
|
std::fs::write(&agent_path, initial_agent_nix(name))
|
|
.with_context(|| format!("write {}", agent_path.display()))?;
|
|
}
|
|
git(proposed_dir, &["init", "--initial-branch=main"]).await?;
|
|
git(proposed_dir, &["add", "agent.nix"]).await?;
|
|
git_commit(proposed_dir, "hive-c0re init").await?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Set up the applied repo. Two responsibilities:
|
|
/// - First-spawn only: init the repo, pull proposed's initial commit
|
|
/// in via `git fetch`, tag it `deployed/0`. This is the *only* time
|
|
/// hive-c0re reads from `proposed` for an agent — subsequent
|
|
/// proposals are fetched at `request_apply_commit` time and tagged
|
|
/// `proposal/<id>` (see `actions::approve` for the tag state
|
|
/// machine).
|
|
/// - Every call: regenerate the untracked `flake.nix` so flake-url /
|
|
/// dashboard-port changes pick up on rebuild without churning the
|
|
/// git log.
|
|
///
|
|
/// `proposed_dir` is `None` on rebuild paths that just want the flake
|
|
/// refreshed.
|
|
pub async fn setup_applied(
|
|
applied_dir: &Path,
|
|
proposed_dir: Option<&Path>,
|
|
name: &str,
|
|
hyperhive_flake: &str,
|
|
dashboard_port: u16,
|
|
) -> Result<()> {
|
|
std::fs::create_dir_all(applied_dir)
|
|
.with_context(|| format!("create {}", applied_dir.display()))?;
|
|
|
|
// 1. First-spawn git init from proposed (or pre-overhaul detection).
|
|
if !applied_dir.join(".git").exists() {
|
|
let Some(proposed) = proposed_dir else {
|
|
bail!(
|
|
"applied repo at {} is missing its .git directory; \
|
|
cannot rebuild without a proposed source to seed from. \
|
|
destroy --purge and re-spawn this agent.",
|
|
applied_dir.display()
|
|
);
|
|
};
|
|
git(applied_dir, &["init", "--initial-branch=main"]).await?;
|
|
let proposed_str = proposed.display().to_string();
|
|
git(
|
|
applied_dir,
|
|
&["fetch", "--no-tags", &proposed_str, "main:refs/heads/main"],
|
|
)
|
|
.await?;
|
|
git_read_tree_reset(applied_dir, "refs/heads/main").await?;
|
|
git_tag(applied_dir, "deployed/0", "refs/heads/main").await?;
|
|
} else if git_rev_parse(applied_dir, "refs/tags/deployed/0")
|
|
.await
|
|
.is_err()
|
|
{
|
|
// Pre-overhaul applied repo — agent.nix is tracked directly,
|
|
// commits authored by hive-c0re, no deployed/* tag scheme.
|
|
// No in-place migration; fail loudly so the operator purges.
|
|
bail!(
|
|
"applied repo at {} predates the tag-driven config flow. \
|
|
Run `hive-c0re destroy --purge {name}` and re-spawn.",
|
|
applied_dir.display()
|
|
);
|
|
}
|
|
|
|
// 2. (Re)write the untracked wrapper flake. Tracked files in the
|
|
// working tree (agent.nix and anything the manager committed) are
|
|
// untouched.
|
|
let port = agent_web_port(name);
|
|
let base = flake_base(name);
|
|
let service = if is_manager(name) {
|
|
"hive-m1nd"
|
|
} else {
|
|
"hive-ag3nt"
|
|
};
|
|
let description = if is_manager(name) {
|
|
format!("hyperhive manager {name}")
|
|
} else {
|
|
format!("hyperhive sub-agent {name}")
|
|
};
|
|
let flake_body = format!(
|
|
r#"{{
|
|
description = "{description}";
|
|
inputs.hyperhive.url = "{hyperhive_flake}";
|
|
outputs =
|
|
{{ hyperhive, ... }}:
|
|
{{
|
|
nixosConfigurations.default = hyperhive.nixosConfigurations.{base}.extendModules {{
|
|
modules = [
|
|
./agent.nix
|
|
{{
|
|
programs.git.config.user = {{
|
|
name = "{name}";
|
|
email = "{name}@hyperhive";
|
|
}};
|
|
systemd.services.{service}.environment = {{
|
|
HIVE_PORT = "{port}";
|
|
HIVE_LABEL = "{name}";
|
|
HIVE_DASHBOARD_PORT = "{dashboard_port}";
|
|
}};
|
|
}}
|
|
];
|
|
}};
|
|
}};
|
|
}}
|
|
"#,
|
|
);
|
|
std::fs::write(applied_dir.join("flake.nix"), flake_body)
|
|
.with_context(|| format!("write {}/flake.nix", applied_dir.display()))?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Create the per-agent Claude credentials dir if missing. Mode 0700 — only
|
|
/// root inside the container reads/writes it. Idempotent: existing dirs are
|
|
/// left untouched (an agent's OAuth tokens survive `destroy`/recreate).
|
|
fn ensure_claude_dir(claude_dir: &Path) -> Result<()> {
|
|
if !claude_dir.exists() {
|
|
std::fs::create_dir_all(claude_dir)
|
|
.with_context(|| format!("create {}", claude_dir.display()))?;
|
|
#[cfg(unix)]
|
|
{
|
|
use std::os::unix::fs::PermissionsExt;
|
|
std::fs::set_permissions(claude_dir, std::fs::Permissions::from_mode(0o700))
|
|
.with_context(|| format!("chmod {}", claude_dir.display()))?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn ensure_state_dir(notes_dir: &Path) -> Result<()> {
|
|
if !notes_dir.exists() {
|
|
std::fs::create_dir_all(notes_dir)
|
|
.with_context(|| format!("create {}", notes_dir.display()))?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Contents of the `agent.nix` a freshly created proposed repo starts with:
/// an empty NixOS module whose comment names the agent it belongs to.
fn initial_agent_nix(name: &str) -> String {
    let module = format!(
        "{{ ... }}:\n{{\n # Per-agent overrides for {name}. The manager edits this\n # file (and commits) to customise the agent's NixOS config.\n}}\n",
    );
    module
}
|
|
|
|
async fn git_commit(dir: &Path, message: &str) -> Result<()> {
|
|
git(
|
|
dir,
|
|
&[
|
|
"-c",
|
|
&format!("user.name={GIT_NAME}"),
|
|
"-c",
|
|
&format!("user.email={GIT_EMAIL}"),
|
|
"commit",
|
|
"-m",
|
|
message,
|
|
],
|
|
)
|
|
.await
|
|
}
|
|
|
|
/// Spawn `git` honoring the `HYPERHIVE_GIT` env var (absolute path baked in
|
|
/// by the NixOS module), falling back to bare `git` (PATH lookup) otherwise.
|
|
#[must_use]
|
|
pub fn git_command() -> Command {
|
|
let exe = std::env::var("HYPERHIVE_GIT").unwrap_or_else(|_| "git".into());
|
|
Command::new(exe)
|
|
}
|
|
|
|
async fn git(dir: &Path, args: &[&str]) -> Result<()> {
|
|
let out = git_command()
|
|
.current_dir(dir)
|
|
.args(args)
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"git {} failed ({}): {}",
|
|
args.join(" "),
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Fetch `sha` from the `src` git repo into `dst` and pin it as
|
|
/// `refs/tags/<tag>`. Used at request_apply_commit time so hive-c0re
|
|
/// captures an immutable handle on the manager's commit; subsequent
|
|
/// amendments / force-pushes in `src` no longer affect what gets
|
|
/// built. Returns the resolved sha (which equals `sha` on success
|
|
/// but normalised — short shas get expanded).
|
|
#[allow(dead_code)] // wired up by manager_server in a later commit
|
|
pub async fn git_fetch_to_tag(dst: &Path, src: &Path, sha: &str, tag: &str) -> Result<String> {
|
|
let src_str = src.display().to_string();
|
|
let refspec = format!("{sha}:refs/tags/{tag}");
|
|
git(dst, &["fetch", "--no-tags", &src_str, &refspec]).await?;
|
|
git_rev_parse(dst, &format!("refs/tags/{tag}")).await
|
|
}
|
|
|
|
/// Resolve `refname` (a tag, branch, or sha) in `dir` to its full sha.
|
|
#[allow(dead_code)]
|
|
pub async fn git_rev_parse(dir: &Path, refname: &str) -> Result<String> {
|
|
let out = git_command()
|
|
.current_dir(dir)
|
|
.args(["rev-parse", refname])
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("git rev-parse {refname} in {}", dir.display()))?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"git rev-parse {refname} failed ({}): {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(String::from_utf8_lossy(&out.stdout).trim().to_owned())
|
|
}
|
|
|
|
/// Plant a lightweight tag at `target`. Errors if the tag already
|
|
/// exists — we want loud failures on id reuse, not silent
|
|
/// overwrites.
|
|
#[allow(dead_code)]
|
|
pub async fn git_tag(dir: &Path, name: &str, target: &str) -> Result<()> {
|
|
git(dir, &["tag", name, target]).await
|
|
}
|
|
|
|
/// Plant an annotated tag with `body` as the message. Used for
|
|
/// `failed/<id>` (body = build error) and `denied/<id>` (body =
|
|
/// operator note). Multi-line bodies handled via stdin so we don't
|
|
/// have to escape anything.
|
|
#[allow(dead_code)]
|
|
pub async fn git_tag_annotated(dir: &Path, name: &str, target: &str, body: &str) -> Result<()> {
|
|
use tokio::io::AsyncWriteExt;
|
|
let mut child = git_command()
|
|
.current_dir(dir)
|
|
.args(["tag", "-a", name, target, "-F", "-"])
|
|
.stdin(std::process::Stdio::piped())
|
|
.stdout(std::process::Stdio::piped())
|
|
.stderr(std::process::Stdio::piped())
|
|
.spawn()
|
|
.with_context(|| format!("spawn git tag -a {name} in {}", dir.display()))?;
|
|
if let Some(mut stdin) = child.stdin.take() {
|
|
stdin
|
|
.write_all(body.as_bytes())
|
|
.await
|
|
.context("write tag body to git stdin")?;
|
|
// Drop closes stdin so git can finish reading.
|
|
drop(stdin);
|
|
}
|
|
let out = child.wait_with_output().await.context("wait git tag -a")?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"git tag -a {name} failed ({}): {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Replace working tree + index with the tree at `target` without
|
|
/// moving HEAD. `applied/main` stays pointing at the last known-good
|
|
/// `deployed/*` while we let `nixos-container update` evaluate the
|
|
/// candidate. On build failure callers reset back to HEAD; on
|
|
/// success they fast-forward main to `target`.
|
|
#[allow(dead_code)]
|
|
pub async fn git_read_tree_reset(dir: &Path, target: &str) -> Result<()> {
|
|
git(dir, &["read-tree", "--reset", "-u", target]).await
|
|
}
|
|
|
|
/// Hard-set a ref to `target`. Used to fast-forward `refs/heads/main`
|
|
/// to the just-deployed proposal commit. Uses `update-ref`, not
|
|
/// `branch -f`, so it works regardless of where HEAD currently sits.
|
|
#[allow(dead_code)]
|
|
pub async fn git_update_ref(dir: &Path, refname: &str, target: &str) -> Result<()> {
|
|
git(dir, &["update-ref", refname, target]).await
|
|
}
|
|
|
|
/// Write a systemd drop-in for `container@<container>.service` that applies
|
|
/// our default resource caps. Goes under `/run/systemd/system/...` so it's
|
|
/// ephemeral (regenerated on every spawn / rebuild).
|
|
fn set_resource_limits(container: &str) -> Result<()> {
|
|
let dir = format!("/run/systemd/system/container@{container}.service.d");
|
|
std::fs::create_dir_all(&dir).with_context(|| format!("create {dir}"))?;
|
|
let path = format!("{dir}/hyperhive-limits.conf");
|
|
let content =
|
|
format!("[Service]\nMemoryMax={DEFAULT_MEMORY_MAX}\nCPUQuota={DEFAULT_CPU_QUOTA}\n",);
|
|
std::fs::write(&path, content).with_context(|| format!("write {path}"))?;
|
|
tracing::info!(
|
|
%path,
|
|
memory_max = DEFAULT_MEMORY_MAX,
|
|
cpu_quota = DEFAULT_CPU_QUOTA,
|
|
"wrote resource limits drop-in"
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
async fn systemd_daemon_reload() -> Result<()> {
|
|
let out = Command::new("systemctl")
|
|
.arg("daemon-reload")
|
|
.output()
|
|
.await
|
|
.context("invoke systemctl daemon-reload")?;
|
|
if !out.status.success() {
|
|
bail!(
|
|
"systemctl daemon-reload failed ({}): {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Idempotently rewrite the lines in `/etc/nixos-containers/<container>.conf`
|
|
/// that hive-c0re owns: `PRIVATE_NETWORK` (forced 0 so the agent's web UI port
|
|
/// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind).
|
|
/// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
|
|
/// `systemd-nspawn` command.
|
|
/// Where in the container's filesystem the manager sees its agents tree.
|
|
/// Matches the `/agents` path that pre-Phase-8 hosts declared via
|
|
/// `containers.hm1nd.bindMounts."/agents"`.
|
|
pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents";
|
|
|
|
/// Where the manager sees the applied trees of every agent, read-only.
|
|
/// Manager runs `git fetch /applied/<n>/.git refs/tags/*:refs/tags/applied/*`
|
|
/// to learn what hive-c0re deployed (or rejected, or failed to
|
|
/// build); the RO bind makes accidental writes impossible from
|
|
/// inside the container.
|
|
pub const CONTAINER_MANAGER_APPLIED_MOUNT: &str = "/applied";
|
|
|
|
/// The on-host root that gets bind-mounted to `/agents` inside the manager.
|
|
/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (kept duplicated
|
|
/// here so lifecycle stays usable as a leaf module).
|
|
const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents";
|
|
|
|
/// On-host applied repo root, mirrored RO into the manager. Matches
|
|
/// `APPLIED_STATE_ROOT` in coordinator.rs.
|
|
const HOST_APPLIED_ROOT: &str = "/var/lib/hyperhive/applied";
|
|
|
|
fn set_nspawn_flags(
|
|
container: &str,
|
|
runtime_dir: &Path,
|
|
claude_dir: &Path,
|
|
notes_dir: &Path,
|
|
) -> Result<()> {
|
|
let path = format!("/etc/nixos-containers/{container}.conf");
|
|
let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;
|
|
let mut binds = format!(
|
|
"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT} --bind={notes}:{CONTAINER_NOTES_MOUNT}",
|
|
runtime = runtime_dir.display(),
|
|
claude = claude_dir.display(),
|
|
notes = notes_dir.display(),
|
|
);
|
|
if container == MANAGER_NAME {
|
|
// Manager edits sub-agent proposed/ repos and its own. RW so it can
|
|
// git-commit. Sub-agents see only their own /run/hive socket and
|
|
// /root/.claude (no /agents or /applied).
|
|
//
|
|
// /applied is a separate RO mount of the hive-c0re-only applied
|
|
// repos so the manager can `git fetch /applied/<n>/.git
|
|
// refs/tags/*:refs/tags/applied/*` to mirror deployed/failed/
|
|
// denied tags into its proposed clones and diff against
|
|
// what's actually deployed. RO bind makes destructive git
|
|
// plumbing inside the container unable to corrupt applied.
|
|
use std::fmt::Write as _;
|
|
let _ = write!(
|
|
binds,
|
|
" --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}",
|
|
);
|
|
let _ = write!(
|
|
binds,
|
|
" --bind-ro={HOST_APPLIED_ROOT}:{CONTAINER_MANAGER_APPLIED_MOUNT}",
|
|
);
|
|
}
|
|
let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\"");
|
|
let mut lines: Vec<String> = original
|
|
.lines()
|
|
.filter(|line| {
|
|
let trimmed = line.trim_start();
|
|
// Strip any network-namespace knobs nixos-container's create
|
|
// might have populated. The start script adds `--network-veth`
|
|
// whenever HOST_ADDRESS / LOCAL_ADDRESS (or their IPv6 cousins)
|
|
// are non-empty — and veth implies a private netns, hiding our
|
|
// web-UI port from the host. Force host netns.
|
|
!trimmed.starts_with("EXTRA_NSPAWN_FLAGS=")
|
|
&& !trimmed.starts_with("PRIVATE_NETWORK=")
|
|
&& !trimmed.starts_with("HOST_ADDRESS=")
|
|
&& !trimmed.starts_with("LOCAL_ADDRESS=")
|
|
&& !trimmed.starts_with("HOST_ADDRESS6=")
|
|
&& !trimmed.starts_with("LOCAL_ADDRESS6=")
|
|
&& !trimmed.starts_with("HOST_BRIDGE=")
|
|
})
|
|
.map(str::to_owned)
|
|
.collect();
|
|
lines.push("PRIVATE_NETWORK=0".to_owned());
|
|
lines.push("HOST_ADDRESS=".to_owned());
|
|
lines.push("LOCAL_ADDRESS=".to_owned());
|
|
lines.push("HOST_ADDRESS6=".to_owned());
|
|
lines.push("LOCAL_ADDRESS6=".to_owned());
|
|
lines.push("HOST_BRIDGE=".to_owned());
|
|
lines.push(bind_flag);
|
|
let mut content = lines.join("\n");
|
|
content.push('\n');
|
|
std::fs::write(&path, content).with_context(|| format!("write {path}"))?;
|
|
tracing::info!(%path, "set PRIVATE_NETWORK=0 + EXTRA_NSPAWN_FLAGS");
|
|
Ok(())
|
|
}
|
|
|
|
async fn run(args: &[&str]) -> Result<()> {
|
|
let out = Command::new("nixos-container")
|
|
.args(args)
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("invoke nixos-container {}", args.join(" ")))?;
|
|
let stdout = String::from_utf8_lossy(&out.stdout);
|
|
let stderr = String::from_utf8_lossy(&out.stderr);
|
|
if !stdout.trim().is_empty() {
|
|
tracing::info!(target: "nixos-container", "{}", stdout.trim());
|
|
}
|
|
if !stderr.trim().is_empty() {
|
|
tracing::warn!(target: "nixos-container", "{}", stderr.trim());
|
|
}
|
|
if !out.status.success() {
|
|
bail!(
|
|
"nixos-container {} failed ({}): {}",
|
|
args.join(" "),
|
|
out.status,
|
|
stderr.trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|