hyperhive/hive-c0re/src/meta.rs
müde 266c2c7a77 dashboard: meta flake inputs UI + sequential rebuild loop
new section 'M3T4 1NPUTS' between approvals and message flow:
one row per input in meta/flake.lock (hyperhive first, then
agent-<n> alphabetically). each row shows the input name, the
first 12 chars of the locked sha, a relative timestamp from
locked.lastModified, and the original.url when available.
checkbox per row; submit button is disabled until at least one
box is checked; submitting confirms then POSTs the selected
names to /meta-update.

backend:
- meta::lock_update(inputs: &[String]) — runs 'nix flake update
  <names>' in the meta dir, commits the lock change with a
  combined message ('lock update: hyperhive, agent-coder').
  preserves the existing META_LOCK serialization. existing
  lock_update_for_rebuild / lock_update_hyperhive stay for
  their single-input callers.
- POST /meta-update — comma-separated 'inputs' form field
  (JS joins checkboxes since axum::Form doesn't natively
  decode repeated keys); spawns a background task that runs
  the lock update + per-agent rebuild loop. hyperhive
  selection fans out to all agents; agent-<n> selection only
  rebuilds <n>. each rebuild fires Rebuilt to the manager
  exactly like dashboard / admin-CLI / auto-update.

rebuild loop is sequential — auto_update::run too (was
parallel via tokio::spawn). parallel rebuilds collide on
nix-store's sqlite cache ('sqlite db busy, not using cache')
and the meta META_LOCK contention. nix-daemon serializes the
heavy build steps anyway, so this isn't a throughput loss.
2026-05-16 03:38:07 +02:00

372 lines
13 KiB
Rust

//! Single hive-c0re-owned flake at `/var/lib/hyperhive/meta/` that
//! consumes every agent's applied repo as a flake input and exports one
//! `nixosConfiguration` per agent. Containers run against
//! `--flake /var/lib/hyperhive/meta#<name>`; lifecycle ops here drive the
//! lock file so meta's git log is the system-wide deploy audit trail.
//!
//! Flow:
//! - `sync_agents` (idempotent) — render `flake.nix` for the current
//! agent set, init the repo on first call, relock if the rendered
//! contents changed, commit. Used by spawn / destroy / startup
//! migration.
//! - `prepare_deploy` + `finalize_deploy` / `abort_deploy` — two-phase
//! for the `request_apply_commit` path so a failed
//! `nixos-container update` leaves no orphan commit in meta. Prepare
//! writes the new lock without committing; finalize commits with the
//! deploy message; abort `git restore`s the lock back.
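//! - `lock_update_hyperhive` — one-shot for the auto-update path.
//! - `lock_update_for_rebuild` — one-shot for the manual-rebuild path;
//!   relocks a single `agent-<name>` input and commits if the lock moved.
//! - `lock_update` — one-shot for the dashboard's "update meta inputs"
//!   form; relocks several named inputs and lands one combined commit.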
use std::path::{Path, PathBuf};
use anyhow::{Context, Result, bail};
use tokio::process::Command;
use tokio::sync::Mutex;
use crate::lifecycle;
/// Filesystem location of the meta flake repo. `meta_dir` returns this
/// as a `PathBuf`; every git / nix invocation in this module runs with
/// it as the working directory.
const META_ROOT: &str = "/var/lib/hyperhive/meta";
/// Parent directory of the per-agent applied repos. `render_flake`
/// points each `agent-<name>` input at `git+file://<APPLIED_ROOT>/<name>`.
const APPLIED_ROOT: &str = "/var/lib/hyperhive/applied";
/// `user.name` passed via `git -c` by `git_commit` for every meta commit.
const GIT_NAME: &str = "c0re";
/// `user.email` passed via `git -c` by `git_commit` for every meta commit.
const GIT_EMAIL: &str = "c0re@hyperhive";
/// Single-writer lock around every meta-repo operation. Git isn't
/// safe to drive from concurrent processes against the same `.git/`
/// — two simultaneous `git add` / `commit` invocations race on
/// `.git/index.lock`; if either dies before releasing, the lock
/// sticks and the next operation hits "another git process seems to
/// be running" until somebody `rm`s it manually. Holding this mutex
/// across each public function's git+nix calls makes parallel
/// rebuilds (`auto_update` + dashboard-triggered + apply-commit)
/// take turns instead of colliding.
///
/// This is an in-process async mutex, so it only serializes callers
/// inside this process. Each public function acquires it on its own:
/// `prepare_deploy` and `finalize_deploy` / `abort_deploy` take it
/// separately, so it is *not* held between the two deploy phases.
static META_LOCK: Mutex<()> = Mutex::const_new(());
/// Where the manager sees this directory inside its container (RO bind).
#[allow(dead_code)] // wired up by set_nspawn_flags in a follow-up commit
pub const CONTAINER_MANAGER_META_MOUNT: &str = "/meta";
/// Description of one agent as consumed by `render_flake`: each spec
/// yields one `agent-<name>` flake input and one `nixosConfigurations`
/// entry in the generated `flake.nix`.
#[derive(Debug, Clone)]
pub struct AgentSpec {
/// Agent name. Used for the `agent-<name>` input, the applied-repo
/// path, the `nixosConfigurations.<name>` attribute, and the
/// `HIVE_LABEL` / git identity inside the generated module.
pub name: String,
/// Selects the `manager` base configuration and the `hive-m1nd`
/// service instead of `agent-base` / `hive-ag3nt`.
pub is_manager: bool,
/// Rendered into the agent service's `HIVE_PORT` environment variable.
pub port: u16,
}
#[must_use]
pub fn meta_dir() -> PathBuf {
PathBuf::from(META_ROOT)
}
/// Bring the meta repo in line with the given agent set.
///
/// Renders `flake.nix` from the arguments and compares it with the file
/// on disk. When the repo already exists (`.git` present) and the
/// rendered text is unchanged, nothing happens. Otherwise the file is
/// written, the repo is initialised if this is the first call,
/// `nix flake lock` is run, and `flake.nix` plus `flake.lock` are landed
/// in a single commit.
///
/// # Errors
///
/// Fails if the meta directory cannot be created, `flake.nix` cannot be
/// written, or any of the git / nix invocations fails.
#[allow(dead_code)] // first caller lands in a later commit
pub async fn sync_agents(
    hyperhive_flake: &str,
    dashboard_port: u16,
    operator_pronouns: &str,
    agents: &[AgentSpec],
) -> Result<()> {
    let _guard = META_LOCK.lock().await;
    let root = meta_dir();
    std::fs::create_dir_all(&root).with_context(|| format!("create {}", root.display()))?;

    let rendered = render_flake(hyperhive_flake, dashboard_port, operator_pronouns, agents);
    let flake_nix = root.join("flake.nix");
    // A missing or unreadable flake.nix reads as the empty string, which
    // never equals a rendered flake, so it always triggers a rewrite.
    let current = std::fs::read_to_string(&flake_nix).unwrap_or_default();
    let first_run = !root.join(".git").exists();
    if current == rendered && !first_run {
        return Ok(());
    }

    std::fs::write(&flake_nix, &rendered)
        .with_context(|| format!("write {}", flake_nix.display()))?;
    if first_run {
        git(&root, &["init", "--initial-branch=main"]).await?;
    }

    // flake.nix has to be in the index *before* locking. With meta being
    // a git repo, nix reads it as a `git+file://` self-reference whose
    // dirty-tree fetcher sees tracked + staged entries but not untracked
    // files; an unstaged flake.nix shows up as "source tree does not
    // contain '/flake.nix'". Locking after the stage lets flake.nix and
    // flake.lock land together in one commit per change.
    git(&root, &["add", "flake.nix"]).await?;
    nix(&root, &["flake", "lock"]).await?;
    if root.join("flake.lock").exists() {
        git(&root, &["add", "flake.lock"]).await?;
    }

    let message = if first_run {
        format!("seed meta from {} agent(s)", agents.len())
    } else {
        "regenerate meta flake".to_owned()
    };
    git_commit(&root, &message).await
}
/// First phase of an apply-commit deploy.
///
/// Runs `nix flake update agent-<name>` in the meta repo and then
/// **stages** (but does not commit) `flake.lock`, so that a following
/// `nixos-container update --flake meta#<n>` — which reads the repo via
/// `git+file://` — picks the new rev up from the index. The commit is
/// left to `finalize_deploy` on build success; `abort_deploy` drops the
/// staged change on failure, keeping meta history to successful deploys.
///
/// # Errors
///
/// Fails if the nix update or the `git add` fails.
#[allow(dead_code)] // wired up by actions::run_apply_commit in a later commit
pub async fn prepare_deploy(name: &str) -> Result<()> {
    let _guard = META_LOCK.lock().await;
    let meta = meta_dir();
    let flake_input = format!("agent-{name}");
    nix(&meta, &["flake", "update", flake_input.as_str()]).await?;
    // The dirty-tree fetcher behind git+file:// reads index entries, so
    // staging alone is enough for the upcoming nixos-container update to
    // see the bumped rev.
    git(&meta, &["add", "flake.lock"]).await?;
    Ok(())
}
/// Phase 2-success. Commit the staged lock with the deployed tag +
/// sha as the message (`deploy <name> <tag> <short-sha>`). No-op when
/// the rev was already at the right place (nothing staged → nothing to
/// commit).
///
/// The sha is shortened to its first 12 bytes for the message. A sha
/// shorter than that — or one where byte 12 is not a character
/// boundary — is used in full rather than panicking on the slice.
///
/// # Errors
///
/// Fails if the staged-change probe or the commit fails.
#[allow(dead_code)]
pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> {
    let _guard = META_LOCK.lock().await;
    let dir = meta_dir();
    if !has_staged_changes(&dir).await? {
        return Ok(());
    }
    // Checked slice: `get` yields None instead of panicking when the
    // range is out of bounds or splits a multi-byte character.
    let short = sha.get(..12).unwrap_or(sha);
    git_commit(&dir, &format!("deploy {name} {tag} {short}")).await
}
/// Failure branch of the two-phase deploy.
///
/// Unstages `flake.lock` and then restores the working-tree copy, so
/// meta goes back to the previously committed shas. The failed proposal
/// is still captured in `applied/<n>`'s annotated `failed/<id>` tag.
///
/// # Errors
///
/// Fails on the first `git restore` invocation that fails; the second
/// step is not attempted if the first one errors.
#[allow(dead_code)]
pub async fn abort_deploy() -> Result<()> {
    let _guard = META_LOCK.lock().await;
    let meta = meta_dir();
    // Order matters: drop the index entry first, then the worktree edit.
    let steps: [&[&str]; 2] = [
        &["restore", "--staged", "flake.lock"],
        &["restore", "flake.lock"],
    ];
    for step in steps {
        git(&meta, step).await?;
    }
    Ok(())
}
/// Reports whether the index of the repo at `dir` differs from `HEAD`,
/// using `git diff --cached --quiet`.
///
/// Exit status 0 means nothing is staged, 1 means staged differences
/// exist; anything else (including termination without an exit code) is
/// reported as an error.
async fn has_staged_changes(dir: &Path) -> Result<bool> {
    let status = lifecycle::git_command()
        .current_dir(dir)
        .args(["diff", "--cached", "--quiet"])
        .status()
        .await
        .with_context(|| format!("git diff --cached in {}", dir.display()))?;
    if status.success() {
        return Ok(false);
    }
    if status.code() == Some(1) {
        return Ok(true);
    }
    bail!("git diff --cached exited unexpectedly")
}
/// Manual-rebuild helper: relock only the `agent-<name>` input and, if
/// that left the tree dirty, commit `flake.lock` as
/// `rebuild <name>: lock update`.
///
/// Single-phase — there is no separate finalize step because a rebuild
/// has no failure-revert semantics; it always wants the latest main.
///
/// # Errors
///
/// Fails if the nix update, the cleanliness probe, or the git
/// add / commit fails.
#[allow(dead_code)] // wired up by lifecycle::rebuild in this commit
pub async fn lock_update_for_rebuild(name: &str) -> Result<()> {
    let _guard = META_LOCK.lock().await;
    let meta = meta_dir();
    let flake_input = format!("agent-{name}");
    nix(&meta, &["flake", "update", flake_input.as_str()]).await?;
    if !git_is_clean(&meta).await? {
        git(&meta, &["add", "flake.lock"]).await?;
        git_commit(&meta, &format!("rebuild {name}: lock update")).await?;
    }
    Ok(())
}
/// Update one or more named inputs in the meta flake and commit
/// the resulting lock change with a single combined message.
/// Used by the dashboard's "update meta inputs" form so the
/// operator can bulk-bump `hyperhive` + selected agents in one
/// shot. Each input name is passed verbatim to
/// `nix flake update`; the caller is responsible for picking
/// real input keys (e.g. via `inputs_view()` snapshotted from
/// the lock file).
#[allow(dead_code)] // wired up by dashboard handler in the same commit
pub async fn lock_update(inputs: &[String]) -> Result<()> {
if inputs.is_empty() {
return Ok(());
}
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
let mut args: Vec<&str> = vec!["flake", "update"];
for i in inputs {
args.push(i.as_str());
}
nix(&dir, &args).await?;
if git_is_clean(&dir).await? {
return Ok(());
}
git(&dir, &["add", "flake.lock"]).await?;
let msg = if inputs.len() == 1 {
format!("lock update: {}", inputs[0])
} else {
format!("lock update: {}", inputs.join(", "))
};
git_commit(&dir, &msg).await
}
/// Auto-update helper: relock only the `hyperhive` input and commit
/// `flake.lock` as `bump hyperhive` when the update left the tree dirty.
/// Cheaper than `sync_agents` because the per-agent inputs aren't
/// touched.
///
/// # Errors
///
/// Fails if the nix update, the cleanliness probe, or the git
/// add / commit fails.
#[allow(dead_code)]
pub async fn lock_update_hyperhive() -> Result<()> {
    let _guard = META_LOCK.lock().await;
    let meta = meta_dir();
    nix(&meta, &["flake", "update", "hyperhive"]).await?;
    if !git_is_clean(&meta).await? {
        git(&meta, &["add", "flake.lock"]).await?;
        git_commit(&meta, "bump hyperhive").await?;
    }
    Ok(())
}
/// Render the text of the meta `flake.nix`.
///
/// The generated flake has one `hyperhive` input (`hyperhive_flake`,
/// inserted verbatim), one `agent-<name>` input per spec pointing at
/// `git+file://<APPLIED_ROOT>/<name>`, and one `nixosConfigurations`
/// entry per spec built through a shared `mkAgent` helper. `mkAgent`
/// extends either the `manager` or the `agent-base` configuration from
/// the hyperhive flake with the agent's own module, a git identity, and
/// the `HIVE_*` service environment.
///
/// `operator_pronouns` is free text and is escaped for a Nix
/// double-quoted string: backslash, double quote, and the `${`
/// interpolation opener. Agent names and `hyperhive_flake` are inserted
/// as-is; callers are expected to supply values that are valid in
/// those positions.
fn render_flake(
    hyperhive_flake: &str,
    dashboard_port: u16,
    operator_pronouns: &str,
    agents: &[AgentSpec],
) -> String {
    use std::fmt::Write as _;
    let mut out = String::new();
    out.push_str("{\n description = \"hyperhive deployed agents\";\n inputs = {\n");
    // Writing into a String cannot fail, hence the discarded results.
    let _ = writeln!(out, " hyperhive.url = \"{hyperhive_flake}\";");
    for spec in agents {
        let _ = writeln!(
            out,
            " agent-{}.url = \"git+file://{APPLIED_ROOT}/{}\";",
            spec.name, spec.name,
        );
    }
    out.push_str(" };\n outputs =\n { self, hyperhive, ... }@inputs:\n let\n");
    // Free-text operator string — escape backslash + double-quote so a
    // pronouns value like `he/him \ "rare"` round-trips into a valid
    // nix string literal without breaking the flake. `${` is escaped as
    // well: left alone it would open a Nix interpolation, letting the
    // value break evaluation or splice an expression into the flake.
    // Backslashes go first so the escapes added afterwards are not
    // themselves doubled.
    let pronouns_escaped = operator_pronouns
        .replace('\\', "\\\\")
        .replace('"', "\\\"")
        .replace("${", "\\${");
    let _ = writeln!(
        out,
        " dashboardPort = {dashboard_port};\n operatorPronouns = \"{pronouns_escaped}\";\n mkAgent = {{ name, isManager, port }}:"
    );
    // Raw string: the `${...}` sequences below are Nix interpolations
    // evaluated by nix, not Rust format placeholders.
    out.push_str(
        r#" let
base = if isManager
then hyperhive.nixosConfigurations.manager
else hyperhive.nixosConfigurations.agent-base;
input = inputs."agent-${name}";
service = if isManager then "hive-m1nd" else "hive-ag3nt";
in
base.extendModules {
modules = [
input.nixosModules.default
{
programs.git.config.user = {
name = name;
email = "${name}@hyperhive";
};
systemd.services.${service}.environment = {
HIVE_PORT = toString port;
HIVE_LABEL = name;
HIVE_DASHBOARD_PORT = toString dashboardPort;
HIVE_OPERATOR_PRONOUNS = operatorPronouns;
};
}
];
};
in
{
nixosConfigurations = {
"#,
    );
    for spec in agents {
        let _ = writeln!(
            out,
            " {} = mkAgent {{ name = \"{}\"; isManager = {}; port = {}; }};",
            spec.name,
            spec.name,
            if spec.is_manager { "true" } else { "false" },
            spec.port,
        );
    }
    out.push_str(" };\n };\n}\n");
    out
}
async fn git_is_clean(dir: &Path) -> Result<bool> {
let out = lifecycle::git_command()
.current_dir(dir)
.args(["status", "--porcelain"])
.output()
.await
.with_context(|| format!("git status in {}", dir.display()))?;
Ok(out.stdout.iter().all(u8::is_ascii_whitespace))
}
/// Runs git (as configured by `lifecycle::git_command`) with `args`
/// inside `dir` and waits for it to finish.
///
/// # Errors
///
/// Fails if the process cannot be run, or if it exits unsuccessfully —
/// in which case the error carries the argument list, the exit status,
/// and the trimmed stderr.
async fn git(dir: &Path, args: &[&str]) -> Result<()> {
    let output = lifecycle::git_command()
        .current_dir(dir)
        .args(args)
        .output()
        .await
        .with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    bail!(
        "git {} failed ({}): {}",
        args.join(" "),
        output.status,
        stderr.trim()
    )
}
/// Commits whatever is currently staged in `dir` with `message`,
/// supplying the hive identity (`GIT_NAME` / `GIT_EMAIL`) through
/// per-invocation `-c` overrides.
async fn git_commit(dir: &Path, message: &str) -> Result<()> {
    let user_name = format!("user.name={GIT_NAME}");
    let user_email = format!("user.email={GIT_EMAIL}");
    let argv = [
        "-c",
        user_name.as_str(),
        "-c",
        user_email.as_str(),
        "commit",
        "-m",
        message,
    ];
    git(dir, &argv).await
}
/// Runs `nix` with `args` inside `dir`, always prefixed with
/// `--extra-experimental-features "nix-command flakes"`.
///
/// # Errors
///
/// Fails if the process cannot be run, or if it exits unsuccessfully —
/// in which case the error carries the caller's arguments (without the
/// injected prefix), the exit status, and the trimmed stderr.
async fn nix(dir: &Path, args: &[&str]) -> Result<()> {
    // Belt-and-suspenders for hosts whose nix.conf doesn't enable these
    // features. The hyperhive module's deploy guide assumes flakes are
    // on already, but being defensive only costs one extra argv pair
    // per call.
    let argv: Vec<&str> = ["--extra-experimental-features", "nix-command flakes"]
        .iter()
        .chain(args)
        .copied()
        .collect();
    let output = Command::new("nix")
        .current_dir(dir)
        .args(&argv)
        .output()
        .await
        .with_context(|| format!("nix {} in {}", args.join(" "), dir.display()))?;
    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    bail!(
        "nix {} failed ({}): {}",
        args.join(" "),
        output.status,
        stderr.trim()
    )
}