//! Single hive-c0re-owned flake at `/var/lib/hyperhive/meta/` that //! consumes every agent's applied repo as a flake input and exports one //! `nixosConfiguration` per agent. Containers run against //! `--flake /var/lib/hyperhive/meta#`; lifecycle ops here drive the //! lock file so meta's git log is the system-wide deploy audit trail. //! //! Flow: //! - `sync_agents` (idempotent) — render `flake.nix` for the current //! agent set, init the repo on first call, relock if the rendered //! contents changed, commit. Used by spawn / destroy / startup //! migration. //! - `prepare_deploy` + `finalize_deploy` / `abort_deploy` — two-phase //! for the `request_apply_commit` path so a failed //! `nixos-container update` leaves no orphan commit in meta. Prepare //! writes the new lock without committing; finalize commits with the //! deploy message; abort `git restore`s the lock back. //! - `lock_update_hyperhive` — one-shot for the auto-update path. use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail}; use tokio::process::Command; use tokio::sync::Mutex; use crate::lifecycle; const META_ROOT: &str = "/var/lib/hyperhive/meta"; const APPLIED_ROOT: &str = "/var/lib/hyperhive/applied"; const GIT_NAME: &str = "c0re"; const GIT_EMAIL: &str = "c0re@hyperhive"; /// Single-writer lock around every meta-repo operation. Git isn't /// safe to drive from concurrent processes against the same `.git/` /// — two simultaneous `git add` / `commit` invocations race on /// `.git/index.lock`; if either dies before releasing, the lock /// sticks and the next operation hits "another git process seems to /// be running" until somebody `rm`s it manually. Holding this mutex /// across each public function's git+nix calls makes parallel /// rebuilds (`auto_update` + dashboard-triggered + apply-commit) /// take turns instead of colliding. static META_LOCK: Mutex<()> = Mutex::const_new(()); /// Where the manager sees this directory inside its container (RO bind). #[allow(dead_code)] // wired up by set_nspawn_flags in a follow-up commit pub const CONTAINER_MANAGER_META_MOUNT: &str = "/meta"; #[derive(Debug, Clone)] pub struct AgentSpec { pub name: String, pub is_manager: bool, pub port: u16, } #[must_use] pub fn meta_dir() -> PathBuf { PathBuf::from(META_ROOT) } /// Idempotently reconcile the meta repo with the current agent set. /// First call inits the git repo, runs `nix flake lock`, and lands a /// seed commit. Subsequent calls only touch `flake.nix` when the /// rendered contents differ from disk; an unchanged `flake.nix` is a /// no-op. #[allow(dead_code)] // first caller lands in a later commit pub async fn sync_agents( hyperhive_flake: &str, dashboard_port: u16, operator_pronouns: &str, agents: &[AgentSpec], ) -> Result<()> { let _guard = META_LOCK.lock().await; let dir = meta_dir(); std::fs::create_dir_all(&dir).with_context(|| format!("create {}", dir.display()))?; let new_flake = render_flake(hyperhive_flake, dashboard_port, operator_pronouns, agents); let flake_path = dir.join("flake.nix"); let on_disk = std::fs::read_to_string(&flake_path).unwrap_or_default(); let initial = !dir.join(".git").exists(); if !initial && on_disk == new_flake { return Ok(()); } std::fs::write(&flake_path, &new_flake) .with_context(|| format!("write {}", flake_path.display()))?; if initial { git(&dir, &["init", "--initial-branch=main"]).await?; } // Stage flake.nix *before* running nix flake lock. When meta is // a git repo, nix treats it as a `git+file://` self-reference; // its dirty-tree fetcher includes index entries (tracked + // staged) but skips untracked files, so without the stage step // an untracked flake.nix surfaces as "source tree does not // contain '/flake.nix'". Lock then commit once with both // flake.nix and flake.lock — single commit per change. git(&dir, &["add", "flake.nix"]).await?; nix(&dir, &["flake", "lock"]).await?; if std::path::Path::new(&dir).join("flake.lock").exists() { git(&dir, &["add", "flake.lock"]).await?; } let msg = if initial { format!("seed meta from {} agent(s)", agents.len()) } else { "regenerate meta flake".to_owned() }; git_commit(&dir, &msg).await?; Ok(()) } /// Phase 1 of an apply-commit deploy. Updates the locked rev of /// `agent-` to whatever `applied//main` currently points /// at and **stages** the lock so `nixos-container update --flake /// meta#` (which reads via `git+file://`) sees the new rev via /// the index. Doesn't commit — `finalize_deploy` commits on build /// success, `abort_deploy` drops the staged change on failure so /// meta history only carries successful deploys. #[allow(dead_code)] // wired up by actions::run_apply_commit in a later commit pub async fn prepare_deploy(name: &str) -> Result<()> { let _guard = META_LOCK.lock().await; let dir = meta_dir(); let input = format!("agent-{name}"); nix(&dir, &["flake", "update", &input]).await?; // Stage the new lock — git+file://'s dirty-tree fetcher reads // index entries, so the upcoming nixos-container update sees the // bumped rev without a commit yet. git(&dir, &["add", "flake.lock"]).await } /// Phase 2-success. Commit the staged lock with the deployed tag + /// sha as the message. No-op when the rev was already at the right /// place (nothing staged → nothing to commit). #[allow(dead_code)] pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> { let _guard = META_LOCK.lock().await; let dir = meta_dir(); if !has_staged_changes(&dir).await? { return Ok(()); } let short = &sha[..sha.len().min(12)]; git_commit(&dir, &format!("deploy {name} {tag} {short}")).await } /// Phase 2-failure. Unstage + restore the lock so meta returns to /// the previously-committed shas. The failed proposal is still /// captured in `applied/`'s annotated `failed/` tag. #[allow(dead_code)] pub async fn abort_deploy() -> Result<()> { let _guard = META_LOCK.lock().await; let dir = meta_dir(); git(&dir, &["restore", "--staged", "flake.lock"]).await?; git(&dir, &["restore", "flake.lock"]).await } async fn has_staged_changes(dir: &Path) -> Result { let st = lifecycle::git_command() .current_dir(dir) .args(["diff", "--cached", "--quiet"]) .status() .await .with_context(|| format!("git diff --cached in {}", dir.display()))?; // exit 1 = differences present, 0 = no diff, other = error match st.code() { Some(0) => Ok(false), Some(1) => Ok(true), _ => bail!("git diff --cached exited unexpectedly"), } } /// One-shot used by the manual-rebuild path: relock just one /// agent's input and commit the lock change if any. Single-phase /// (no separate finalize) because rebuild has no failure-revert /// semantics — it always wants the latest main. #[allow(dead_code)] // wired up by lifecycle::rebuild in this commit pub async fn lock_update_for_rebuild(name: &str) -> Result<()> { let _guard = META_LOCK.lock().await; let dir = meta_dir(); let input = format!("agent-{name}"); nix(&dir, &["flake", "update", &input]).await?; if git_is_clean(&dir).await? { return Ok(()); } git(&dir, &["add", "flake.lock"]).await?; git_commit(&dir, &format!("rebuild {name}: lock update")).await } /// Update one or more named inputs in the meta flake and commit /// the resulting lock change with a single combined message. /// Used by the dashboard's "update meta inputs" form so the /// operator can bulk-bump `hyperhive` + selected agents in one /// shot. Each input name is passed verbatim to /// `nix flake update`; the caller is responsible for picking /// real input keys (e.g. via `inputs_view()` snapshotted from /// the lock file). #[allow(dead_code)] // wired up by dashboard handler in the same commit pub async fn lock_update(inputs: &[String]) -> Result<()> { if inputs.is_empty() { return Ok(()); } let _guard = META_LOCK.lock().await; let dir = meta_dir(); let mut args: Vec<&str> = vec!["flake", "update"]; for i in inputs { args.push(i.as_str()); } nix(&dir, &args).await?; if git_is_clean(&dir).await? { return Ok(()); } git(&dir, &["add", "flake.lock"]).await?; let msg = if inputs.len() == 1 { format!("lock update: {}", inputs[0]) } else { format!("lock update: {}", inputs.join(", ")) }; git_commit(&dir, &msg).await } /// One-shot used by the auto-update path: pin the latest hyperhive /// rev, commit if the lock changed. Cheaper than `sync_agents` /// because the per-agent inputs aren't touched. #[allow(dead_code)] pub async fn lock_update_hyperhive() -> Result<()> { let _guard = META_LOCK.lock().await; let dir = meta_dir(); nix(&dir, &["flake", "update", "hyperhive"]).await?; if git_is_clean(&dir).await? { return Ok(()); } git(&dir, &["add", "flake.lock"]).await?; git_commit(&dir, "bump hyperhive").await } fn render_flake( hyperhive_flake: &str, dashboard_port: u16, operator_pronouns: &str, agents: &[AgentSpec], ) -> String { use std::fmt::Write as _; let mut out = String::new(); out.push_str("{\n description = \"hyperhive deployed agents\";\n inputs = {\n"); let _ = writeln!(out, " hyperhive.url = \"{hyperhive_flake}\";"); for spec in agents { let _ = writeln!( out, " agent-{}.url = \"git+file://{APPLIED_ROOT}/{}\";", spec.name, spec.name, ); } out.push_str(" };\n outputs =\n { self, hyperhive, ... }@inputs:\n let\n"); // Free-text operator string — escape backslash + double-quote so a // pronouns value like `he/him \ "rare"` round-trips into a valid // nix string literal without breaking the flake. let pronouns_escaped = operator_pronouns.replace('\\', "\\\\").replace('"', "\\\""); let _ = writeln!( out, " dashboardPort = {dashboard_port};\n operatorPronouns = \"{pronouns_escaped}\";\n mkAgent = {{ name, isManager, port }}:" ); out.push_str( r#" let base = if isManager then hyperhive.nixosConfigurations.manager else hyperhive.nixosConfigurations.agent-base; input = inputs."agent-${name}"; service = if isManager then "hive-m1nd" else "hive-ag3nt"; in base.extendModules { modules = [ input.nixosModules.default { programs.git.config.user = { name = name; email = "${name}@hyperhive"; }; systemd.services.${service}.environment = { HIVE_PORT = toString port; HIVE_LABEL = name; HIVE_DASHBOARD_PORT = toString dashboardPort; HIVE_OPERATOR_PRONOUNS = operatorPronouns; }; } ]; }; in { nixosConfigurations = { "#, ); for spec in agents { let _ = writeln!( out, " {} = mkAgent {{ name = \"{}\"; isManager = {}; port = {}; }};", spec.name, spec.name, if spec.is_manager { "true" } else { "false" }, spec.port, ); } out.push_str(" };\n };\n}\n"); out } async fn git_is_clean(dir: &Path) -> Result { let out = lifecycle::git_command() .current_dir(dir) .args(["status", "--porcelain"]) .output() .await .with_context(|| format!("git status in {}", dir.display()))?; Ok(out.stdout.iter().all(u8::is_ascii_whitespace)) } async fn git(dir: &Path, args: &[&str]) -> Result<()> { let out = lifecycle::git_command() .current_dir(dir) .args(args) .output() .await .with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?; if !out.status.success() { bail!( "git {} failed ({}): {}", args.join(" "), out.status, String::from_utf8_lossy(&out.stderr).trim() ); } Ok(()) } async fn git_commit(dir: &Path, message: &str) -> Result<()> { git( dir, &[ "-c", &format!("user.name={GIT_NAME}"), "-c", &format!("user.email={GIT_EMAIL}"), "commit", "-m", message, ], ) .await } async fn nix(dir: &Path, args: &[&str]) -> Result<()> { // `--extra-experimental-features` belt-and-suspenders for hosts // that haven't set this in nix.conf. The hyperhive module's // deploy guide assumes flakes are already enabled, but the cost // of being defensive is one extra argv each call. let mut all = vec!["--extra-experimental-features", "nix-command flakes"]; all.extend(args); let out = Command::new("nix") .current_dir(dir) .args(&all) .output() .await .with_context(|| format!("nix {} in {}", args.join(" "), dir.display()))?; if !out.status.success() { bail!( "nix {} failed ({}): {}", args.join(" "), out.status, String::from_utf8_lossy(&out.stderr).trim() ); } Ok(()) }