From 59a89314f0663d36177b5c961a75ba79a49bbb52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?m=C3=BCde?=
Date: Sat, 16 May 2026 00:34:58 +0200
Subject: [PATCH] startup auto-migration from pre-meta layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

new migrate module runs before auto_update on hive-c0re boot. four
idempotent phases:

1. for every applied/<name>/ whose flake.nix isn't already the
   module-only boilerplate, rewrite + commit + relocate deployed/0
   to HEAD so setup_applied's existence check passes
2. for every proposed/<name>/config without an 'applied' remote,
   wire it (delegates to setup_proposed which is now idempotent
   and adds the remote itself)
3. meta::sync_agents over the current container list — inits the
   meta repo on first call, rerender + relock if drifted
4. nixos-container update <container> --flake meta#<name> for every
   container, guarded by /var/lib/hyperhive/.meta-migration-done so
   phase 4's expensive eval only runs once across restarts

env kill-switch HIVE_SKIP_META_MIGRATION=1 defers the whole thing.
each agent's failure is logged + skipped so one broken agent
doesn't block the rest. runs ahead of ensure_manager so the
manager auto-spawn comes up against meta from the first attempt.
---
 hive-c0re/src/lifecycle.rs |   2 +-
 hive-c0re/src/main.rs      |  10 ++
 hive-c0re/src/migrate.rs   | 183 +++++++++++++++++++++++++++++++++++++
 3 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 hive-c0re/src/migrate.rs

diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs
index 9307fd7..525005d 100644
--- a/hive-c0re/src/lifecycle.rs
+++ b/hive-c0re/src/lifecycle.rs
@@ -498,7 +498,7 @@ fn initial_agent_nix(name: &str) -> String {
 /// hive-c0re-owned meta flake at `/var/lib/hyperhive/meta/` as a flake
 /// input. Identity injection (`HIVE_PORT` / `HIVE_LABEL` / dashboard
 /// port / git committer) lives in the meta flake's wrapper, not here.
-fn initial_flake_nix() -> &'static str {
+pub fn initial_flake_nix() -> &'static str {
     "{\n description = \"hyperhive agent\";\n inputs = { };\n outputs = { self }: {\n nixosModules.default = import ./agent.nix;\n };\n}\n"
 }
 
diff --git a/hive-c0re/src/main.rs b/hive-c0re/src/main.rs
index eea5a96..e6b5893 100644
--- a/hive-c0re/src/main.rs
+++ b/hive-c0re/src/main.rs
@@ -18,6 +18,7 @@ mod events_vacuum;
 mod lifecycle;
 mod manager_server;
 mod meta;
+mod migrate;
 mod operator_questions;
 mod server;
 
@@ -97,6 +98,15 @@ async fn main() -> Result<()> {
         } => {
             let coord = Arc::new(Coordinator::open(&db, hyperhive_flake, dashboard_port)?);
             manager_server::start(coord.clone())?;
+            // Idempotent pre-flight: rewrite pre-meta-layout applied
+            // repos, ensure proposed repos carry the `applied`
+            // remote, bootstrap the meta repo, repoint containers at
+            // `meta#<name>` (one-shot, guarded by a marker file).
+            // Runs before manager auto-spawn so the new manager is
+            // built against meta from the first attempt.
+            if let Err(e) = migrate::run(&coord).await {
+                tracing::warn!(error = ?e, "startup migration failed");
+            }
             // Auto-create the manager container if it isn't there yet. Block
             // on this — without hm1nd the system has no manager harness.
             // Failures are logged but allowed: a broken auto-spawn shouldn't
diff --git a/hive-c0re/src/migrate.rs b/hive-c0re/src/migrate.rs
new file mode 100644
index 0000000..aab8d03
--- /dev/null
+++ b/hive-c0re/src/migrate.rs
@@ -0,0 +1,183 @@
+//! Startup auto-migration from the pre-meta layout. Runs before
+//! `auto_update::run` and consists of four phases, each idempotent:
+//!
+//! 1. Per-agent applied repo: rewrite `flake.nix` to the module-only
+//!    boilerplate if it isn't already, commit, relocate `deployed/0`
+//!    to HEAD so `setup_applied`'s existence check passes.
+//! 2. Per-agent proposed repo: ensure the `applied` git remote
+//!    points at `<root>/applied/<name>/.git` (re-runs `setup_proposed`'s
+//!    `ensure_applied_remote` indirectly via a host-side git call).
+//! 3. Meta repo: `meta::sync_agents` over the current agent list —
+//!    init the repo on first call, rerender + relock if anything
+//!    drifted.
+//! 4. Container repoint: for every existing container, run
+//!    `nixos-container update <container> --flake meta#<name>` so it
+//!    activates against the meta flake. Guarded by a marker file
+//!    so the (expensive) phase 4 only runs once across hive-c0re
+//!    restarts.
+//!
+//! Env kill-switch: `HIVE_SKIP_META_MIGRATION=1` skips the whole
+//! migration. Use when smoke-testing one agent at a time by hand.
+
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use anyhow::{Context, Result};
+use tokio::process::Command;
+
+use crate::coordinator::Coordinator;
+use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
+use crate::meta;
+
+const KILL_SWITCH: &str = "HIVE_SKIP_META_MIGRATION";
+
+/// Marker for phase 4. Once present, container repoint is skipped on
+/// future restarts.
+fn repoint_marker() -> PathBuf {
+    PathBuf::from("/var/lib/hyperhive/.meta-migration-done")
+}
+
+const MODULE_FLAKE_MARKER: &str = "nixosModules.default = import ./agent.nix";
+
+pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
+    if std::env::var(KILL_SWITCH).is_ok() {
+        tracing::info!("migration: {KILL_SWITCH} set — skipping");
+        return Ok(());
+    }
+    let names = enumerate_agents().await;
+    tracing::info!(count = names.len(), "migration: scanning");
+
+    // Phase 1 + 2: per-agent applied + proposed.
+    for name in &names {
+        if let Err(e) = migrate_applied_repo(name).await {
+            tracing::warn!(%name, error = ?e, "migration: applied repo rewrite failed");
+        }
+        if let Err(e) = lifecycle::setup_proposed(&Coordinator::agent_proposed_dir(name), name)
+            .await
+        {
+            tracing::warn!(%name, error = ?e, "migration: setup_proposed failed");
+        }
+    }
+
+    // Phase 3: meta repo.
+    let agents = lifecycle::agents_for_meta_listing().await.unwrap_or_default();
+    if let Err(e) =
+        meta::sync_agents(&coord.hyperhive_flake, coord.dashboard_port, &agents).await
+    {
+        tracing::warn!(error = ?e, "migration: meta sync_agents failed");
+    }
+
+    // Phase 4: container repoint, guarded by marker.
+    if repoint_marker().exists() {
+        tracing::debug!("migration: phase 4 marker present, skipping repoint");
+        return Ok(());
+    }
+    let mut all_ok = true;
+    for name in &names {
+        if let Err(e) = repoint_container(name).await {
+            tracing::warn!(%name, error = ?e, "migration: container repoint failed");
+            all_ok = false;
+        }
+    }
+    if all_ok && !names.is_empty()
+        && let Err(e) = std::fs::write(repoint_marker(), b"done\n")
+    {
+        tracing::warn!(error = ?e, "migration: write repoint marker failed");
+    }
+    Ok(())
+}
+
+async fn enumerate_agents() -> Vec<String> {
+    let containers = lifecycle::list().await.unwrap_or_default();
+    containers
+        .into_iter()
+        .filter_map(|c| {
+            if c == MANAGER_NAME {
+                Some(MANAGER_NAME.to_owned())
+            } else {
+                c.strip_prefix(AGENT_PREFIX).map(str::to_owned)
+            }
+        })
+        .collect()
+}
+
+async fn migrate_applied_repo(name: &str) -> Result<()> {
+    let dir = Coordinator::agent_applied_dir(name);
+    if !dir.join(".git").exists() {
+        return Ok(());
+    }
+    let flake_path = dir.join("flake.nix");
+    let cur = std::fs::read_to_string(&flake_path).unwrap_or_default();
+    if cur.contains(MODULE_FLAKE_MARKER) {
+        return Ok(());
+    }
+    let want = lifecycle::initial_flake_nix();
+    std::fs::write(&flake_path, want)
+        .with_context(|| format!("write {}", flake_path.display()))?;
+    raw_git(
+        &dir,
+        &[
+            "-c",
+            "user.name=hive-c0re",
+            "-c",
+            "user.email=hive-c0re@hyperhive",
+            "add",
+            "flake.nix",
+        ],
+    )
+    .await?;
+    raw_git(
+        &dir,
+        &[
+            "-c",
+            "user.name=hive-c0re",
+            "-c",
+            "user.email=hive-c0re@hyperhive",
+            "commit",
+            "-m",
+            "migration: module-only flake",
+        ],
+    )
+    .await?;
+    // Relocate deployed/0 to the migration commit so
+    // setup_applied's existence check passes.
+    raw_git(&dir, &["tag", "-f", "deployed/0", "HEAD"]).await?;
+    tracing::info!(%name, "migration: applied repo migrated to module-only flake");
+    Ok(())
+}
+
+async fn repoint_container(name: &str) -> Result<()> {
+    let container = lifecycle::container_name(name);
+    let flake_ref = format!("{}#{name}", meta::meta_dir().display());
+    let out = Command::new("nixos-container")
+        .args(["update", &container, "--flake", &flake_ref])
+        .output()
+        .await
+        .with_context(|| format!("nixos-container update {container}"))?;
+    if !out.status.success() {
+        anyhow::bail!(
+            "nixos-container update {container} exited {}: {}",
+            out.status,
+            String::from_utf8_lossy(&out.stderr).trim()
+        );
+    }
+    tracing::info!(%name, %container, "migration: container repointed at meta");
+    Ok(())
+}
+
+async fn raw_git(dir: &Path, args: &[&str]) -> Result<()> {
+    let out = lifecycle::git_command()
+        .current_dir(dir)
+        .args(args)
+        .output()
+        .await
+        .with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
+    if !out.status.success() {
+        anyhow::bail!(
+            "git {} failed: {}",
+            args.join(" "),
+            String::from_utf8_lossy(&out.stderr).trim()
+        );
+    }
+    Ok(())
+}