startup auto-migration from pre-meta layout
New `migrate` module runs before auto_update on hive-c0re boot. Four idempotent phases:
1. For every applied/<n>/ whose flake.nix isn't already the module-only boilerplate: rewrite + commit + relocate deployed/0 to HEAD so setup_applied's existence check passes.
2. For every proposed/<n>/config without an 'applied' remote: wire it (delegates to setup_proposed, which is now idempotent and adds the remote itself).
3. meta::sync_agents over the current container list — inits the meta repo on first call, rerenders + relocks if drifted.
4. nixos-container update <c> --flake meta#<name> for every container, guarded by /var/lib/hyperhive/.meta-migration-done so phase 4's expensive eval only runs once across restarts.
Env kill-switch HIVE_SKIP_META_MIGRATION=1 defers the whole thing. Each agent's failure is logged + skipped so one broken agent doesn't block the rest. Runs ahead of ensure_manager so the manager auto-spawn comes up against meta from the first attempt.
This commit is contained in:
parent
87016cd567
commit
59a89314f0
3 changed files with 194 additions and 1 deletions
|
|
@ -498,7 +498,7 @@ fn initial_agent_nix(name: &str) -> String {
|
|||
/// hive-c0re-owned meta flake at `/var/lib/hyperhive/meta/` as a flake
|
||||
/// input. Identity injection (`HIVE_PORT` / `HIVE_LABEL` / dashboard
|
||||
/// port / git committer) lives in the meta flake's wrapper, not here.
|
||||
fn initial_flake_nix() -> &'static str {
|
||||
pub fn initial_flake_nix() -> &'static str {
    // Module-only boilerplate, assembled one flake line at a time; the
    // result is a single compile-time string constant.
    concat!(
        "{\n",
        " description = \"hyperhive agent\";\n",
        " inputs = { };\n",
        " outputs = { self }: {\n",
        " nixosModules.default = import ./agent.nix;\n",
        " };\n",
        "}\n",
    )
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ mod events_vacuum;
|
|||
mod lifecycle;
|
||||
mod manager_server;
|
||||
mod meta;
|
||||
mod migrate;
|
||||
mod operator_questions;
|
||||
mod server;
|
||||
|
||||
|
|
@ -97,6 +98,15 @@ async fn main() -> Result<()> {
|
|||
} => {
|
||||
let coord = Arc::new(Coordinator::open(&db, hyperhive_flake, dashboard_port)?);
|
||||
manager_server::start(coord.clone())?;
|
||||
// Idempotent pre-flight: rewrite pre-meta-layout applied
|
||||
// repos, ensure proposed repos carry the `applied`
|
||||
// remote, bootstrap the meta repo, repoint containers at
|
||||
// `meta#<name>` (one-shot, guarded by a marker file).
|
||||
// Runs before manager auto-spawn so the new manager is
|
||||
// built against meta from the first attempt.
|
||||
if let Err(e) = migrate::run(&coord).await {
|
||||
tracing::warn!(error = ?e, "startup migration failed");
|
||||
}
|
||||
// Auto-create the manager container if it isn't there yet. Block
|
||||
// on this — without hm1nd the system has no manager harness.
|
||||
// Failures are logged but allowed: a broken auto-spawn shouldn't
|
||||
|
|
|
|||
183
hive-c0re/src/migrate.rs
Normal file
183
hive-c0re/src/migrate.rs
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
//! Startup auto-migration from the pre-meta layout. Runs before
|
||||
//! `auto_update::run` and consists of four phases, each idempotent:
|
||||
//!
|
||||
//! 1. Per-agent applied repo: rewrite `flake.nix` to the module-only
|
||||
//! boilerplate if it isn't already, commit, relocate `deployed/0`
|
||||
//! to HEAD so `setup_applied`'s existence check passes.
|
||||
//! 2. Per-agent proposed repo: ensure the `applied` git remote
|
||||
//!    points at `/applied/<n>/.git` (delegates to
//!    `lifecycle::setup_proposed`, which is idempotent and adds the
//!    remote itself).
|
||||
//! 3. Meta repo: `meta::sync_agents` over the current agent list —
|
||||
//! init the repo on first call, rerender + relock if anything
|
||||
//! drifted.
|
||||
//! 4. Container repoint: for every existing container, run
|
||||
//! `nixos-container update <c> --flake meta#<name>` so it
|
||||
//! activates against the meta flake. Guarded by a marker file
|
||||
//! so the (expensive) phase 4 only runs once across hive-c0re
|
||||
//! restarts.
|
||||
//!
|
||||
//! Env kill-switch: `HIVE_SKIP_META_MIGRATION=1` skips the whole
|
||||
//! migration. Use when smoke-testing one agent at a time by hand.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use tokio::process::Command;
|
||||
|
||||
use crate::coordinator::Coordinator;
|
||||
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
|
||||
use crate::meta;
|
||||
|
||||
/// Name of the environment variable that, when set, makes `run` return
/// early without performing any migration phase.
const KILL_SWITCH: &str = "HIVE_SKIP_META_MIGRATION";
|
||||
|
||||
/// Location of the phase-4 marker file. When this file exists, the
/// container repoint step is skipped on later restarts.
fn repoint_marker() -> PathBuf {
    const MARKER_PATH: &str = "/var/lib/hyperhive/.meta-migration-done";
    Path::new(MARKER_PATH).to_path_buf()
}
|
||||
|
||||
/// Substring that `migrate_applied_repo` looks for in an applied repo's
/// `flake.nix` to recognise the module-only boilerplate.
const MODULE_FLAKE_MARKER: &str = "nixosModules.default = import ./agent.nix";
|
||||
|
||||
pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
|
||||
if std::env::var(KILL_SWITCH).is_ok() {
|
||||
tracing::info!("migration: {KILL_SWITCH} set — skipping");
|
||||
return Ok(());
|
||||
}
|
||||
let names = enumerate_agents().await;
|
||||
tracing::info!(count = names.len(), "migration: scanning");
|
||||
|
||||
// Phase 1 + 2: per-agent applied + proposed.
|
||||
for name in &names {
|
||||
if let Err(e) = migrate_applied_repo(name).await {
|
||||
tracing::warn!(%name, error = ?e, "migration: applied repo rewrite failed");
|
||||
}
|
||||
if let Err(e) = lifecycle::setup_proposed(&Coordinator::agent_proposed_dir(name), name)
|
||||
.await
|
||||
{
|
||||
tracing::warn!(%name, error = ?e, "migration: setup_proposed failed");
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 3: meta repo.
|
||||
let agents = lifecycle::agents_for_meta_listing().await.unwrap_or_default();
|
||||
if let Err(e) =
|
||||
meta::sync_agents(&coord.hyperhive_flake, coord.dashboard_port, &agents).await
|
||||
{
|
||||
tracing::warn!(error = ?e, "migration: meta sync_agents failed");
|
||||
}
|
||||
|
||||
// Phase 4: container repoint, guarded by marker.
|
||||
if repoint_marker().exists() {
|
||||
tracing::debug!("migration: phase 4 marker present, skipping repoint");
|
||||
return Ok(());
|
||||
}
|
||||
let mut all_ok = true;
|
||||
for name in &names {
|
||||
if let Err(e) = repoint_container(name).await {
|
||||
tracing::warn!(%name, error = ?e, "migration: container repoint failed");
|
||||
all_ok = false;
|
||||
}
|
||||
}
|
||||
if all_ok && !names.is_empty()
|
||||
&& let Err(e) = std::fs::write(repoint_marker(), b"done\n")
|
||||
{
|
||||
tracing::warn!(error = ?e, "migration: write repoint marker failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn enumerate_agents() -> Vec<String> {
|
||||
let containers = lifecycle::list().await.unwrap_or_default();
|
||||
containers
|
||||
.into_iter()
|
||||
.filter_map(|c| {
|
||||
if c == MANAGER_NAME {
|
||||
Some(MANAGER_NAME.to_owned())
|
||||
} else {
|
||||
c.strip_prefix(AGENT_PREFIX).map(str::to_owned)
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn migrate_applied_repo(name: &str) -> Result<()> {
|
||||
let dir = Coordinator::agent_applied_dir(name);
|
||||
if !dir.join(".git").exists() {
|
||||
return Ok(());
|
||||
}
|
||||
let flake_path = dir.join("flake.nix");
|
||||
let cur = std::fs::read_to_string(&flake_path).unwrap_or_default();
|
||||
if cur.contains(MODULE_FLAKE_MARKER) {
|
||||
return Ok(());
|
||||
}
|
||||
let want = lifecycle::initial_flake_nix();
|
||||
std::fs::write(&flake_path, want)
|
||||
.with_context(|| format!("write {}", flake_path.display()))?;
|
||||
raw_git(
|
||||
&dir,
|
||||
&[
|
||||
"-c",
|
||||
"user.name=hive-c0re",
|
||||
"-c",
|
||||
"user.email=hive-c0re@hyperhive",
|
||||
"add",
|
||||
"flake.nix",
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
raw_git(
|
||||
&dir,
|
||||
&[
|
||||
"-c",
|
||||
"user.name=hive-c0re",
|
||||
"-c",
|
||||
"user.email=hive-c0re@hyperhive",
|
||||
"commit",
|
||||
"-m",
|
||||
"migration: module-only flake",
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
// Relocate deployed/0 to the migration commit so
|
||||
// setup_applied's existence check passes.
|
||||
raw_git(&dir, &["tag", "-f", "deployed/0", "HEAD"]).await?;
|
||||
tracing::info!(%name, "migration: applied repo migrated to module-only flake");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn repoint_container(name: &str) -> Result<()> {
|
||||
let container = lifecycle::container_name(name);
|
||||
let flake_ref = format!("{}#{name}", meta::meta_dir().display());
|
||||
let out = Command::new("nixos-container")
|
||||
.args(["update", &container, "--flake", &flake_ref])
|
||||
.output()
|
||||
.await
|
||||
.with_context(|| format!("nixos-container update {container}"))?;
|
||||
if !out.status.success() {
|
||||
anyhow::bail!(
|
||||
"nixos-container update {container} exited {}: {}",
|
||||
out.status,
|
||||
String::from_utf8_lossy(&out.stderr).trim()
|
||||
);
|
||||
}
|
||||
tracing::info!(%name, %container, "migration: container repointed at meta");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn raw_git(dir: &Path, args: &[&str]) -> Result<()> {
|
||||
let out = lifecycle::git_command()
|
||||
.current_dir(dir)
|
||||
.args(args)
|
||||
.output()
|
||||
.await
|
||||
.with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
|
||||
if !out.status.success() {
|
||||
anyhow::bail!(
|
||||
"git {} failed: {}",
|
||||
args.join(" "),
|
||||
String::from_utf8_lossy(&out.stderr).trim()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue