The journal showed three concurrent rebuilds racing on the meta repo's .git/index.lock: auto_update::run kicks off a parallel tokio::spawn for every stale agent, and each rebuild eventually calls into meta::sync_agents / lock_update_for_rebuild, which do git add + commit. Git isn't safe across concurrent processes on the same .git/, and one of the children that failed mid-write left index.lock behind; subsequent ops blocked until somebody rm'd it manually. Fix: a static META_LOCK (tokio::sync::Mutex<()>) acquired at the top of every public meta function. Concurrent rebuilds take turns on meta ops; the actual nix build (nixos-container update) runs after the lock is released, so parallel agent builds still parallelize on nix-daemon's own concurrency model. migrate::run additionally clears /var/lib/hyperhive/meta/.git/index.lock on startup if it exists — we just booted, so nothing of ours is holding it. This covers the 'previous crash left a stale lock' case the user just hit, so the daemon recovers without manual intervention.
213 lines
7.5 KiB
Rust
213 lines
7.5 KiB
Rust
//! Startup auto-migration from the pre-meta layout. Runs before
|
|
//! `auto_update::run` and consists of four phases, each idempotent:
|
|
//!
|
|
//! 1. Per-agent applied repo: rewrite `flake.nix` to the module-only
|
|
//! boilerplate if it isn't already, commit, relocate `deployed/0`
|
|
//! to HEAD so `setup_applied`'s existence check passes.
|
|
//! 2. Per-agent proposed repo: ensure the `applied` git remote
|
|
//! points at `/applied/<n>/.git` (re-runs `setup_proposed`'s
|
|
//! `ensure_applied_remote` indirectly via a host-side git call).
|
|
//! 3. Meta repo: `meta::sync_agents` over the current agent list —
|
|
//! init the repo on first call, rerender + relock if anything
|
|
//! drifted.
|
|
//! 4. Container repoint: for every existing container, run
|
|
//! `nixos-container update <c> --flake meta#<name>` so it
|
|
//! activates against the meta flake. Guarded by a marker file
|
|
//! so the (expensive) phase 4 only runs once across hive-c0re
|
|
//! restarts.
|
|
//!
|
|
//! Env kill-switch: `HIVE_SKIP_META_MIGRATION=1` skips the whole
|
|
//! migration. Use when smoke-testing one agent at a time by hand.
|
|
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::Arc;
|
|
|
|
use anyhow::{Context, Result};
|
|
use tokio::process::Command;
|
|
|
|
use crate::coordinator::Coordinator;
|
|
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
|
|
use crate::meta;
|
|
|
|
/// Name of the environment variable that disables the whole startup
/// migration. `run` only checks that the variable can be read, not what
/// value it holds.
const KILL_SWITCH: &str = "HIVE_SKIP_META_MIGRATION";
|
|
|
|
/// Location of the phase 4 marker file. When this file exists, the
/// container repoint is skipped on later restarts.
fn repoint_marker() -> PathBuf {
    Path::new("/var/lib/hyperhive/.meta-migration-done").to_path_buf()
}
|
|
|
|
/// Fingerprint of the *current* agent flake boilerplate: an applied
/// repo whose `flake.nix` contains this substring is treated as already
/// migrated and left alone.
///
/// Bump it whenever the template changes, so the startup migration
/// re-renders existing agents onto the new shape. Right now the
/// fingerprint is the `flakeInputs` module-arg forwarding line; older
/// templates (raw `import ./agent.nix`) lack it and are rewritten on
/// the next hive-c0re start.
const MODULE_FLAKE_MARKER: &str = "_module.args.flakeInputs";
|
|
|
|
pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
|
|
if std::env::var(KILL_SWITCH).is_ok() {
|
|
tracing::info!("migration: {KILL_SWITCH} set — skipping");
|
|
return Ok(());
|
|
}
|
|
// Stale meta index lock: a previous hive-c0re crash mid-`git add`
|
|
// can leave `.git/index.lock` behind, which blocks every
|
|
// subsequent meta op until somebody `rm`s it manually. We just
|
|
// booted so nothing of ours is holding it; safe to clear.
|
|
let meta_lock = std::path::PathBuf::from("/var/lib/hyperhive/meta/.git/index.lock");
|
|
if meta_lock.exists() {
|
|
match std::fs::remove_file(&meta_lock) {
|
|
Ok(()) => tracing::warn!("cleared stale meta/.git/index.lock"),
|
|
Err(e) => tracing::warn!(error = ?e, "clear stale meta lock failed"),
|
|
}
|
|
}
|
|
let names = enumerate_agents().await;
|
|
tracing::info!(count = names.len(), "migration: scanning");
|
|
|
|
// Phase 1 + 2: per-agent applied + proposed.
|
|
for name in &names {
|
|
if let Err(e) = migrate_applied_repo(name).await {
|
|
tracing::warn!(%name, error = ?e, "migration: applied repo rewrite failed");
|
|
}
|
|
if let Err(e) = lifecycle::setup_proposed(&Coordinator::agent_proposed_dir(name), name)
|
|
.await
|
|
{
|
|
tracing::warn!(%name, error = ?e, "migration: setup_proposed failed");
|
|
}
|
|
}
|
|
|
|
// Phase 3: meta repo.
|
|
let agents = lifecycle::agents_for_meta_listing().await.unwrap_or_default();
|
|
if let Err(e) =
|
|
meta::sync_agents(
|
|
&coord.hyperhive_flake,
|
|
coord.dashboard_port,
|
|
&coord.operator_pronouns,
|
|
&agents,
|
|
)
|
|
.await
|
|
{
|
|
tracing::warn!(error = ?e, "migration: meta sync_agents failed");
|
|
}
|
|
|
|
// Phase 4: container repoint, guarded by marker.
|
|
if repoint_marker().exists() {
|
|
tracing::debug!("migration: phase 4 marker present, skipping repoint");
|
|
return Ok(());
|
|
}
|
|
let mut all_ok = true;
|
|
for name in &names {
|
|
// Mark Rebuilding so the crash watcher skips this container
|
|
// during the brief stop+start window the nixos-container
|
|
// update activation triggers. Without this, crash_watch
|
|
// would fire ContainerCrash for every agent here and the
|
|
// manager would spuriously try to recover them.
|
|
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
|
|
let result = repoint_container(name).await;
|
|
coord.clear_transient(name);
|
|
if let Err(e) = result {
|
|
tracing::warn!(%name, error = ?e, "migration: container repoint failed");
|
|
all_ok = false;
|
|
}
|
|
}
|
|
if all_ok && !names.is_empty()
|
|
&& let Err(e) = std::fs::write(repoint_marker(), b"done\n")
|
|
{
|
|
tracing::warn!(error = ?e, "migration: write repoint marker failed");
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
async fn enumerate_agents() -> Vec<String> {
|
|
let containers = lifecycle::list().await.unwrap_or_default();
|
|
containers
|
|
.into_iter()
|
|
.filter_map(|c| {
|
|
if c == MANAGER_NAME {
|
|
Some(MANAGER_NAME.to_owned())
|
|
} else {
|
|
c.strip_prefix(AGENT_PREFIX).map(str::to_owned)
|
|
}
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
async fn migrate_applied_repo(name: &str) -> Result<()> {
|
|
let dir = Coordinator::agent_applied_dir(name);
|
|
if !dir.join(".git").exists() {
|
|
return Ok(());
|
|
}
|
|
let flake_path = dir.join("flake.nix");
|
|
let cur = std::fs::read_to_string(&flake_path).unwrap_or_default();
|
|
if cur.contains(MODULE_FLAKE_MARKER) {
|
|
return Ok(());
|
|
}
|
|
let want = lifecycle::initial_flake_nix();
|
|
std::fs::write(&flake_path, want)
|
|
.with_context(|| format!("write {}", flake_path.display()))?;
|
|
raw_git(
|
|
&dir,
|
|
&[
|
|
"-c",
|
|
"user.name=hive-c0re",
|
|
"-c",
|
|
"user.email=hive-c0re@hyperhive",
|
|
"add",
|
|
"flake.nix",
|
|
],
|
|
)
|
|
.await?;
|
|
raw_git(
|
|
&dir,
|
|
&[
|
|
"-c",
|
|
"user.name=hive-c0re",
|
|
"-c",
|
|
"user.email=hive-c0re@hyperhive",
|
|
"commit",
|
|
"-m",
|
|
"migration: module-only flake",
|
|
],
|
|
)
|
|
.await?;
|
|
// Relocate deployed/0 to the migration commit so
|
|
// setup_applied's existence check passes.
|
|
raw_git(&dir, &["tag", "-f", "deployed/0", "HEAD"]).await?;
|
|
tracing::info!(%name, "migration: applied repo migrated to module-only flake");
|
|
Ok(())
|
|
}
|
|
|
|
async fn repoint_container(name: &str) -> Result<()> {
|
|
let container = lifecycle::container_name(name);
|
|
let flake_ref = format!("{}#{name}", meta::meta_dir().display());
|
|
let out = Command::new("nixos-container")
|
|
.args(["update", &container, "--flake", &flake_ref])
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("nixos-container update {container}"))?;
|
|
if !out.status.success() {
|
|
anyhow::bail!(
|
|
"nixos-container update {container} exited {}: {}",
|
|
out.status,
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
tracing::info!(%name, %container, "migration: container repointed at meta");
|
|
Ok(())
|
|
}
|
|
|
|
async fn raw_git(dir: &Path, args: &[&str]) -> Result<()> {
|
|
let out = lifecycle::git_command()
|
|
.current_dir(dir)
|
|
.args(args)
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
|
|
if !out.status.success() {
|
|
anyhow::bail!(
|
|
"git {} failed: {}",
|
|
args.join(" "),
|
|
String::from_utf8_lossy(&out.stderr).trim()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|