meta: serialize all ops behind a tokio mutex + clear stale lock at startup

journal showed three concurrent rebuilds racing on the meta
repo's .git/index.lock. auto_update::run kicks off a parallel
tokio::spawn for every stale agent, and each rebuild eventually
calls into meta::sync_agents / lock_update_for_rebuild, which
do git add + commit. git isn't safe across concurrent processes
on the same .git/, and one of the children that failed mid-write
left index.lock behind. subsequent ops blocked until somebody
rm'd it manually.

fix: static META_LOCK (tokio::sync::Mutex<()>) acquired at the
top of every public meta function. concurrent rebuilds take
turns on meta ops; the lock is released before the actual nix
build (nixos-container update) starts, so parallel agent builds
still parallelize on nix-daemon's own concurrency model.

migrate::run additionally clears /var/lib/hyperhive/meta/.git/
index.lock on startup if it exists — we just booted, nothing
of ours is holding it. covers the 'previous crash left a stale
lock' case the user just hit so the daemon recovers without
manual intervention.
This commit is contained in:
müde 2026-05-16 02:44:39 +02:00
parent 3db33b0fe5
commit 78f21ccc5d
3 changed files with 84 additions and 0 deletions

View file

@ -20,6 +20,7 @@ use std::path::{Path, PathBuf};
use anyhow::{Context, Result, bail};
use tokio::process::Command;
use tokio::sync::Mutex;
use crate::lifecycle;
@ -28,6 +29,17 @@ const APPLIED_ROOT: &str = "/var/lib/hyperhive/applied";
const GIT_NAME: &str = "hive-c0re";
const GIT_EMAIL: &str = "hive-c0re@hyperhive";
/// Single-writer lock around every meta-repo operation.
///
/// Git isn't safe to drive from concurrent processes against the same
/// `.git/` — two simultaneous `git add` / `commit` invocations race on
/// `.git/index.lock`; if either dies before releasing, the lock
/// sticks and the next operation hits "another git process seems to
/// be running" until somebody `rm`s it manually. Holding this mutex
/// across each public function's git+nix calls makes parallel
/// rebuilds (`auto_update` + dashboard-triggered + apply-commit)
/// take turns instead of colliding.
///
/// Usage: every public function in this module takes the guard as its
/// first statement (`let _guard = META_LOCK.lock().await;`) and keeps
/// it until it returns. This is the tokio mutex, so holding the guard
/// across `.await` points is fine.
///
/// Caveats:
/// - The lock only serializes callers inside this process; it does
///   nothing about a git process started by anything else.
/// - It is not reentrant. A function that already holds the guard must
///   not call another function that locks `META_LOCK`, or it will wait
///   on itself forever. Private helpers such as `has_staged_changes`
///   and `git_is_clean` are called with the guard already held, so
///   they must not take the lock themselves.
static META_LOCK: Mutex<()> = Mutex::const_new(());
/// Where the manager sees this directory inside its container (RO bind).
#[allow(dead_code)] // wired up by set_nspawn_flags in a follow-up commit
pub const CONTAINER_MANAGER_META_MOUNT: &str = "/meta";
@ -56,6 +68,7 @@ pub async fn sync_agents(
operator_pronouns: &str,
agents: &[AgentSpec],
) -> Result<()> {
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
std::fs::create_dir_all(&dir).with_context(|| format!("create {}", dir.display()))?;
@ -104,6 +117,7 @@ pub async fn sync_agents(
/// meta history only carries successful deploys.
#[allow(dead_code)] // wired up by actions::run_apply_commit in a later commit
pub async fn prepare_deploy(name: &str) -> Result<()> {
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
let input = format!("agent-{name}");
nix(&dir, &["flake", "update", &input]).await?;
@ -118,6 +132,7 @@ pub async fn prepare_deploy(name: &str) -> Result<()> {
/// place (nothing staged → nothing to commit).
#[allow(dead_code)]
pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> {
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
if !has_staged_changes(&dir).await? {
return Ok(());
@ -131,6 +146,7 @@ pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> {
/// captured in `applied/<n>`'s annotated `failed/<id>` tag.
#[allow(dead_code)]
pub async fn abort_deploy() -> Result<()> {
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
git(&dir, &["restore", "--staged", "flake.lock"]).await?;
git(&dir, &["restore", "flake.lock"]).await
@ -157,6 +173,7 @@ async fn has_staged_changes(dir: &Path) -> Result<bool> {
/// semantics — it always wants the latest main.
#[allow(dead_code)] // wired up by lifecycle::rebuild in this commit
pub async fn lock_update_for_rebuild(name: &str) -> Result<()> {
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
let input = format!("agent-{name}");
nix(&dir, &["flake", "update", &input]).await?;
@ -172,6 +189,7 @@ pub async fn lock_update_for_rebuild(name: &str) -> Result<()> {
/// because the per-agent inputs aren't touched.
#[allow(dead_code)]
pub async fn lock_update_hyperhive() -> Result<()> {
let _guard = META_LOCK.lock().await;
let dir = meta_dir();
nix(&dir, &["flake", "update", "hyperhive"]).await?;
if git_is_clean(&dir).await? {