From 78f21ccc5d2d52a862fa525cc41ff92a9b639560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Sat, 16 May 2026 02:44:39 +0200 Subject: [PATCH] meta: serialize all ops behind a tokio mutex + clear stale lock at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit journal showed three concurrent rebuilds racing on the meta repo's .git/index.lock — auto_update::run kicks off parallel tokio::spawn for every stale agent, and each rebuild eventually calls into meta::sync_agents / lock_update_for_rebuild, which do git add + commit. git isn't safe across concurrent processes on the same .git/, and one of the children that failed mid-write left index.lock behind. subsequent ops blocked until somebody rm'd it manually. fix: static META_LOCK (tokio::sync::Mutex<()>) acquired at the top of every public meta function. concurrent rebuilds take turns on meta ops; the lock is released before the actual nix build (nixos-container update), which runs without it, so parallel agent builds still parallelize on nix-daemon's own concurrency model. migrate::run additionally clears /var/lib/hyperhive/meta/.git/index.lock on startup if it exists — we just booted, nothing of ours is holding it. covers the 'previous crash left a stale lock' case the user just hit so the daemon recovers without manual intervention. 
--- ...erver-allowedTools-into-allowedTools.patch | 55 +++++++++++++++++++ hive-c0re/src/meta.rs | 18 ++++++ hive-c0re/src/migrate.rs | 11 ++++ 3 files changed, 84 insertions(+) create mode 100644 0001-mcp-wire-extra-server-allowedTools-into-allowedTools.patch diff --git a/0001-mcp-wire-extra-server-allowedTools-into-allowedTools.patch b/0001-mcp-wire-extra-server-allowedTools-into-allowedTools.patch new file mode 100644 index 0000000..b3a563a --- /dev/null +++ b/0001-mcp-wire-extra-server-allowedTools-into-allowedTools.patch @@ -0,0 +1,55 @@ +From e0b18ff3c2ec5a7f771ab9a1a247ff4a24a8c475 Mon Sep 17 00:00:00 2001 +From: damocles +Date: Sat, 16 May 2026 02:28:21 +0200 +Subject: [PATCH] mcp: wire extra server allowedTools into --allowedTools arg + +--- + hive-ag3nt/src/mcp.rs | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/hive-ag3nt/src/mcp.rs b/hive-ag3nt/src/mcp.rs +index d8831b4..cb0918a 100644 +--- a/hive-ag3nt/src/mcp.rs ++++ b/hive-ag3nt/src/mcp.rs +@@ -539,6 +539,8 @@ impl ManagerServer { + )] + impl ServerHandler for ManagerServer {} + ++ ++ + /// Name of the hyperhive MCP server inside claude's view. Claude prefixes + /// tools as `mcp____` (e.g. `mcp__hyperhive__send`). + pub const SERVER_NAME: &str = "hyperhive"; +@@ -601,7 +603,9 @@ pub fn allowed_mcp_tools(flavor: Flavor) -> Vec { + } + + /// Combined allow-list passed to `--allowedTools` (auto-approve) — covers +-/// both the built-ins and the MCP surface. ++/// the built-ins, the hyperhive MCP surface, and any extra MCP servers. ++/// Extra server tools are read from the same `/etc/hyperhive/extra-mcp.json` ++/// file that `render_claude_config` uses, so the two are always in sync. 
+ #[must_use] + pub fn allowed_tools_arg(flavor: Flavor) -> String { + let mut all: Vec = ALLOWED_BUILTIN_TOOLS +@@ -609,6 +613,18 @@ pub fn allowed_tools_arg(flavor: Flavor) -> String { + .map(|s| (*s).to_owned()) + .collect(); + all.extend(allowed_mcp_tools(flavor)); ++ for (name, spec) in load_extra_mcp() { ++ if name == SERVER_NAME { ++ continue; // already covered above ++ } ++ for tool in &spec.allowed_tools { ++ if tool == "*" { ++ all.push(format!("mcp__{name}__*")); ++ } else { ++ all.push(format!("mcp__{name}__{tool}")); ++ } ++ } ++ } + all.join(",") + } + +-- +2.51.2 + diff --git a/hive-c0re/src/meta.rs b/hive-c0re/src/meta.rs index bc94862..0aa2919 100644 --- a/hive-c0re/src/meta.rs +++ b/hive-c0re/src/meta.rs @@ -20,6 +20,7 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail}; use tokio::process::Command; +use tokio::sync::Mutex; use crate::lifecycle; @@ -28,6 +29,17 @@ const APPLIED_ROOT: &str = "/var/lib/hyperhive/applied"; const GIT_NAME: &str = "hive-c0re"; const GIT_EMAIL: &str = "hive-c0re@hyperhive"; +/// Single-writer lock around every meta-repo operation. Git isn't +/// safe to drive from concurrent processes against the same `.git/` +/// — two simultaneous `git add` / `commit` invocations race on +/// `.git/index.lock`; if either dies before releasing, the lock +/// sticks and the next operation hits "another git process seems to +/// be running" until somebody `rm`s it manually. Holding this mutex +/// across each public function's git+nix calls makes parallel +/// rebuilds (`auto_update` + dashboard-triggered + apply-commit) +/// take turns instead of colliding. +static META_LOCK: Mutex<()> = Mutex::const_new(()); + /// Where the manager sees this directory inside its container (RO bind). 
#[allow(dead_code)] // wired up by set_nspawn_flags in a follow-up commit pub const CONTAINER_MANAGER_META_MOUNT: &str = "/meta"; @@ -56,6 +68,7 @@ pub async fn sync_agents( operator_pronouns: &str, agents: &[AgentSpec], ) -> Result<()> { + let _guard = META_LOCK.lock().await; let dir = meta_dir(); std::fs::create_dir_all(&dir).with_context(|| format!("create {}", dir.display()))?; @@ -104,6 +117,7 @@ pub async fn sync_agents( /// meta history only carries successful deploys. #[allow(dead_code)] // wired up by actions::run_apply_commit in a later commit pub async fn prepare_deploy(name: &str) -> Result<()> { + let _guard = META_LOCK.lock().await; let dir = meta_dir(); let input = format!("agent-{name}"); nix(&dir, &["flake", "update", &input]).await?; @@ -118,6 +132,7 @@ pub async fn prepare_deploy(name: &str) -> Result<()> { /// place (nothing staged → nothing to commit). #[allow(dead_code)] pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> { + let _guard = META_LOCK.lock().await; let dir = meta_dir(); if !has_staged_changes(&dir).await? { return Ok(()); @@ -131,6 +146,7 @@ pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> { /// captured in `applied/`'s annotated `failed/` tag. #[allow(dead_code)] pub async fn abort_deploy() -> Result<()> { + let _guard = META_LOCK.lock().await; let dir = meta_dir(); git(&dir, &["restore", "--staged", "flake.lock"]).await?; git(&dir, &["restore", "flake.lock"]).await @@ -157,6 +173,7 @@ async fn has_staged_changes(dir: &Path) -> Result { /// semantics — it always wants the latest main. 
#[allow(dead_code)] // wired up by lifecycle::rebuild in this commit pub async fn lock_update_for_rebuild(name: &str) -> Result<()> { + let _guard = META_LOCK.lock().await; let dir = meta_dir(); let input = format!("agent-{name}"); nix(&dir, &["flake", "update", &input]).await?; @@ -172,6 +189,7 @@ pub async fn lock_update_for_rebuild(name: &str) -> Result<()> { /// because the per-agent inputs aren't touched. #[allow(dead_code)] pub async fn lock_update_hyperhive() -> Result<()> { + let _guard = META_LOCK.lock().await; let dir = meta_dir(); nix(&dir, &["flake", "update", "hyperhive"]).await?; if git_is_clean(&dir).await? { diff --git a/hive-c0re/src/migrate.rs b/hive-c0re/src/migrate.rs index 33a7787..06b6906 100644 --- a/hive-c0re/src/migrate.rs +++ b/hive-c0re/src/migrate.rs @@ -49,6 +49,17 @@ pub async fn run(coord: &Arc) -> Result<()> { tracing::info!("migration: {KILL_SWITCH} set — skipping"); return Ok(()); } + // Stale meta index lock: a previous hive-c0re crash mid-`git add` + // can leave `.git/index.lock` behind, which blocks every + // subsequent meta op until somebody `rm`s it manually. We just + // booted so nothing of ours is holding it; safe to clear. + let meta_lock = std::path::PathBuf::from("/var/lib/hyperhive/meta/.git/index.lock"); + if meta_lock.exists() { + match std::fs::remove_file(&meta_lock) { + Ok(()) => tracing::warn!("cleared stale meta/.git/index.lock"), + Err(e) => tracing::warn!(error = ?e, "clear stale meta lock failed"), + } + } let names = enumerate_agents().await; tracing::info!(count = names.len(), "migration: scanning");