meta: serialize all ops behind a tokio mutex + clear stale lock at startup
journal showed three concurrent rebuilds racing on the meta repo's .git/index.lock: auto_update::run kicks off a parallel tokio::spawn for every stale agent, and each rebuild eventually calls into meta::sync_agents / lock_update_for_rebuild, which do git add + commit. git isn't safe across concurrent processes on the same .git/, and one of the children that failed mid-write left index.lock behind; subsequent ops blocked until somebody rm'd it manually. fix: static META_LOCK (tokio::sync::Mutex<()>) acquired at the top of every public meta function. concurrent rebuilds take turns on meta ops; the actual nix build (nixos-container update) releases the lock first and runs without it, so parallel agent builds still parallelize on nix-daemon's own concurrency model. migrate::run additionally clears /var/lib/hyperhive/meta/.git/index.lock on startup if it exists — we just booted, nothing of ours is holding it. covers the 'previous crash left a stale lock' case the user just hit, so the daemon recovers without manual intervention.
This commit is contained in:
parent
3db33b0fe5
commit
78f21ccc5d
3 changed files with 84 additions and 0 deletions
|
|
@ -0,0 +1,55 @@
|
|||
From e0b18ff3c2ec5a7f771ab9a1a247ff4a24a8c475 Mon Sep 17 00:00:00 2001
|
||||
From: damocles <damocles@hyperhive>
|
||||
Date: Sat, 16 May 2026 02:28:21 +0200
|
||||
Subject: [PATCH] mcp: wire extra server allowedTools into --allowedTools arg
|
||||
|
||||
---
|
||||
hive-ag3nt/src/mcp.rs | 18 +++++++++++++++++-
|
||||
1 file changed, 17 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hive-ag3nt/src/mcp.rs b/hive-ag3nt/src/mcp.rs
|
||||
index d8831b4..cb0918a 100644
|
||||
--- a/hive-ag3nt/src/mcp.rs
|
||||
+++ b/hive-ag3nt/src/mcp.rs
|
||||
@@ -539,6 +539,8 @@ impl ManagerServer {
|
||||
)]
|
||||
impl ServerHandler for ManagerServer {}
|
||||
|
||||
+
|
||||
+
|
||||
/// Name of the hyperhive MCP server inside claude's view. Claude prefixes
|
||||
/// tools as `mcp__<this>__<tool>` (e.g. `mcp__hyperhive__send`).
|
||||
pub const SERVER_NAME: &str = "hyperhive";
|
||||
@@ -601,7 +603,9 @@ pub fn allowed_mcp_tools(flavor: Flavor) -> Vec<String> {
|
||||
}
|
||||
|
||||
/// Combined allow-list passed to `--allowedTools` (auto-approve) — covers
|
||||
-/// both the built-ins and the MCP surface.
|
||||
+/// the built-ins, the hyperhive MCP surface, and any extra MCP servers.
|
||||
+/// Extra server tools are read from the same `/etc/hyperhive/extra-mcp.json`
|
||||
+/// file that `render_claude_config` uses, so the two are always in sync.
|
||||
#[must_use]
|
||||
pub fn allowed_tools_arg(flavor: Flavor) -> String {
|
||||
let mut all: Vec<String> = ALLOWED_BUILTIN_TOOLS
|
||||
@@ -609,6 +613,18 @@ pub fn allowed_tools_arg(flavor: Flavor) -> String {
|
||||
.map(|s| (*s).to_owned())
|
||||
.collect();
|
||||
all.extend(allowed_mcp_tools(flavor));
|
||||
+ for (name, spec) in load_extra_mcp() {
|
||||
+ if name == SERVER_NAME {
|
||||
+ continue; // already covered above
|
||||
+ }
|
||||
+ for tool in &spec.allowed_tools {
|
||||
+ if tool == "*" {
|
||||
+ all.push(format!("mcp__{name}__*"));
|
||||
+ } else {
|
||||
+ all.push(format!("mcp__{name}__{tool}"));
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
all.join(",")
|
||||
}
|
||||
|
||||
--
|
||||
2.51.2
|
||||
|
||||
|
|
@ -20,6 +20,7 @@ use std::path::{Path, PathBuf};
|
|||
|
||||
use anyhow::{Context, Result, bail};
|
||||
use tokio::process::Command;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use crate::lifecycle;
|
||||
|
||||
|
|
@ -28,6 +29,17 @@ const APPLIED_ROOT: &str = "/var/lib/hyperhive/applied";
|
|||
const GIT_NAME: &str = "hive-c0re";
|
||||
const GIT_EMAIL: &str = "hive-c0re@hyperhive";
|
||||
|
||||
/// Single-writer lock around every meta-repo operation. Git isn't
|
||||
/// safe to drive from concurrent processes against the same `.git/`
|
||||
/// — two simultaneous `git add` / `commit` invocations race on
|
||||
/// `.git/index.lock`; if either dies before releasing, the lock
|
||||
/// sticks and the next operation hits "another git process seems to
|
||||
/// be running" until somebody `rm`s it manually. Holding this mutex
|
||||
/// across each public function's git+nix calls makes parallel
|
||||
/// rebuilds (`auto_update` + dashboard-triggered + apply-commit)
|
||||
/// take turns instead of colliding.
|
||||
static META_LOCK: Mutex<()> = Mutex::const_new(());
|
||||
|
||||
/// Where the manager sees this directory inside its container (RO bind).
|
||||
#[allow(dead_code)] // wired up by set_nspawn_flags in a follow-up commit
|
||||
pub const CONTAINER_MANAGER_META_MOUNT: &str = "/meta";
|
||||
|
|
@ -56,6 +68,7 @@ pub async fn sync_agents(
|
|||
operator_pronouns: &str,
|
||||
agents: &[AgentSpec],
|
||||
) -> Result<()> {
|
||||
let _guard = META_LOCK.lock().await;
|
||||
let dir = meta_dir();
|
||||
std::fs::create_dir_all(&dir).with_context(|| format!("create {}", dir.display()))?;
|
||||
|
||||
|
|
@ -104,6 +117,7 @@ pub async fn sync_agents(
|
|||
/// meta history only carries successful deploys.
|
||||
#[allow(dead_code)] // wired up by actions::run_apply_commit in a later commit
|
||||
pub async fn prepare_deploy(name: &str) -> Result<()> {
|
||||
let _guard = META_LOCK.lock().await;
|
||||
let dir = meta_dir();
|
||||
let input = format!("agent-{name}");
|
||||
nix(&dir, &["flake", "update", &input]).await?;
|
||||
|
|
@ -118,6 +132,7 @@ pub async fn prepare_deploy(name: &str) -> Result<()> {
|
|||
/// place (nothing staged → nothing to commit).
|
||||
#[allow(dead_code)]
|
||||
pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> {
|
||||
let _guard = META_LOCK.lock().await;
|
||||
let dir = meta_dir();
|
||||
if !has_staged_changes(&dir).await? {
|
||||
return Ok(());
|
||||
|
|
@ -131,6 +146,7 @@ pub async fn finalize_deploy(name: &str, sha: &str, tag: &str) -> Result<()> {
|
|||
/// captured in `applied/<n>`'s annotated `failed/<id>` tag.
|
||||
#[allow(dead_code)]
|
||||
pub async fn abort_deploy() -> Result<()> {
|
||||
let _guard = META_LOCK.lock().await;
|
||||
let dir = meta_dir();
|
||||
git(&dir, &["restore", "--staged", "flake.lock"]).await?;
|
||||
git(&dir, &["restore", "flake.lock"]).await
|
||||
|
|
@ -157,6 +173,7 @@ async fn has_staged_changes(dir: &Path) -> Result<bool> {
|
|||
/// semantics — it always wants the latest main.
|
||||
#[allow(dead_code)] // wired up by lifecycle::rebuild in this commit
|
||||
pub async fn lock_update_for_rebuild(name: &str) -> Result<()> {
|
||||
let _guard = META_LOCK.lock().await;
|
||||
let dir = meta_dir();
|
||||
let input = format!("agent-{name}");
|
||||
nix(&dir, &["flake", "update", &input]).await?;
|
||||
|
|
@ -172,6 +189,7 @@ pub async fn lock_update_for_rebuild(name: &str) -> Result<()> {
|
|||
/// because the per-agent inputs aren't touched.
|
||||
#[allow(dead_code)]
|
||||
pub async fn lock_update_hyperhive() -> Result<()> {
|
||||
let _guard = META_LOCK.lock().await;
|
||||
let dir = meta_dir();
|
||||
nix(&dir, &["flake", "update", "hyperhive"]).await?;
|
||||
if git_is_clean(&dir).await? {
|
||||
|
|
|
|||
|
|
@ -49,6 +49,17 @@ pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
|
|||
tracing::info!("migration: {KILL_SWITCH} set — skipping");
|
||||
return Ok(());
|
||||
}
|
||||
// Stale meta index lock: a previous hive-c0re crash mid-`git add`
|
||||
// can leave `.git/index.lock` behind, which blocks every
|
||||
// subsequent meta op until somebody `rm`s it manually. We just
|
||||
// booted so nothing of ours is holding it; safe to clear.
|
||||
let meta_lock = std::path::PathBuf::from("/var/lib/hyperhive/meta/.git/index.lock");
|
||||
if meta_lock.exists() {
|
||||
match std::fs::remove_file(&meta_lock) {
|
||||
Ok(()) => tracing::warn!("cleared stale meta/.git/index.lock"),
|
||||
Err(e) => tracing::warn!(error = ?e, "clear stale meta lock failed"),
|
||||
}
|
||||
}
|
||||
let names = enumerate_agents().await;
|
||||
tracing::info!(count = names.len(), "migration: scanning");
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue