From d202f3785c171c840cded43e811f0fb6bfbff50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Sat, 16 May 2026 01:12:48 +0200 Subject: [PATCH] suppress crash_watch during background rebuilds + meta repoint crash_watch fires ContainerCrash whenever it sees a previously-running container in a non-running state without a transient flag set. dashboard rebuilds already set Rebuilding via lifecycle_action; the two other rebuild paths didn't: - migrate::repoint_container: phase 4 walks every container, each nixos-container update activation briefly takes the systemd unit down. previously fired ContainerCrash for every agent during the migration; manager would then spuriously call start() on agents that were already coming back up. - auto_update::rebuild_agent: startup scan + admin-socket caller bypass lifecycle_action. both paths now set the Rebuilding transient around the rebuild + clear after. matches what dashboard does. --- hive-c0re/src/auto_update.rs | 6 ++++++ hive-c0re/src/migrate.rs | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/hive-c0re/src/auto_update.rs b/hive-c0re/src/auto_update.rs index ad3db22..537da80 100644 --- a/hive-c0re/src/auto_update.rs +++ b/hive-c0re/src/auto_update.rs @@ -59,6 +59,11 @@ pub async fn rebuild_agent(coord: &Arc, name: &str, current_rev: &s let applied_dir = Coordinator::agent_applied_dir(name); let claude_dir = Coordinator::agent_claude_dir(name); let notes_dir = Coordinator::agent_notes_dir(name); + // Suppress crash_watch during the stop+start window inside + // lifecycle::rebuild. Dashboard rebuilds already do this via + // lifecycle_action; this catches the auto-update scan + any + // other direct caller. 
+ coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding); let result = lifecycle::rebuild( name, &coord.hyperhive_flake, @@ -69,6 +74,7 @@ pub async fn rebuild_agent(coord: &Arc, name: &str, current_rev: &s coord.dashboard_port, ) .await; + coord.clear_transient(name); match &result { Ok(()) => { if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) { diff --git a/hive-c0re/src/migrate.rs b/hive-c0re/src/migrate.rs index aab8d03..b613755 100644 --- a/hive-c0re/src/migrate.rs +++ b/hive-c0re/src/migrate.rs @@ -74,7 +74,15 @@ pub async fn run(coord: &Arc) -> Result<()> { } let mut all_ok = true; for name in &names { - if let Err(e) = repoint_container(name).await { + // Mark Rebuilding so the crash watcher skips this container + // during the brief stop+start window the nixos-container + // update activation triggers. Without this, crash_watch + // would fire ContainerCrash for every agent here and the + // manager would spuriously try to recover them. + coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding); + let result = repoint_container(name).await; + coord.clear_transient(name); + if let Err(e) = result { tracing::warn!(%name, error = ?e, "migration: container repoint failed"); all_ok = false; }