From d202f3785c171c840cded43e811f0fb6bfbff50e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?m=C3=BCde?= <git@darkest.space>
Date: Sat, 16 May 2026 01:12:48 +0200
Subject: [PATCH] suppress crash_watch during background rebuilds + meta
 repoint

crash_watch fires ContainerCrash whenever it sees a previously-
running container in a non-running state without a transient
flag set. dashboard rebuilds already set Rebuilding via
lifecycle_action; the two other rebuild paths didn't:

- migrate::repoint_container: phase 4 walks every container, and
  each nixos-container update activation briefly takes the
  systemd unit down. crash_watch previously fired ContainerCrash
  for every agent during the migration; the manager would then
  spuriously call start() on agents that were already coming
  back up.
- auto_update::rebuild_agent: the startup scan and the
  admin-socket caller both bypass lifecycle_action, so
  Rebuilding was never set for them.

both paths now set the Rebuilding transient before the rebuild
and clear it afterwards, matching what the dashboard does.
---
 hive-c0re/src/auto_update.rs |  6 ++++++
 hive-c0re/src/migrate.rs     | 10 +++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/hive-c0re/src/auto_update.rs b/hive-c0re/src/auto_update.rs
index ad3db22..537da80 100644
--- a/hive-c0re/src/auto_update.rs
+++ b/hive-c0re/src/auto_update.rs
@@ -59,6 +59,11 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
     let applied_dir = Coordinator::agent_applied_dir(name);
     let claude_dir = Coordinator::agent_claude_dir(name);
     let notes_dir = Coordinator::agent_notes_dir(name);
+    // Suppress crash_watch during the stop+start window inside
+    // lifecycle::rebuild. Dashboard rebuilds already do this via
+    // lifecycle_action; this catches the auto-update scan + any
+    // other direct caller.
+    coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
     let result = lifecycle::rebuild(
         name,
         &coord.hyperhive_flake,
@@ -69,6 +74,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
         coord.dashboard_port,
     )
     .await;
+    coord.clear_transient(name);
     match &result {
         Ok(()) => {
             if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {
diff --git a/hive-c0re/src/migrate.rs b/hive-c0re/src/migrate.rs
index aab8d03..b613755 100644
--- a/hive-c0re/src/migrate.rs
+++ b/hive-c0re/src/migrate.rs
@@ -74,7 +74,15 @@ pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
     }
     let mut all_ok = true;
     for name in &names {
-        if let Err(e) = repoint_container(name).await {
+        // Mark Rebuilding so the crash watcher skips this container
+        // during the brief stop+start window the nixos-container
+        // update activation triggers. Without this, crash_watch
+        // would fire ContainerCrash for every agent here and the
+        // manager would spuriously try to recover them.
+        coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
+        let result = repoint_container(name).await;
+        coord.clear_transient(name);
+        if let Err(e) = result {
             tracing::warn!(%name, error = ?e, "migration: container repoint failed");
             all_ok = false;
         }