suppress crash_watch during background rebuilds + meta repoint

crash_watch fires ContainerCrash whenever it sees a previously-
running container in a non-running state without a transient
flag set. dashboard rebuilds already set Rebuilding via
lifecycle_action; the two other rebuild paths didn't:

- migrate::repoint_container: phase 4 walks every container, and
  each nixos-container update activation briefly takes the
  systemd unit down. previously this fired ContainerCrash for
  every agent during the migration; the manager would then
  spuriously call start() on agents that were already coming
  back up.
- auto_update::rebuild_agent: the startup scan and the
  admin-socket caller both bypass lifecycle_action, so the
  Rebuilding transient was never set on this path.

both paths now set the Rebuilding transient before the rebuild
and clear it afterwards, matching what the dashboard does.
This commit is contained in:
müde 2026-05-16 01:12:48 +02:00
parent 63e8a98df2
commit d202f3785c
2 changed files with 15 additions and 1 deletions

View file

@ -59,6 +59,11 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
let applied_dir = Coordinator::agent_applied_dir(name); let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name); let claude_dir = Coordinator::agent_claude_dir(name);
let notes_dir = Coordinator::agent_notes_dir(name); let notes_dir = Coordinator::agent_notes_dir(name);
// Suppress crash_watch during the stop+start window inside
// lifecycle::rebuild. Dashboard rebuilds already do this via
// lifecycle_action; this catches the auto-update scan + any
// other direct caller.
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
let result = lifecycle::rebuild( let result = lifecycle::rebuild(
name, name,
&coord.hyperhive_flake, &coord.hyperhive_flake,
@ -69,6 +74,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
coord.dashboard_port, coord.dashboard_port,
) )
.await; .await;
coord.clear_transient(name);
match &result { match &result {
Ok(()) => { Ok(()) => {
if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) { if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {

View file

@ -74,7 +74,15 @@ pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
} }
let mut all_ok = true; let mut all_ok = true;
for name in &names { for name in &names {
if let Err(e) = repoint_container(name).await { // Mark Rebuilding so the crash watcher skips this container
// during the brief stop+start window the nixos-container
// update activation triggers. Without this, crash_watch
// would fire ContainerCrash for every agent here and the
// manager would spuriously try to recover them.
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
let result = repoint_container(name).await;
coord.clear_transient(name);
if let Err(e) = result {
tracing::warn!(%name, error = ?e, "migration: container repoint failed"); tracing::warn!(%name, error = ?e, "migration: container repoint failed");
all_ok = false; all_ok = false;
} }