fix: transient state leak via RAII guard

Bare set_transient/clear_transient pairs leak the in-memory transient state on task cancellation, on panic, or on any early return between the two calls; the dashboard then shows the agent stuck in 'rebuilding…' forever (coder hit this today). Add Coordinator::transient_guard, which returns a TransientGuard whose Drop clears the transient, and convert every caller (dashboard lifecycle_action, auto_update::rebuild_agent, manager_server Update, actions::destroy, the actions Spawn task, migrate phase 4). destroy() now takes &Arc<Coordinator> so it can hold a guard. Existing stuck transients clear on the next hive-c0re restart, since transient state is in-memory only.
2026-05-16 19:47:52 +02:00 · 2026-05-16 19:47:52 +02:00 · 313121a6e9
commit 313121a6e9
parent 1a36c38a54
6 changed files with 56 additions and 18 deletions
--- a/hive-c0re/src/auto_update.rs
+++ b/hive-c0re/src/auto_update.rs
@@ -63,7 +63,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
    // lifecycle::rebuild. Dashboard rebuilds already do this via
    // lifecycle_action; this catches the auto-update scan + any
    // other direct caller.
-    coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
+    let _guard = coord.transient_guard(name, crate::coordinator::TransientKind::Rebuilding);
    let result = lifecycle::rebuild(
        name,
        &coord.hyperhive_flake,
@@ -75,7 +75,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
        &coord.operator_pronouns,
    )
    .await;
-    coord.clear_transient(name);
+    drop(_guard);
    match &result {
        Ok(()) => {
            if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {