fix: transient state leak via RAII guard

bare set_transient/clear_transient pairs leak the in-memory transient
on task cancellation, panics, or any early return between the two
calls — dashboard then shows the agent stuck in 'rebuilding…'
forever (coder hit this today). add Coordinator::transient_guard
returning a TransientGuard whose Drop clears, and convert every
caller (dashboard lifecycle_action, auto_update::rebuild_agent,
manager_server Update, actions::destroy, actions Spawn task,
migrate phase 4). destroy() now takes &Arc<Coordinator> so it can
hold a guard. existing stuck transients clear on next hive-c0re
restart since transient state is in-memory only.
This commit is contained in:
müde 2026-05-16 19:47:52 +02:00
parent 1a36c38a54
commit 313121a6e9
6 changed files with 56 additions and 18 deletions

View file

@ -57,6 +57,21 @@ pub struct TransientState {
pub since: std::time::Instant,
}
/// RAII handle returned by `Coordinator::transient_guard`.
///
/// Dropping the guard calls `Coordinator::clear_transient` for the name it
/// was created with, so the transient is released on every exit path of the
/// owning scope: normal return, early return, panic unwind, or a cancelled
/// future. Those are the paths that bare `set_transient` /
/// `clear_transient` pairs leaked through.
///
/// Holds an `Arc<Coordinator>` rather than a borrow, so the guard is freely
/// returnable / movable (for example into a spawned task).
///
/// Bind the guard to a named variable such as `let _guard = ...;`. A guard
/// that is discarded, or bound to the bare `_` pattern, is dropped at once
/// and the transient is cleared straight away.
#[must_use = "the transient is cleared as soon as the guard is dropped"]
pub struct TransientGuard {
    /// Coordinator whose transient entry is cleared on drop.
    coord: Arc<Coordinator>,
    /// Agent name the transient was registered under.
    name: String,
}

impl Drop for TransientGuard {
    /// Clears the transient recorded under `self.name`.
    ///
    /// NOTE(review): the clear is keyed by name only. If a second transient
    /// was set for the same agent while this guard was alive, dropping this
    /// guard presumably removes that newer entry too — confirm callers never
    /// overlap operations on one agent.
    fn drop(&mut self) {
        self.coord.clear_transient(&self.name);
    }
}
#[derive(Debug, Clone, Copy)]
pub enum TransientKind {
/// `lifecycle::spawn` is running (nixos-container create + update + start).
@ -122,6 +137,13 @@ impl Coordinator {
}
/// Mark an agent as in-progress (only one state per agent for now).
///
/// Prefer `transient_guard` when possible — it auto-clears on drop
/// even if the surrounding future is cancelled (HTTP request
/// aborted, runtime shutdown mid-rebuild, panic between set and
/// clear). The bare `set_transient` / `clear_transient` pair leaks
/// the transient on any of those paths and the dashboard then
/// shows the agent stuck in "rebuilding…" forever.
pub fn set_transient(&self, name: &str, kind: TransientKind) {
self.transient.lock().unwrap().insert(
name.to_owned(),
@ -136,6 +158,19 @@ impl Coordinator {
self.transient.lock().unwrap().remove(name);
}
/// Set a transient state and return a guard that clears it on drop.
///
/// Use this from any path where the surrounding future could be cancelled
/// or panic between set and clear (HTTP handlers, spawned tasks). The
/// guard's `Drop` runs on those paths too, so the dashboard's spinner
/// can't get pinned forever.
///
/// The transient is registered via `set_transient` before the guard is
/// built, and the guard keeps its own `Arc` handle to the coordinator plus
/// an owned copy of `name`.
///
/// Keep the returned guard alive for the whole operation, e.g.
/// `let _guard = coord.transient_guard(name, kind);`. Discarding it, or
/// binding it to the bare `_` pattern, clears the transient immediately.
#[must_use = "dropping the guard clears the transient immediately"]
pub fn transient_guard(self: &Arc<Self>, name: &str, kind: TransientKind) -> TransientGuard {
    self.set_transient(name, kind);
    TransientGuard {
        // Refcount bump only; the coordinator itself is not copied.
        coord: Arc::clone(self),
        name: name.to_owned(),
    }
}
/// Return an owned copy of the current transient map.
///
/// The lock on `self.transient` is held only while the map is cloned; the
/// caller gets an independent `HashMap` and never holds the lock.
///
/// Panics if `lock()` returns an error (a poisoned lock, assuming this is a
/// `std::sync::Mutex`).
pub fn transient_snapshot(&self) -> HashMap<String, TransientState> {
    let states = self.transient.lock().unwrap();
    (*states).clone()
}