fix: transient state leak via RAII guard

bare set_transient/clear_transient pairs leak the in-memory transient
state on task cancellation, panics, or any early return between the
two calls — the dashboard then shows the agent stuck in 'rebuilding…'
forever (coder hit this today). add Coordinator::transient_guard,
which returns a TransientGuard whose Drop clears the transient, and
convert every caller (dashboard lifecycle_action,
auto_update::rebuild_agent, manager_server Update, actions::destroy,
the actions Spawn task, migrate phase 4). destroy() now takes
&Arc<Coordinator> so it can hold a guard. existing stuck transients
clear on the next hive-c0re restart, since transient state is
in-memory only.
This commit is contained in:
müde 2026-05-16 19:47:52 +02:00
parent 1a36c38a54
commit 313121a6e9
6 changed files with 56 additions and 18 deletions

View file

@ -55,10 +55,14 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
ApprovalKind::Spawn => {
// Run the spawn in the background so the approve POST returns
// immediately. The dashboard reads `transient` to render a spinner.
coord.set_transient(&approval.agent, TransientKind::Spawning);
// Guard is created synchronously here (so the spinner appears
// the moment the operator clicks approve) and moved into the
// task; it auto-clears even if the runtime drops the task.
let coord_bg = coord.clone();
let approval_bg = approval.clone();
let guard = coord_bg.transient_guard(&approval_bg.agent, TransientKind::Spawning);
tokio::spawn(async move {
let guard = guard;
let agent_bg = approval_bg.agent.clone();
let result = lifecycle::spawn(
&approval_bg.agent,
@ -72,7 +76,7 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
&coord_bg.operator_pronouns,
)
.await;
coord_bg.clear_transient(&agent_bg);
drop(guard);
if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) {
tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed");
}
@ -285,17 +289,15 @@ async fn run_apply_commit(
/// anyway. With `purge=true` the persistent trees are also wiped — config
/// history, claude creds, notes — there is no undo.
/// Refuses the manager (declarative; would fight with the host's nixos config).
pub async fn destroy(coord: &Coordinator, name: &str, purge: bool) -> Result<()> {
pub async fn destroy(coord: &Arc<Coordinator>, name: &str, purge: bool) -> Result<()> {
if name == MANAGER_NAME || name == MANAGER_AGENT {
bail!("refusing to destroy the manager ({name})");
}
tracing::info!(%name, purge, "destroy");
coord.set_transient(name, TransientKind::Destroying);
let result = lifecycle::destroy(name).await;
if result.is_err() {
coord.clear_transient(name);
}
result?;
// Guard clears the transient on every early-return / cancellation
// along the way; the success path drops it explicitly further down,
// just before the manager is notified.
let _guard = coord.transient_guard(name, TransientKind::Destroying);
lifecycle::destroy(name).await?;
coord.unregister_agent(name);
let runtime = Coordinator::agent_dir(name);
if runtime.exists() {
@ -329,7 +331,7 @@ pub async fn destroy(coord: &Coordinator, name: &str, purge: bool) -> Result<()>
"agent destroyed"
},
);
coord.clear_transient(name);
drop(_guard);
coord.notify_manager(&HelperEvent::Destroyed {
agent: name.to_owned(),
});