fix: transient state leak via RAII guard
bare set_transient/clear_transient pairs leak the in-memory transient on task cancellation, panics, or any early return between the two calls — dashboard then shows the agent stuck in 'rebuilding…' forever (coder hit this today). add Coordinator::transient_guard returning a TransientGuard whose Drop clears, and convert every caller (dashboard lifecycle_action, auto_update::rebuild_agent, manager_server Update, actions::destroy, actions Spawn task, migrate phase 4). destroy() now takes &Arc<Coordinator> so it can hold a guard. existing stuck transients clear on next hive-c0re restart since transient state is in-memory only.
This commit is contained in:
parent
1a36c38a54
commit
313121a6e9
6 changed files with 56 additions and 18 deletions
|
|
@ -55,10 +55,14 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
|
|||
ApprovalKind::Spawn => {
|
||||
// Run the spawn in the background so the approve POST returns
|
||||
// immediately. The dashboard reads `transient` to render a spinner.
|
||||
coord.set_transient(&approval.agent, TransientKind::Spawning);
|
||||
// Guard is created synchronously here (so the spinner appears
|
||||
// the moment the operator clicks approve) and moved into the
|
||||
// task; it auto-clears even if the runtime drops the task.
|
||||
let coord_bg = coord.clone();
|
||||
let approval_bg = approval.clone();
|
||||
let guard = coord_bg.transient_guard(&approval_bg.agent, TransientKind::Spawning);
|
||||
tokio::spawn(async move {
|
||||
let guard = guard;
|
||||
let agent_bg = approval_bg.agent.clone();
|
||||
let result = lifecycle::spawn(
|
||||
&approval_bg.agent,
|
||||
|
|
@ -72,7 +76,7 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
|
|||
&coord_bg.operator_pronouns,
|
||||
)
|
||||
.await;
|
||||
coord_bg.clear_transient(&agent_bg);
|
||||
drop(guard);
|
||||
if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) {
|
||||
tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed");
|
||||
}
|
||||
|
|
@ -285,17 +289,15 @@ async fn run_apply_commit(
|
|||
/// anyway. With `purge=true` the persistent trees are also wiped — config
|
||||
/// history, claude creds, notes — there is no undo.
|
||||
/// Refuses the manager (declarative; would fight with the host's nixos config).
|
||||
pub async fn destroy(coord: &Coordinator, name: &str, purge: bool) -> Result<()> {
|
||||
pub async fn destroy(coord: &Arc<Coordinator>, name: &str, purge: bool) -> Result<()> {
|
||||
if name == MANAGER_NAME || name == MANAGER_AGENT {
|
||||
bail!("refusing to destroy the manager ({name})");
|
||||
}
|
||||
tracing::info!(%name, purge, "destroy");
|
||||
coord.set_transient(name, TransientKind::Destroying);
|
||||
let result = lifecycle::destroy(name).await;
|
||||
if result.is_err() {
|
||||
coord.clear_transient(name);
|
||||
}
|
||||
result?;
|
||||
// Guard clears the transient on every early return, error, or
|
||||
// cancellation; the success path drops it explicitly further down.
|
||||
let _guard = coord.transient_guard(name, TransientKind::Destroying);
|
||||
lifecycle::destroy(name).await?;
|
||||
coord.unregister_agent(name);
|
||||
let runtime = Coordinator::agent_dir(name);
|
||||
if runtime.exists() {
|
||||
|
|
@ -329,7 +331,7 @@ pub async fn destroy(coord: &Coordinator, name: &str, purge: bool) -> Result<()>
|
|||
"agent destroyed"
|
||||
},
|
||||
);
|
||||
coord.clear_transient(name);
|
||||
drop(_guard);
|
||||
coord.notify_manager(&HelperEvent::Destroyed {
|
||||
agent: name.to_owned(),
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue