dashboard: K3PT ST4T3 section + agent links open in new tab

new section between containers and questions: lists every name with a
state dir under /var/lib/hyperhive/agents/ that doesn't correspond to
a live container. shows state size + last-modified age + whether
claude creds are kept. two actions per row:

- R3V1V3 — queues a spawn approval with the same name (operator
  approves to recreate; spawn flow reuses prior config + claude
  creds, no re-login needed)
- PURG3 — wipes the agent's state + applied dirs (POSTs to the
  /purge-tombstone/{name} endpoint, which refuses if a live container
  with that name still exists)

dashboard also opens agent links in new tabs now (target=_blank +
rel=noopener) so the operator's overview tab stays put when they
dive into an agent.
This commit is contained in:
müde 2026-05-15 19:55:27 +02:00
parent 8344dd9ab7
commit 5ee65d2f15
6 changed files with 212 additions and 3 deletions

View file

@ -51,6 +51,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/rebuild/{name}", post(post_rebuild))
.route("/update-all", post(post_update_all))
.route("/answer-question/{id}", post(post_answer_question))
.route("/purge-tombstone/{name}", post(post_purge_tombstone))
.route("/request-spawn", post(post_request_spawn))
.route("/messages/stream", get(messages_stream))
.with_state(AppState { coord });
@ -106,6 +107,21 @@ struct StateSnapshot {
/// we mark the row answered and fire `HelperEvent::OperatorAnswered`
/// into the manager's inbox.
questions: Vec<crate::operator_questions::OpQuestion>,
/// State dirs (config history + claude creds + /state/ notes) that
/// survive after a destroy-without-purge. The operator can re-spawn
/// with the same name to resume, or PURG3 to wipe them.
tombstones: Vec<TombstoneView>,
}
/// One dashboard row describing an agent whose state dir is still on disk
/// but whose name is not in the live/transient container set (and is not
/// the manager). Serialized into the `tombstones` list of the state
/// snapshot.
#[derive(Serialize)]
struct TombstoneView {
/// Agent name, i.e. the state-dir name this row describes.
name: String,
/// Bytes used by the state dir tree. Cheap-ish to compute; lets the
/// operator know how much they're holding onto.
state_bytes: u64,
/// Mtime (unix seconds) of the state dir; rough "last seen". Reported as
/// 0 when the mtime cannot be read or converted.
last_seen: i64,
/// Whether `claude_has_session` reports a session in the agent's claude
/// dir.
has_creds: bool,
}
#[derive(Serialize)]
@ -145,6 +161,7 @@ struct ApprovalView {
diff_html: Option<String>,
}
#[allow(clippy::too_many_lines)]
async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::Json<StateSnapshot> {
let host = headers
.get("host")
@ -242,6 +259,35 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
.unwrap_or_default();
let questions = state.coord.questions.pending().unwrap_or_default();
// Tombstones: state-dir names that don't appear in the live container
// list (and aren't the manager). Operator can re-spawn or PURG3.
let live: std::collections::HashSet<String> = containers
.iter()
.map(|c| c.name.clone())
.chain(state.coord.transient_snapshot().into_keys())
.collect();
let tombstones: Vec<TombstoneView> = Coordinator::kept_state_names()
.into_iter()
.filter(|name| name != MANAGER_NAME && !live.contains(name))
.map(|name| {
let root = Coordinator::agent_state_root(&name);
let state_bytes = dir_size_bytes(&root);
let last_seen = std::fs::metadata(&root)
.and_then(|m| m.modified())
.ok()
.and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
.and_then(|d| i64::try_from(d.as_secs()).ok())
.unwrap_or(0);
let has_creds = claude_has_session(&Coordinator::agent_claude_dir(&name));
TombstoneView {
name,
state_bytes,
last_seen,
has_creds,
}
})
.collect();
axum::Json(StateSnapshot {
hostname,
manager_port: MANAGER_PORT,
@ -251,9 +297,33 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
approvals: approval_views,
operator_inbox,
questions,
tombstones,
})
}
/// Total length, in bytes, of all regular files found beneath `root`,
/// descending into subdirectories.
///
/// The walk is best-effort: a directory that cannot be listed, or an entry
/// whose type or metadata cannot be read, is skipped and contributes
/// nothing. A missing or unreadable `root` therefore yields 0. Symlinks are
/// neither followed nor counted.
///
/// Agent state (config repo + claude creds + notes file) is normally only a
/// few MB, so doing this inline on every /api/state request is acceptable.
fn dir_size_bytes(root: &Path) -> u64 {
    let mut total = 0u64;
    // Directories still waiting to be listed; replaces recursion.
    let mut pending = vec![root.to_path_buf()];

    while let Some(dir) = pending.pop() {
        let Ok(entries) = std::fs::read_dir(&dir) else {
            continue;
        };
        for entry in entries.flatten() {
            match entry.file_type() {
                Ok(kind) if kind.is_dir() => pending.push(entry.path()),
                Ok(kind) if kind.is_file() => {
                    if let Ok(meta) = entry.metadata() {
                        total += meta.len();
                    }
                }
                // Symlinks, special files, and unreadable entries are ignored.
                _ => {}
            }
        }
    }

    total
}
async fn messages_stream(
State(state): State<AppState>,
) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
@ -316,6 +386,48 @@ async fn post_answer_question(
}
}
/// Handler for `POST /purge-tombstone/{name}`: deletes the kept state dir and
/// applied dir of an agent that no longer has a container.
///
/// The request is refused with an error response when:
/// - `name` is not exactly one plain path component (empty, `.`, `..`, or
///   containing a separator). The name is joined into filesystem paths that
///   are then removed recursively, so anything else could escape the agent's
///   own directory;
/// - `name` is the manager;
/// - the container list cannot be obtained (the "still live?" check must not
///   fail open), or it contains `name`, bare or prefixed with
///   `lifecycle::AGENT_PREFIX`.
///
/// On success the client is redirected to `/`. If a directory could not be
/// removed, the error response lists each failure; directories that are
/// already absent are not treated as failures.
async fn post_purge_tombstone(
    State(state): State<AppState>,
    AxumPath(name): AxumPath<String>,
) -> Response {
    /// True when `name` is a single normal path component that round-trips
    /// unchanged. Rejects the empty string, `.`, `..`, embedded or trailing
    /// separators, and rooted/prefixed paths.
    fn is_single_path_component(name: &str) -> bool {
        let mut parts = std::path::Path::new(name).components();
        match (parts.next(), parts.next()) {
            (Some(std::path::Component::Normal(part)), None) => part.to_str() == Some(name),
            _ => false,
        }
    }

    // The path parameter is percent-decoded request input; validate it before
    // it gets anywhere near `remove_dir_all`.
    if !is_single_path_component(&name) {
        return error_response(&format!(
            "refusing to purge {name:?}: not a valid agent name"
        ));
    }
    if name == lifecycle::MANAGER_NAME {
        return error_response("refusing to purge the manager's state");
    }

    // Sanity: refuse to purge if a live container still exists with this
    // name. The dashboard already filters tombstones to non-live names,
    // but the operator could send a stale POST. If the listing itself fails
    // we cannot prove the name is dead, so refuse rather than assume.
    let live = match lifecycle::list().await {
        Ok(live) => live,
        Err(e) => {
            return error_response(&format!(
                "refusing to purge {name}: could not list containers: {e:#}"
            ));
        }
    };
    let prefixed = format!("{}{name}", lifecycle::AGENT_PREFIX);
    if live.iter().any(|c| c == &prefixed || c == &name) {
        return error_response(&format!(
            "refusing to purge {name}: container still exists — use DESTR0Y first"
        ));
    }

    let mut errors = Vec::new();
    for dir in [
        Coordinator::agent_state_root(&name),
        Coordinator::agent_applied_dir(&name),
    ] {
        match std::fs::remove_dir_all(&dir) {
            Ok(()) => {}
            // Already gone is the state we wanted; not worth reporting.
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => errors.push(format!("{}: {e}", dir.display())),
        }
    }

    // Best-effort: fail any approvals still pending for the purged agent.
    // The outcome is intentionally ignored.
    let _ = state
        .coord
        .approvals
        .fail_pending_for_agent(&name, "agent state purged");

    if errors.is_empty() {
        tracing::info!(%name, "tombstone purged");
        Redirect::to("/").into_response()
    } else {
        error_response(&format!("purge {name} partial: {}", errors.join(", ")))
    }
}
async fn post_request_spawn(
State(state): State<AppState>,
Form(form): Form<RequestSpawnForm>,