phase 6: container events + drop the 5s /api/state poll

new DashboardEvent::ContainerStateChanged + ContainerRemoved
close the last refetch loop on the dashboard. Coordinator's
rescan_containers_and_emit diffs a fresh container_view::build_all
against a cached last_containers map and fires per-row events.
it is called from actions::approve (post-spawn), actions::destroy,
the lifecycle_action wrapper, auto_update::rebuild_agent, the
existing 10s crash_watch poll, and the /api/state handler (so a
cold load reads a fresh snapshot).

ContainerView extracted to its own module so coordinator and
dashboard can both build it. dashboard POST endpoints now return
a plain 200 instead of redirecting to /; container-lifecycle forms
carry data-no-refresh. client drops the periodic poll entirely —
initial cold load + SSE for everything afterwards. pending overlay
reads from the existing transientsState since the new event payload
doesn't carry it.

PURG3 + meta-update keep the post-submit refetch since
tombstones + meta_inputs aren't event-derived yet; tracked in
TODO.md.
This commit is contained in:
müde 2026-05-17 22:01:15 +02:00
parent f153639cb4
commit e7ce35c503
11 changed files with 396 additions and 195 deletions

View file

@ -14,7 +14,7 @@ use axum::{
extract::{Path as AxumPath, State},
http::{HeaderMap, StatusCode},
response::{
Html, IntoResponse, Redirect, Response,
Html, IntoResponse, Response,
sse::{Event, KeepAlive, Sse},
},
routing::{get, post},
@ -25,8 +25,9 @@ use tokio_stream::wrappers::BroadcastStream;
use tokio_stream::{Stream, StreamExt};
use crate::actions;
use crate::container_view::{ContainerView, claude_has_session};
use crate::coordinator::Coordinator;
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
use crate::lifecycle::{self, MANAGER_NAME};
const MANAGER_PORT: u16 = 8000;
@ -200,31 +201,6 @@ struct TombstoneView {
has_creds: bool,
}
/// One row describing a live container (the manager or an agent),
/// serialized with serde for the dashboard.
#[derive(Serialize)]
// Each bool below is a separate status flag on the row, so the
// excessive-bools lint is silenced instead of restructuring the type.
#[allow(clippy::struct_excessive_bools)]
struct ContainerView {
/// Logical agent name (no `h-` prefix). Used in action URLs.
name: String,
/// Container name as nixos-container sees it (`h-foo`, `hm1nd`).
container: String,
/// True when this row is the manager container rather than a
/// prefixed agent container.
is_manager: bool,
/// Web port for this container, as computed by
/// `lifecycle::agent_web_port` from the logical name.
port: u16,
/// Whether `lifecycle::is_running` reported the container as running
/// at the time the row was built.
running: bool,
/// True when the agent is behind the current hyperhive flake rev
/// (per `auto_update::agent_needs_update`). Always false when the
/// current rev could not be determined.
needs_update: bool,
/// True for non-manager agents whose claude directory holds no
/// session file yet; never set for the manager.
needs_login: bool,
/// When a lifecycle action is in flight on this container, the kind
/// (`starting`, `stopping`, etc.) so the JS can render a spinner +
/// disable other buttons. Omitted from the serialized output when
/// `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pending: Option<&'static str>,
/// First 12 chars of the sha the meta flake currently has locked
/// for this agent's input. Reflects what's actually deployed; can
/// differ from `applied/<n>/main` only between
/// `meta::prepare_deploy` and `finalize_deploy` (≤ build duration).
/// Omitted from the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
deployed_sha: Option<String>,
}
#[derive(Serialize)]
struct TransientView {
name: String,
@ -303,17 +279,20 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
// to make idempotent, not ours to avoid here.
let seq = state.coord.current_seq();
let raw_containers = log_default("nixos-container list", lifecycle::list().await);
let current_rev = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake);
// Refresh the coordinator's cached container snapshot before
// reading. Cold-load clients then see whatever the latest rescan
// produced; live clients converge via the matching
// `ContainerStateChanged` / `ContainerRemoved` events the rescan
// emits.
state.coord.rescan_containers_and_emit().await;
let containers = state.coord.containers_snapshot().await;
let any_stale = containers.iter().any(|c| c.needs_update);
let transient_snapshot = state.coord.transient_snapshot();
let pending_approvals = gc_orphans(
&state.coord,
log_default("approvals.pending", state.coord.approvals.pending()),
);
let (containers, any_stale) =
build_container_views(&raw_containers, current_rev.as_deref(), &transient_snapshot).await;
let transients = build_transient_views(&raw_containers, &transient_snapshot);
let transients = build_transient_views(&containers, &transient_snapshot);
let approvals = build_approval_views(pending_approvals).await;
let approval_history = log_default(
"approvals.recent_resolved",
@ -370,96 +349,6 @@ fn build_port_conflicts(containers: &[ContainerView]) -> Vec<PortConflict> {
.collect()
}
/// Build `ContainerView`s for every live nixos-container. Returns the
/// list and whether any container is stale (drives the "↻ UPD4TE 4LL"
/// banner).
///
/// * `raw_containers` — container names as listed by nixos-container.
///   Names that are neither `MANAGER_NAME` nor `AGENT_PREFIX`-prefixed
///   are skipped.
/// * `current_rev` — current hyperhive flake rev; when `None`, no row
///   is marked `needs_update`.
/// * `transient_snapshot` — in-flight lifecycle actions keyed by
///   logical agent name; supplies each row's `pending` label.
async fn build_container_views(
    raw_containers: &[String],
    current_rev: Option<&str>,
    transient_snapshot: &std::collections::HashMap<String, crate::coordinator::TransientState>,
) -> (Vec<ContainerView>, bool) {
    let mut out = Vec::with_capacity(raw_containers.len());
    let mut any_stale = false;
    // Read the meta lock file once per call, not once per container.
    let locked = read_meta_locked_revs();
    for c in raw_containers {
        let (logical, is_manager) = if c == MANAGER_NAME {
            (MANAGER_NAME.to_owned(), true)
        } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
            (n.to_owned(), false)
        } else {
            // Not a hyperhive container; leave it off the dashboard.
            continue;
        };
        let needs_update =
            current_rev.is_some_and(|rev| crate::auto_update::agent_needs_update(&logical, rev));
        any_stale |= needs_update;
        let needs_login =
            !is_manager && !claude_has_session(&Coordinator::agent_claude_dir(&logical));
        let pending = transient_snapshot
            .get(&logical)
            .map(|st| transient_label(st.kind));
        // Truncate by characters, not bytes: the rev comes straight from
        // an on-disk JSON file, and byte-slicing (`s[..12]`) would panic
        // if byte 12 fell inside a multi-byte character. For the usual
        // hex sha the result is identical.
        let deployed_sha = locked
            .get(&format!("agent-{logical}"))
            .map(|s| s.chars().take(12).collect::<String>());
        out.push(ContainerView {
            // `port` and `running` borrow `logical`, so they must be
            // evaluated before `name` takes ownership of it below.
            port: lifecycle::agent_web_port(&logical),
            running: lifecycle::is_running(&logical).await,
            container: c.clone(),
            name: logical,
            is_manager,
            needs_update,
            needs_login,
            pending,
            deployed_sha,
        });
    }
    (out, any_stale)
}
/// Locked revisions of the inputs that the **root** node of the meta
/// flake depends on directly (`hyperhive`, `agent-<n>`), keyed by the
/// input alias. The container row uses this to render its
/// `deployed:<sha12>` chip per agent. This is not the same as
/// `read_meta_inputs()`, which walks deeper for the flake-input
/// update form.
///
/// Best-effort: if the lock file is missing, is not valid JSON, or
/// lacks the expected `nodes` / `root` / `inputs` structure, the result
/// is an empty map. Inputs whose target is not a plain string, or whose
/// node has no string `locked.rev`, are left out.
fn read_meta_locked_revs() -> std::collections::HashMap<String, String> {
    let load = || -> Option<std::collections::HashMap<String, String>> {
        let text = std::fs::read_to_string("/var/lib/hyperhive/meta/flake.lock").ok()?;
        let lock: serde_json::Value = serde_json::from_str(&text).ok()?;
        let nodes = lock.get("nodes")?.as_object()?;
        let root = lock.get("root")?.as_str()?;
        let direct_inputs = nodes.get(root)?.get("inputs")?.as_object()?;
        let revs = direct_inputs
            .iter()
            .filter_map(|(alias, target)| {
                // Only plain string targets name a node directly.
                let serde_json::Value::String(node) = target else {
                    return None;
                };
                let rev = nodes.get(node)?.get("locked")?.get("rev")?.as_str()?;
                Some((alias.clone(), rev.to_owned()))
            })
            .collect();
        Some(revs)
    };
    load().unwrap_or_default()
}
#[derive(Serialize, Clone)]
struct MetaInputView {
/// Input key in meta's `flake.nix` — `hyperhive`, `agent-<n>`, etc.
@ -577,16 +466,12 @@ fn walk_meta_inputs(
/// (`Spawning`). Lifecycle ops on existing containers surface as
/// `ContainerView.pending` inline; this list only catches pre-creation.
fn build_transient_views(
raw_containers: &[String],
containers: &[ContainerView],
transient_snapshot: &std::collections::HashMap<String, crate::coordinator::TransientState>,
) -> Vec<TransientView> {
transient_snapshot
.iter()
.filter(|(name, _)| {
!raw_containers
.iter()
.any(|c| c == &format!("{AGENT_PREFIX}{name}") || c == *name)
})
.filter(|(name, _)| !containers.iter().any(|c| &c.name == *name))
.map(|(name, st)| TransientView {
name: name.clone(),
kind: transient_label(st.kind),
@ -1034,7 +919,10 @@ async fn post_purge_tombstone(
.fail_pending_for_agent(&name, "agent state purged");
if errors.is_empty() {
tracing::info!(%name, "tombstone purged");
Redirect::to("/").into_response()
// Tombstones aren't event-derived yet, so the client still
// refetches /api/state to see this one disappear (matching
// form omits `data-no-refresh`).
(StatusCode::OK, "ok").into_response()
} else {
error_response(&format!("purge {name} partial: {}", errors.join(", ")))
}
@ -1086,7 +974,10 @@ async fn post_meta_update(
tokio::spawn(async move {
run_meta_update(&coord, &inputs_clone).await;
});
Redirect::to("/").into_response()
// Background task — each per-agent rebuild emits its own
// `ContainerStateChanged`; the meta inputs panel still relies on
// /api/state freshness (matching form omits `data-no-refresh`).
(StatusCode::OK, "ok").into_response()
}
/// Background task: run `nix flake update <inputs>` in meta + commit,
@ -1260,7 +1151,13 @@ where
match result {
Ok(()) => {
extra(state, &logical);
Redirect::to("/").into_response()
// Rescan so the running/needs_login/needs_update flip on
// the affected row lands on every dashboard's SSE channel
// without waiting for a snapshot poll. 200 + matching
// `data-no-refresh` on the form skip the post-submit
// /api/state refetch.
state.coord.rescan_containers_and_emit().await;
(StatusCode::OK, "ok").into_response()
}
Err(e) => error_response(&format!("{verb} {logical} failed: {e:#}")),
}
@ -1336,7 +1233,8 @@ async fn post_update_all(State(state): State<AppState>) -> Response {
}
}
if errors.is_empty() {
Redirect::to("/").into_response()
// Each rebuild_agent rescanned; no extra refetch needed.
(StatusCode::OK, "ok").into_response()
} else {
error_response(&format!(
"update-all partial failure:\n{}",
@ -1380,8 +1278,11 @@ async fn post_destroy(
) -> Response {
// Checkbox semantics: any non-empty value (axum sends "on") = purge.
let purge = form.purge.as_deref().is_some_and(|v| !v.is_empty());
// `actions::destroy` rescans the container list on success, so the
// `ContainerRemoved` event lands before we return 200. The matching
// form carries `data-no-refresh`.
match actions::destroy(&state.coord, &name, purge).await {
Ok(()) => Redirect::to("/").into_response(),
Ok(()) => (StatusCode::OK, "ok").into_response(),
Err(e) => error_response(&format!("destroy {name} failed: {e:#}")),
}
}
@ -1429,18 +1330,6 @@ fn gc_orphans(coord: &Coordinator, approvals: Vec<Approval>) -> Vec<Approval> {
.collect()
}
/// Reports whether `dir` directly contains at least one regular file.
///
/// Host-side counterpart of `hive_ag3nt::login::has_session`, applied
/// to the agent's bound `~/.claude/` directory so that logins driven
/// from the agent web UI (Phase 8 step 4) show up on the dashboard.
/// A directory that cannot be read (missing, permission denied, ...)
/// counts as "no session". Entries that fail to list, or whose file
/// type cannot be determined, are ignored. Subdirectories are not
/// descended into.
fn claude_has_session(dir: &Path) -> bool {
    match std::fs::read_dir(dir) {
        Err(_) => false,
        Ok(listing) => {
            for entry in listing {
                let Ok(entry) = entry else { continue };
                if matches!(entry.file_type(), Ok(kind) if kind.is_file()) {
                    return true;
                }
            }
            false
        }
    }
}
/// Multi-file unified diff between the currently-deployed tree and
/// the proposal for this approval. Runs against the applied repo