From e7ce35c5032aa34408562a3d9c253cbd716a4090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Sun, 17 May 2026 22:01:15 +0200 Subject: [PATCH] phase 6: container events + drop the 5s /api/state poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit new DashboardEvent::ContainerStateChanged + ContainerRemoved close the last refetch loop on the dashboard. Coordinator's rescan_containers_and_emit diffs a fresh container_view::build_all against a cached last_containers map and fires per-row events. called from actions::approve (post-spawn), actions::destroy, the lifecycle_action wrapper, auto_update::rebuild_agent, and the existing 10s crash_watch poll. ContainerView extracted to its own module so coordinator and dashboard can both build it. dashboard endpoints flip to 200; container-lifecycle forms carry data-no-refresh. client drops the periodic poll entirely — initial cold load + SSE for everything afterwards. pending overlay reads from the existing transientsState since the new event payload doesn't carry it. PURG3 + meta-update keep the post-submit refetch since tombstones + meta_inputs aren't event-derived yet; tracked in TODO.md. --- CLAUDE.md | 20 ++++ TODO.md | 3 +- hive-c0re/assets/app.js | 140 +++++++++++++++++------ hive-c0re/src/actions.rs | 7 ++ hive-c0re/src/auto_update.rs | 5 + hive-c0re/src/container_view.rs | 121 ++++++++++++++++++++ hive-c0re/src/coordinator.rs | 72 ++++++++++++ hive-c0re/src/crash_watch.rs | 19 ++-- hive-c0re/src/dashboard.rs | 181 ++++++------------------------ hive-c0re/src/dashboard_events.rs | 22 ++++ hive-c0re/src/main.rs | 1 + 11 files changed, 396 insertions(+), 195 deletions(-) create mode 100644 hive-c0re/src/container_view.rs diff --git a/CLAUDE.md b/CLAUDE.md index 0b2bf15..ab82688 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -193,6 +193,26 @@ Prune freely. domain tooling — the agent flake's `inputs` block pulls the external flake, `agent.nix` references it via `flakeInputs..packages.${pkgs.system}.default`. +- **Just landed:** Phase 6 container events. New + `DashboardEvent::ContainerStateChanged { container }` + + `ContainerRemoved { name }` close the last refetch loop on the + dashboard side. `Coordinator::rescan_containers_and_emit` builds a + fresh `container_view::build_all` snapshot, diffs it against a + cached `last_containers` map, and fires per-row events for the + delta. Called from every mutation site: `actions::approve` + (post-spawn), `actions::destroy`, the `lifecycle_action` wrapper + in `dashboard.rs` (start/stop/restart/rebuild), `auto_update:: + rebuild_agent`, and the existing 10s `crash_watch` poll loop. + `ContainerView` extracted to its own module so coordinator + + dashboard can both build it. Dashboard endpoints (`/restart`, + `/destroy`, `/kill`, `/rebuild`, `/start`, `/update-all`, + `/meta-update`, `/purge-tombstone`) now return 200; matching + forms carry `data-no-refresh` where the event coverage is + complete (purge + meta-update keep the refetch since tombstones + + meta_inputs aren't event-derived yet). Client drops the 5s + periodic `/api/state` poll entirely — initial cold load + SSE + for everything afterwards; pending overlay reads from + `transientsState` since the new event payload doesn't carry it. - **Just landed:** dashboard event refactor. New `hive-fr0nt` workspace crate hosts shared frontend assets (palette + terminal CSS + `window.HiveTerminal.create` JS) so both the dashboard and diff --git a/TODO.md b/TODO.md index fcbd796..86c57b1 100644 --- a/TODO.md +++ b/TODO.md @@ -27,7 +27,8 @@ - Per-agent reminder status (pending, delivered) - Reminder query interface for debugging - Display reminder delivery errors (failed sends, mark failures) -- **Phase 6 leftovers** — event-covered endpoints (`/approve`, `/deny`, `/answer-question`, `/cancel-question`, `/request-spawn`) now return 200 (f559441); the matching forms carry `data-no-refresh` so the post-submit `/api/state` refetch is skipped (the SSE event delivers the update). Container-lifecycle endpoints (`/restart`, `/destroy`, `/kill`, `/rebuild`, `/start`, `/api/{cancel,compact,model,new-session}`, `/meta-update`, `/purge-tombstone`) still need a `ContainerListChanged` event before their redirects can drop — `ContainerView` is currently sourced from external `nixos-container list`, so the 5s poll continues to drive that section. +- **Phase 6 follow-ups** — dashboard side is fully event-driven (Phase 6 leftovers landed); the per-agent web UI's lifecycle endpoints (`/api/{cancel,compact,model,new-session}`, `/login/*`) still 303-redirect-and-poll. Convert them to 200 + `data-no-refresh` so the per-agent page stops refetching `/api/state` on every operator click — `LiveEvent::Note` already covers cancel/compact/model/new-session, login state needs its own `NeedsLogin` / `LoggedIn` events on the per-agent bus. +- **Tombstones + meta_inputs events**: not yet event-derived. PURG3 + meta-update still trigger a post-submit `/api/state` refetch on the dashboard. Add `TombstoneAdded`/`TombstoneRemoved` + `MetaInputsChanged` so those forms can drop their refetch too and the cold-load is the only `/api/state` fetch in normal operation. ## Bugs diff --git a/hive-c0re/assets/app.js b/hive-c0re/assets/app.js index 4d247a7..338aed5 100644 --- a/hive-c0re/assets/app.js +++ b/hive-c0re/assets/app.js @@ -214,6 +214,27 @@ } }); + // Derived container state — cold-loaded from /api/state.containers, + // then mutated live by `container_state_changed` (upsert by name) + // and `container_removed` (drop by name). The coordinator's rescan + // helper fires these after every mutation site + on a periodic poll + // in crash_watch. Keyed by ContainerView.name so the lifecycle + // forms' POST → 200 → matching event flips the row without a + // snapshot refetch. + const containersState = new Map(); + function syncContainersFromSnapshot(s) { + containersState.clear(); + for (const c of s.containers || []) containersState.set(c.name, c); + } + function applyContainerStateChanged(ev) { + if (!ev.container || !ev.container.name) return; + containersState.set(ev.container.name, ev.container); + renderContainersFromState(); + } + function applyContainerRemoved(ev) { + if (containersState.delete(ev.name)) renderContainersFromState(); + } + // Derived transient state — cold-loaded from /api/state.transients, // then mutated live by `transient_set` / `transient_cleared`. Keyed // by agent name so add/remove are O(1). `since_unix` is wall-clock so @@ -251,27 +272,56 @@ if (s) renderContainers(s); } + // Re-derive port conflicts from the live containers map. Mirrors the + // server-side `build_port_conflicts` so the banner reacts to event + // updates instead of waiting for a /api/state refetch. + function derivePortConflicts(containers) { + const byPort = new Map(); + for (const c of containers) { + if (!byPort.has(c.port)) byPort.set(c.port, []); + byPort.get(c.port).push(c.name); + } + const out = []; + for (const [port, agents] of byPort) { + if (agents.length > 1) { + agents.sort(); + out.push({ port, agents }); + } + } + out.sort((a, b) => a.port - b.port); + return out; + } + // ─── state rendering ──────────────────────────────────────────────────── function renderContainers(s) { const root = $('containers-section'); root.innerHTML = ''; + // Containers come from the derived map (event-driven) rather than + // `s.containers`; `s` still supplies hostname (for the web-ui + // link) and tombstones/meta_inputs (not event-derived yet). + const containers = Array.from(containersState.values()) + .sort((a, b) => a.name.localeCompare(b.name)); + const portConflicts = derivePortConflicts(containers); + const anyStale = containers.some((c) => c.needs_update); + // Port-hash collisions: rename one of the listed agents and // rebuild. The banner sits above the agent list so it's the // first thing the operator sees when something's wedged. - if (s.port_conflicts && s.port_conflicts.length) { + if (portConflicts.length) { const banner = el('div', { class: 'port-conflict' }, el('strong', {}, '⚠ port collision'), ' — '); - const groups = s.port_conflicts.map((c) => + const groups = portConflicts.map((c) => `:${c.port} (${c.agents.join(' + ')})`).join('; '); banner.append(groups + '. rename one of each and ↻ R3BU1LD.'); root.append(banner); } - if (s.any_stale) { + if (anyStale) { root.append(form( '/update-all', 'btn-rebuild', '↻ UPD4TE 4LL', 'rebuild every stale container?', + {}, { noRefresh: true }, )); } @@ -290,15 +340,20 @@ root.append(ul); } - if (!s.containers.length && !transientsState.size) { + if (!containers.length && !transientsState.size) { root.append(el('p', { class: 'empty' }, 'no managed containers')); return; } + const hostname = (s && s.hostname) || window.location.hostname; const ul = el('ul', { class: 'containers' }); - for (const c of s.containers) { - const url = `http://${s.hostname}:${c.port}/`; - const li = el('li', { class: 'container-row' + (c.pending ? ' pending' : '') }); + for (const c of containers) { + const url = `http://${hostname}:${c.port}/`; + // Pending state is overlaid from the transient store, not from + // the container row — `ContainerStateChanged` doesn't carry it, + // `TransientSet` / `TransientCleared` do. + const pending = transientsState.get(c.name)?.kind || null; + const li = el('li', { class: 'container-row' + (pending ? ' pending' : '') }); // ── line 1: identity ───────────────────────────────────────── const head = el('div', { class: 'head' }); @@ -307,9 +362,9 @@ el('span', { class: c.is_manager ? 'role role-m1nd' : 'role role-ag3nt' }, c.is_manager ? 'm1nd' : 'ag3nt'), ); - if (c.pending) { + if (pending) { head.append(el('span', { class: 'pending-state' }, - el('span', { class: 'spinner' }, '◐'), ' ', c.pending + '…')); + el('span', { class: 'spinner' }, '◐'), ' ', pending + '…')); } else if (c.needs_login) { head.append(el('a', { class: 'badge badge-warn', href: url, target: '_blank', rel: 'noopener' }, @@ -319,6 +374,7 @@ head.append(form( '/rebuild/' + c.name, 'badge badge-warn btn-inline', 'needs update ↻', 'rebuild ' + c.name + '? hot-reloads the container.', + {}, { noRefresh: true }, )); } head.append(el('span', { class: 'meta' }, `${c.container} :${c.port}`)); @@ -333,29 +389,37 @@ const actions = el('div', { class: 'actions' }); if (c.running) { actions.append( - form('/restart/' + c.name, 'btn-restart', '↺ R3ST4RT', 'restart ' + c.name + '?'), + form('/restart/' + c.name, 'btn-restart', '↺ R3ST4RT', + 'restart ' + c.name + '?', {}, { noRefresh: true }), ); if (!c.is_manager) { actions.append( - form('/kill/' + c.name, 'btn-stop', '■ ST0P', 'stop ' + c.name + '?'), + form('/kill/' + c.name, 'btn-stop', '■ ST0P', + 'stop ' + c.name + '?', {}, { noRefresh: true }), ); } } else { actions.append( - form('/start/' + c.name, 'btn-start', '▶ ST4RT', 'start ' + c.name + '?'), + form('/start/' + c.name, 'btn-start', '▶ ST4RT', + 'start ' + c.name + '?', {}, { noRefresh: true }), ); } actions.append( form('/rebuild/' + c.name, 'btn-rebuild', '↻ R3BU1LD', - 'rebuild ' + c.name + '? hot-reloads the container.'), + 'rebuild ' + c.name + '? hot-reloads the container.', + {}, { noRefresh: true }), ); if (!c.is_manager) { + // DESTR0Y is event-covered (ContainerRemoved); PURG3 also + // wipes tombstone state which isn't event-derived yet, so it + // keeps the post-submit refetch. actions.append( form('/destroy/' + c.name, 'btn-destroy', 'DESTR0Y', - 'destroy ' + c.name + '? container is removed; state + creds kept.'), + 'destroy ' + c.name + '? container is removed; state + creds kept.', + {}, { noRefresh: true }), form('/destroy/' + c.name, 'btn-destroy', 'PURG3', 'PURGE ' + c.name + '? container, config history, claude creds, ' - + 'and /state/ notes are all WIPED. no undo.', { purge: 'on' }), + + 'and notes are all WIPED. no undo.', { purge: 'on' }), ); } li.append(actions); @@ -1088,10 +1152,11 @@ // names from here instead of refetching on every keystroke). window.__hyperhive_state = s; const openDetails = snapshotOpenDetails(); - // Sync transients first so renderContainers below sees the - // current derived map (it reads from `transientsState`, not - // from `s.transients`). + // Sync transients + containers first so renderContainers below + // sees the current derived maps (it reads from + // `transientsState` + `containersState`, not from `s.*`). syncTransientsFromSnapshot(s); + syncContainersFromSnapshot(s); renderContainers(s); renderTombstones(s); // Sync the derived approvals + questions stores from the @@ -1106,18 +1171,20 @@ renderMetaInputs(s); restoreOpenDetails(openDetails); notifyDeltas(s); - // Auto-refresh: fast (2s) while a spawn or a per-container - // action is in flight, otherwise heartbeat (5s) so newly-queued - // approvals from the manager show up without the operator - // having to reload the page. Broker SSE already triggers a - // refresh on operator-bound messages; this catches the rest - // (approvals, tombstones, questions). - const anyPending = s.containers.some((c) => c.pending); - const next = (transientsState.size || anyPending) ? 2000 : 5000; + // No periodic refresh timer. Phase 6 covers every container + // mutation with `ContainerStateChanged` / `ContainerRemoved` + // (lifecycle ops, destroy, rebuild, crash_watch's 10s poll); + // approvals + questions + transients have their own events; + // broker traffic flows through the SSE channel. The only + // /api/state fetches are the initial cold load and the + // post-submit refetch on forms without `data-no-refresh` + // (tombstones, meta-input updates). if (pollTimer) { clearTimeout(pollTimer); pollTimer = null; } - if (next) pollTimer = setTimeout(refreshState, next); } catch (err) { console.error('refreshState failed', err); + // Schedule a single retry on transient errors so the page + // recovers from a brief network blip without making the + // operator reload. pollTimer = setTimeout(refreshState, 5000); } } @@ -1171,6 +1238,8 @@ question_resolved: (ev) => { applyQuestionResolved(ev); }, transient_set: (ev) => { applyTransientSet(ev); }, transient_cleared: (ev) => { applyTransientCleared(ev); }, + container_state_changed: (ev) => { applyContainerStateChanged(ev); }, + container_removed: (ev) => { applyContainerRemoved(ev); }, }, // Both history backfill and live frames flow through here, so the // inbox section ends up populated correctly on first paint and @@ -1208,15 +1277,14 @@ prompt.textContent = stickyTo ? `@${stickyTo}>` : '@—>'; } function knownAgents() { - const s = window.__hyperhive_state; - if (!s || !Array.isArray(s.containers)) return []; - // The broker uses the literal recipient `manager` for the - // manager's inbox, not the container name `hm1nd`. Swap on - // suggestion so `@manager` Just Works. - const names = s.containers.map((c) => (c.is_manager ? 'manager' : c.name)); - // `*` fans out the message to every registered agent (server-side - // broadcast_send). Surface it as a suggestion so operators can - // type `@*` from the dashboard the same way the manager does. + // Read live from the derived containers map so newly-spawned + // agents become addressable without an /api/state refetch. + // Broker uses the literal recipient `manager` for the manager's + // inbox, not the container name `hm1nd`. + const names = Array.from(containersState.values()) + .map((c) => (c.is_manager ? 'manager' : c.name)); + // `*` fans out to every registered agent (server-side + // broadcast_send). names.unshift('*'); return names; } diff --git a/hive-c0re/src/actions.rs b/hive-c0re/src/actions.rs index 8dbbfdf..32ce5af 100644 --- a/hive-c0re/src/actions.rs +++ b/hive-c0re/src/actions.rs @@ -85,6 +85,10 @@ pub async fn approve(coord: Arc, id: i64) -> Result<()> { if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) { tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed"); } + // New container row appeared (or didn't, on failure + // before nixos-container create completed) — rescan so + // dashboards reflect the post-spawn state. + coord_bg.rescan_containers_and_emit().await; }); Ok(()) } @@ -355,6 +359,9 @@ pub async fn destroy(coord: &Arc, name: &str, purge: bool) -> Resul coord.notify_manager(&HelperEvent::Destroyed { agent: name.to_owned(), }); + // Container row disappeared — rescan so the dashboard fires + // `ContainerRemoved` for the gone row. + coord.rescan_containers_and_emit().await; Ok(()) } diff --git a/hive-c0re/src/auto_update.rs b/hive-c0re/src/auto_update.rs index 2b2eb6e..4c714d3 100644 --- a/hive-c0re/src/auto_update.rs +++ b/hive-c0re/src/auto_update.rs @@ -95,6 +95,10 @@ pub async fn rebuild_agent(coord: &Arc, name: &str, current_rev: &s // dashboard's meta-input update path — all of which // route through rebuild_agent. coord.kick_agent(name, "container rebuilt"); + // Container state (needs_update, deployed_sha) may have + // shifted — rescan so dashboards drop the "needs update" + // chip without waiting for the next /api/state poll. + coord.rescan_containers_and_emit().await; } Err(e) => { coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt { @@ -104,6 +108,7 @@ pub async fn rebuild_agent(coord: &Arc, name: &str, current_rev: &s sha: None, tag: None, }); + coord.rescan_containers_and_emit().await; } } result diff --git a/hive-c0re/src/container_view.rs b/hive-c0re/src/container_view.rs new file mode 100644 index 0000000..869eea8 --- /dev/null +++ b/hive-c0re/src/container_view.rs @@ -0,0 +1,121 @@ +//! `ContainerView` + the snapshot builder that turns +//! `nixos-container list` (plus per-agent state on disk) into the row +//! shape the dashboard renders. Extracted from `dashboard.rs` so the +//! coordinator's rescan-and-emit helper can build the same view and +//! diff against the last snapshot to fire +//! `ContainerStateChanged` / `ContainerRemoved` events. + +use std::collections::HashMap; +use std::path::Path; + +use serde::Serialize; + +use crate::coordinator::Coordinator; +use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; + +#[derive(Serialize, Clone, PartialEq, Eq, Debug)] +#[allow(clippy::struct_excessive_bools)] +pub struct ContainerView { + /// Logical agent name (no `h-` prefix). Used in action URLs. + pub name: String, + /// Container name as nixos-container sees it (`h-foo`, `hm1nd`). + pub container: String, + pub is_manager: bool, + pub port: u16, + pub running: bool, + pub needs_update: bool, + pub needs_login: bool, + /// First 12 chars of the sha the meta flake currently has locked + /// for this agent's input. + #[serde(skip_serializing_if = "Option::is_none")] + pub deployed_sha: Option, +} + +/// Build the full container list. Wraps `lifecycle::list()` and +/// resolves every per-agent attribute the dashboard surfaces. +pub async fn build_all(coord: &Coordinator) -> Vec { + let raw = lifecycle::list().await.unwrap_or_default(); + let current_rev = crate::auto_update::current_flake_rev(&coord.hyperhive_flake); + let locked = read_meta_locked_revs(); + let mut out = Vec::new(); + for c in &raw { + let (logical, is_manager) = if c == MANAGER_NAME { + (MANAGER_NAME.to_owned(), true) + } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) { + (n.to_owned(), false) + } else { + continue; + }; + let needs_update = + current_rev.as_deref().is_some_and(|rev| crate::auto_update::agent_needs_update(&logical, rev)); + let needs_login = + !is_manager && !claude_has_session(&Coordinator::agent_claude_dir(&logical)); + let deployed_sha = locked + .get(&format!("agent-{logical}")) + .map(|s| s[..s.len().min(12)].to_owned()); + out.push(ContainerView { + port: lifecycle::agent_web_port(&logical), + running: lifecycle::is_running(&logical).await, + container: c.clone(), + name: logical, + is_manager, + needs_update, + needs_login, + deployed_sha, + }); + } + out +} + +/// Host-side mirror of `hive_ag3nt::login::has_session`. Returns true +/// if the agent's bound `~/.claude/` dir on disk contains any regular +/// file. Reads each `build_all()` so a login driven from the agent's +/// own web UI reflects on the next snapshot. +pub fn claude_has_session(dir: &Path) -> bool { + let Ok(entries) = std::fs::read_dir(dir) else { + return false; + }; + entries + .flatten() + .any(|e| e.file_type().is_ok_and(|t| t.is_file())) +} + +/// Map of `agent-` → locked sha from meta's flake.lock. Used to +/// render the `deployed:` chip per container row. +fn read_meta_locked_revs() -> HashMap { + let mut out = HashMap::new(); + let Ok(raw) = std::fs::read_to_string("/var/lib/hyperhive/meta/flake.lock") else { + return out; + }; + let Ok(json) = serde_json::from_str::(&raw) else { + return out; + }; + let Some(nodes) = json.get("nodes").and_then(|v| v.as_object()) else { + return out; + }; + let Some(root_name) = json.get("root").and_then(|v| v.as_str()) else { + return out; + }; + let Some(root_inputs) = nodes + .get(root_name) + .and_then(|n| n.get("inputs")) + .and_then(|v| v.as_object()) + else { + return out; + }; + for alias in root_inputs.keys() { + let target_name = match root_inputs.get(alias) { + Some(serde_json::Value::String(s)) => s.clone(), + _ => continue, + }; + if let Some(rev) = nodes + .get(&target_name) + .and_then(|n| n.get("locked")) + .and_then(|v| v.get("rev")) + .and_then(|v| v.as_str()) + { + out.insert(alias.clone(), rev.to_owned()); + } + } + out +} diff --git a/hive-c0re/src/coordinator.rs b/hive-c0re/src/coordinator.rs index 35c486f..892e417 100644 --- a/hive-c0re/src/coordinator.rs +++ b/hive-c0re/src/coordinator.rs @@ -13,6 +13,7 @@ use tokio::sync::broadcast; use crate::agent_server::{self, AgentSocket}; use crate::approvals::Approvals; use crate::broker::Broker; +use crate::container_view::{self, ContainerView}; use crate::dashboard_events::DashboardEvent; use crate::operator_questions::OperatorQuestions; @@ -64,6 +65,14 @@ pub struct Coordinator { /// snapshot. dashboard_events: broadcast::Sender, event_seq: AtomicU64, + /// Last container snapshot seen by `rescan_containers_and_emit`, + /// keyed by `ContainerView.name`. The rescan diffs a fresh + /// `container_view::build_all` against this map and emits one + /// `ContainerStateChanged` per added/changed row and one + /// `ContainerRemoved` per disappeared row. Async — guarded by a + /// tokio mutex so the rescan can `await` `lifecycle::list` / + /// `is_running` without blocking other coordinator paths. + last_containers: tokio::sync::Mutex>, } /// Per-agent in-progress state that the dashboard surfaces between approve @@ -142,6 +151,7 @@ impl Coordinator { transient: Mutex::new(HashMap::new()), dashboard_events, event_seq: AtomicU64::new(0), + last_containers: tokio::sync::Mutex::new(HashMap::new()), }) } @@ -291,6 +301,68 @@ impl Coordinator { }); } + /// Rebuild the per-container snapshot, diff it against the last + /// one cached on `self`, and emit one + /// `DashboardEvent::ContainerStateChanged` per added/changed row + /// and one `DashboardEvent::ContainerRemoved` per disappeared row. + /// Call after any mutation that could affect what + /// `nixos-container list` returns or what a row's + /// `running` / `needs_update` / `needs_login` / `deployed_sha` + /// resolves to — lifecycle ops, destroy, approve (post-spawn), + /// rebuild, meta-update, and the crash-watcher's periodic poll. + /// Cheap when nothing changed (one `nixos-container list` + a + /// HashMap diff + zero emits). + pub async fn rescan_containers_and_emit(self: &Arc) { + let fresh = container_view::build_all(self).await; + let mut last = self.last_containers.lock().await; + let mut changed_or_new = Vec::new(); + let mut removed = Vec::new(); + // Diff into change vs. add. + for view in &fresh { + match last.get(&view.name) { + Some(prev) if prev == view => {} // unchanged + _ => changed_or_new.push(view.clone()), + } + } + // Anything in `last` but not in `fresh` is gone. + let fresh_names: std::collections::HashSet<&str> = + fresh.iter().map(|c| c.name.as_str()).collect(); + for name in last.keys() { + if !fresh_names.contains(name.as_str()) { + removed.push(name.clone()); + } + } + // Rebuild the cache from the fresh snapshot. + last.clear(); + for c in fresh { + last.insert(c.name.clone(), c); + } + drop(last); + for c in changed_or_new { + self.emit_dashboard_event(DashboardEvent::ContainerStateChanged { + seq: self.next_seq(), + container: c, + }); + } + for name in removed { + self.emit_dashboard_event(DashboardEvent::ContainerRemoved { + seq: self.next_seq(), + name, + }); + } + } + + /// Read-only snapshot of the last cached container view. Used by + /// `/api/state` to cold-load page-open clients without re-running + /// `nixos-container list` themselves; the + /// `rescan_containers_and_emit` calls keep this fresh. + pub async fn containers_snapshot(&self) -> Vec { + let last = self.last_containers.lock().await; + let mut out: Vec = last.values().cloned().collect(); + out.sort_by(|a, b| a.name.cmp(&b.name)); + out + } + pub fn register_agent(self: &Arc, name: &str) -> Result { // Idempotent: drop any existing listener so re-registration (e.g. on rebuild, // or after a hive-c0re restart cleared /run/hyperhive) gets a fresh socket. diff --git a/hive-c0re/src/crash_watch.rs b/hive-c0re/src/crash_watch.rs index 66c3600..724bd8e 100644 --- a/hive-c0re/src/crash_watch.rs +++ b/hive-c0re/src/crash_watch.rs @@ -16,10 +16,10 @@ //! but polling is simpler and a 10s detection delay is fine. use std::collections::HashSet; -use std::path::Path; use std::sync::Arc; use std::time::Duration; +use crate::container_view::claude_has_session; use crate::coordinator::{Coordinator, TransientKind}; use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; @@ -69,6 +69,12 @@ pub fn spawn(coord: Arc) { emit_login_transitions(&coord, &prev_logged_in, ¤t_logged_in, &sub_agents); emit_update_transitions(&coord, &prev_updated, ¤t_updated, &sub_agents); } + // Periodic container rescan — catches state flips that + // happen outside our mutation surface (operator runs + // `nixos-container stop` over ssh, agent logs in via its + // own web UI, etc.) so the dashboard converges within one + // POLL_INTERVAL. Idempotent + cheap when nothing changed. + coord.rescan_containers_and_emit().await; prev_running = current_running; prev_logged_in = current_logged_in; prev_updated = current_updated; @@ -163,14 +169,3 @@ fn emit_update_transitions( }); } } - -/// Mirrors `dashboard::claude_has_session`. Lives here too so the -/// watcher doesn't depend on dashboard internals. -fn claude_has_session(dir: &Path) -> bool { - let Ok(entries) = std::fs::read_dir(dir) else { - return false; - }; - entries - .flatten() - .any(|e| e.file_type().is_ok_and(|t| t.is_file())) -} diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index 8054fff..0810e1e 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -14,7 +14,7 @@ use axum::{ extract::{Path as AxumPath, State}, http::{HeaderMap, StatusCode}, response::{ - Html, IntoResponse, Redirect, Response, + Html, IntoResponse, Response, sse::{Event, KeepAlive, Sse}, }, routing::{get, post}, @@ -25,8 +25,9 @@ use tokio_stream::wrappers::BroadcastStream; use tokio_stream::{Stream, StreamExt}; use crate::actions; +use crate::container_view::{ContainerView, claude_has_session}; use crate::coordinator::Coordinator; -use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; +use crate::lifecycle::{self, MANAGER_NAME}; const MANAGER_PORT: u16 = 8000; @@ -200,31 +201,6 @@ struct TombstoneView { has_creds: bool, } -#[derive(Serialize)] -#[allow(clippy::struct_excessive_bools)] -struct ContainerView { - /// Logical agent name (no `h-` prefix). Used in action URLs. - name: String, - /// Container name as nixos-container sees it (`h-foo`, `hm1nd`). - container: String, - is_manager: bool, - port: u16, - running: bool, - needs_update: bool, - needs_login: bool, - /// When a lifecycle action is in flight on this container, the kind - /// (`starting`, `stopping`, etc.) so the JS can render a spinner + - /// disable other buttons. - #[serde(skip_serializing_if = "Option::is_none")] - pending: Option<&'static str>, - /// First 12 chars of the sha the meta flake currently has locked - /// for this agent's input. Reflects what's actually deployed; can - /// differ from `applied//main` only between - /// `meta::prepare_deploy` and `finalize_deploy` (≤ build duration). - #[serde(skip_serializing_if = "Option::is_none")] - deployed_sha: Option, -} - #[derive(Serialize)] struct TransientView { name: String, @@ -303,17 +279,20 @@ async fn api_state(headers: HeaderMap, State(state): State) -> axum::J // to make idempotent, not ours to avoid here. let seq = state.coord.current_seq(); - let raw_containers = log_default("nixos-container list", lifecycle::list().await); - let current_rev = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake); + // Refresh the coordinator's cached container snapshot before + // reading. Cold-load clients then see whatever the latest rescan + // produced; live clients converge via the matching + // `ContainerStateChanged` / `ContainerRemoved` events the rescan + // emits. + state.coord.rescan_containers_and_emit().await; + let containers = state.coord.containers_snapshot().await; + let any_stale = containers.iter().any(|c| c.needs_update); let transient_snapshot = state.coord.transient_snapshot(); let pending_approvals = gc_orphans( &state.coord, log_default("approvals.pending", state.coord.approvals.pending()), ); - - let (containers, any_stale) = - build_container_views(&raw_containers, current_rev.as_deref(), &transient_snapshot).await; - let transients = build_transient_views(&raw_containers, &transient_snapshot); + let transients = build_transient_views(&containers, &transient_snapshot); let approvals = build_approval_views(pending_approvals).await; let approval_history = log_default( "approvals.recent_resolved", @@ -370,96 +349,6 @@ fn build_port_conflicts(containers: &[ContainerView]) -> Vec { .collect() } -/// Build `ContainerView`s for every live nixos-container. Returns the -/// list and whether any container is stale (drives the "↻ UPD4TE 4LL" -/// banner). -async fn build_container_views( - raw_containers: &[String], - current_rev: Option<&str>, - transient_snapshot: &std::collections::HashMap, -) -> (Vec, bool) { - let mut out = Vec::new(); - let mut any_stale = false; - let locked = read_meta_locked_revs(); - for c in raw_containers { - let (logical, is_manager) = if c == MANAGER_NAME { - (MANAGER_NAME.to_owned(), true) - } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) { - (n.to_owned(), false) - } else { - continue; - }; - let needs_update = - current_rev.is_some_and(|rev| crate::auto_update::agent_needs_update(&logical, rev)); - if needs_update { - any_stale = true; - } - let needs_login = - !is_manager && !claude_has_session(&Coordinator::agent_claude_dir(&logical)); - let pending = transient_snapshot - .get(&logical) - .map(|st| transient_label(st.kind)); - let deployed_sha = locked - .get(&format!("agent-{logical}")) - .map(|s| s[..s.len().min(12)].to_owned()); - out.push(ContainerView { - port: lifecycle::agent_web_port(&logical), - running: lifecycle::is_running(&logical).await, - container: c.clone(), - name: logical, - is_manager, - needs_update, - needs_login, - pending, - deployed_sha, - }); - } - (out, any_stale) -} - -/// Map of node name → locked sha for nodes the **root** of meta -/// directly depends on (`hyperhive`, `agent-`). Used by the -/// container row to render its `deployed:` chip per agent. -/// Distinct from `read_meta_inputs()` which walks deeper for the -/// flake-input update form. -fn read_meta_locked_revs() -> std::collections::HashMap { - let mut out = std::collections::HashMap::new(); - let Ok(raw) = std::fs::read_to_string("/var/lib/hyperhive/meta/flake.lock") else { - return out; - }; - let Ok(json) = serde_json::from_str::(&raw) else { - return out; - }; - let Some(nodes) = json.get("nodes").and_then(|v| v.as_object()) else { - return out; - }; - let Some(root_name) = json.get("root").and_then(|v| v.as_str()) else { - return out; - }; - let Some(root_inputs) = nodes - .get(root_name) - .and_then(|n| n.get("inputs")) - .and_then(|v| v.as_object()) - else { - return out; - }; - for alias in root_inputs.keys() { - let target_name = match root_inputs.get(alias) { - Some(serde_json::Value::String(s)) => s.clone(), - _ => continue, - }; - if let Some(rev) = nodes - .get(&target_name) - .and_then(|n| n.get("locked")) - .and_then(|v| v.get("rev")) - .and_then(|v| v.as_str()) - { - out.insert(alias.clone(), rev.to_owned()); - } - } - out -} - #[derive(Serialize, Clone)] struct MetaInputView { /// Input key in meta's `flake.nix` — `hyperhive`, `agent-`, etc. @@ -577,16 +466,12 @@ fn walk_meta_inputs( /// (`Spawning`). Lifecycle ops on existing containers surface as /// `ContainerView.pending` inline; this list only catches pre-creation. fn build_transient_views( - raw_containers: &[String], + containers: &[ContainerView], transient_snapshot: &std::collections::HashMap, ) -> Vec { transient_snapshot .iter() - .filter(|(name, _)| { - !raw_containers - .iter() - .any(|c| c == &format!("{AGENT_PREFIX}{name}") || c == *name) - }) + .filter(|(name, _)| !containers.iter().any(|c| &c.name == *name)) .map(|(name, st)| TransientView { name: name.clone(), kind: transient_label(st.kind), @@ -1034,7 +919,10 @@ async fn post_purge_tombstone( .fail_pending_for_agent(&name, "agent state purged"); if errors.is_empty() { tracing::info!(%name, "tombstone purged"); - Redirect::to("/").into_response() + // Tombstones aren't event-derived yet, so the client still + // refetches /api/state to see this one disappear (matching + // form omits `data-no-refresh`). + (StatusCode::OK, "ok").into_response() } else { error_response(&format!("purge {name} partial: {}", errors.join(", "))) } @@ -1086,7 +974,10 @@ async fn post_meta_update( tokio::spawn(async move { run_meta_update(&coord, &inputs_clone).await; }); - Redirect::to("/").into_response() + // Background task — each per-agent rebuild emits its own + // `ContainerStateChanged`; the meta inputs panel still relies on + // /api/state freshness (matching form omits `data-no-refresh`). + (StatusCode::OK, "ok").into_response() } /// Background task: run `nix flake update ` in meta + commit, @@ -1260,7 +1151,13 @@ where match result { Ok(()) => { extra(state, &logical); - Redirect::to("/").into_response() + // Rescan so the running/needs_login/needs_update flip on + // the affected row lands on every dashboard's SSE channel + // without waiting for a snapshot poll. 200 + matching + // `data-no-refresh` on the form skip the post-submit + // /api/state refetch. + state.coord.rescan_containers_and_emit().await; + (StatusCode::OK, "ok").into_response() } Err(e) => error_response(&format!("{verb} {logical} failed: {e:#}")), } @@ -1336,7 +1233,8 @@ async fn post_update_all(State(state): State) -> Response { } } if errors.is_empty() { - Redirect::to("/").into_response() + // Each rebuild_agent rescanned; no extra refetch needed. + (StatusCode::OK, "ok").into_response() } else { error_response(&format!( "update-all partial failure:\n{}", @@ -1380,8 +1278,11 @@ async fn post_destroy( ) -> Response { // Checkbox semantics: any non-empty value (axum sends "on") = purge. let purge = form.purge.as_deref().is_some_and(|v| !v.is_empty()); + // `actions::destroy` rescans the container list on success, so the + // `ContainerRemoved` event lands before we return 200. The matching + // form carries `data-no-refresh`. match actions::destroy(&state.coord, &name, purge).await { - Ok(()) => Redirect::to("/").into_response(), + Ok(()) => (StatusCode::OK, "ok").into_response(), Err(e) => error_response(&format!("destroy {name} failed: {e:#}")), } } @@ -1429,18 +1330,6 @@ fn gc_orphans(coord: &Coordinator, approvals: Vec) -> Vec { .collect() } -/// Host-side mirror of `hive_ag3nt::login::has_session`. Returns true if the -/// agent's bound `~/.claude/` dir on disk contains any regular file. The -/// dashboard reads this each render so logins driven from the agent web UI -/// (Phase 8 step 4) reflect within one auto-refresh cycle. -fn claude_has_session(dir: &Path) -> bool { - let Ok(entries) = std::fs::read_dir(dir) else { - return false; - }; - entries - .flatten() - .any(|e| e.file_type().is_ok_and(|t| t.is_file())) -} /// Multi-file unified diff between the currently-deployed tree and /// the proposal for this approval. Runs against the applied repo diff --git a/hive-c0re/src/dashboard_events.rs b/hive-c0re/src/dashboard_events.rs index 4426108..ac508bc 100644 --- a/hive-c0re/src/dashboard_events.rs +++ b/hive-c0re/src/dashboard_events.rs @@ -25,6 +25,8 @@ use serde::Serialize; +use crate::container_view::ContainerView; + #[derive(Debug, Clone, Serialize)] #[serde(rename_all = "snake_case", tag = "kind")] pub enum DashboardEvent { @@ -121,4 +123,24 @@ pub enum DashboardEvent { /// The matching lifecycle action resolved (success or failure). /// Clients drop the spinner row. TransientCleared { seq: u64, name: String }, + /// One container row changed — new container appeared (post-spawn + /// finalise), an existing one flipped running/needs_update/sha, + /// etc. Clients upsert by `container.name`. Payload carries the + /// full row so cold-loaded clients and event-driven clients + /// converge on the same render. + /// + /// Fired by `Coordinator::rescan_containers_and_emit`, which diffs + /// a fresh `nixos-container list`–derived snapshot against the + /// last one cached on the coordinator. Mutation sites (lifecycle + /// endpoints, actions::destroy / approve, crash_watch's poll loop) + /// call the rescan after their work lands. + ContainerStateChanged { + seq: u64, + container: ContainerView, + }, + /// A container that was in the previous snapshot is gone. Clients + /// drop the row by name. Fired alongside any + /// `nixos-container destroy` (operator-driven or otherwise) on the + /// next rescan. + ContainerRemoved { seq: u64, name: String }, } diff --git a/hive-c0re/src/main.rs b/hive-c0re/src/main.rs index 6c10a6e..a8ed2e8 100644 --- a/hive-c0re/src/main.rs +++ b/hive-c0re/src/main.rs @@ -11,6 +11,7 @@ mod approvals; mod auto_update; mod broker; mod client; +mod container_view; mod coordinator; mod crash_watch; mod dashboard;