new DashboardEvent::ContainerStateChanged + ContainerRemoved close the last refetch loop on the dashboard. Coordinator's rescan_containers_and_emit diffs a fresh container_view::build_all against a cached last_containers map and fires per-row events. called from actions::approve (post-spawn), actions::destroy, the lifecycle_action wrapper, auto_update::rebuild_agent, and the existing 10s crash_watch poll. ContainerView extracted to its own module so coordinator and dashboard can both build it. dashboard endpoints flip to 200; container-lifecycle forms carry data-no-refresh. client drops the periodic poll entirely — initial cold load + SSE for everything afterwards. pending overlay reads from the existing transientsState since the new event payload doesn't carry it. PURG3 + meta-update keep the post-submit refetch since tombstones + meta_inputs aren't event-derived yet; tracked in TODO.md.
171 lines
6.7 KiB
Rust
171 lines
6.7 KiB
Rust
//! Per-container state watcher. Polls every managed container on a
|
|
//! fixed interval, tracks three orthogonal state-sets across ticks,
|
|
//! and emits a `HelperEvent` to the manager on each transition:
|
|
//!
|
|
//! - **running**: container is up. running → stopped without an
|
|
//! operator-initiated transient (`Stopping` / `Restarting` /
|
|
//! `Destroying` / `Rebuilding`) → `ContainerCrash`.
|
|
//! - **logged-in**: claude session dir is populated. ! → ✓ →
|
|
//! `LoggedIn`; ✓ → ! → `NeedsLogin` (rare — usually only fires
|
|
//! on a fresh spawn / purge).
|
|
//! - **up-to-date**: agent's recorded flake rev matches current. ✓
|
|
//! → ! → `NeedsUpdate`. The reverse direction (`NeedsUpdate`
|
|
//! resolved) is covered by `Rebuilt`, so no separate event.
|
|
//!
|
|
//! D-Bus subscription would be lower-latency for the first axis,
|
|
//! but polling is simpler and a 10s detection delay is fine.
|
|
|
|
use std::collections::HashSet;
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
|
|
use crate::container_view::claude_has_session;
|
|
use crate::coordinator::{Coordinator, TransientKind};
|
|
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
|
|
|
|
/// Delay between watcher ticks: the loop in `spawn` sleeps this long after
/// each pass, so a state flip is noticed at most one interval late.
const POLL_INTERVAL: Duration = Duration::from_secs(10);
|
|
|
|
pub fn spawn(coord: Arc<Coordinator>) {
|
|
tokio::spawn(async move {
|
|
let mut prev_running: HashSet<String> = HashSet::new();
|
|
let mut prev_logged_in: HashSet<String> = HashSet::new();
|
|
let mut prev_updated: HashSet<String> = HashSet::new();
|
|
let mut seeded = false;
|
|
loop {
|
|
let raw = lifecycle::list().await.unwrap_or_default();
|
|
let current_rev = crate::auto_update::current_flake_rev(&coord.hyperhive_flake);
|
|
let mut current_running = HashSet::new();
|
|
let mut current_logged_in = HashSet::new();
|
|
let mut current_updated = HashSet::new();
|
|
let mut sub_agents: Vec<String> = Vec::new();
|
|
for c in &raw {
|
|
let logical = if c == MANAGER_NAME {
|
|
MANAGER_NAME.to_owned()
|
|
} else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
|
|
n.to_owned()
|
|
} else {
|
|
continue;
|
|
};
|
|
if logical != MANAGER_NAME {
|
|
sub_agents.push(logical.clone());
|
|
}
|
|
if lifecycle::is_running(&logical).await {
|
|
current_running.insert(logical.clone());
|
|
}
|
|
if logical != MANAGER_NAME
|
|
&& claude_has_session(&Coordinator::agent_claude_dir(&logical))
|
|
{
|
|
current_logged_in.insert(logical.clone());
|
|
}
|
|
if let Some(rev) = current_rev.as_deref()
|
|
&& !crate::auto_update::agent_needs_update(&logical, rev)
|
|
{
|
|
current_updated.insert(logical.clone());
|
|
}
|
|
}
|
|
|
|
if seeded {
|
|
emit_crash_transitions(&coord, &prev_running, ¤t_running);
|
|
emit_login_transitions(&coord, &prev_logged_in, ¤t_logged_in, &sub_agents);
|
|
emit_update_transitions(&coord, &prev_updated, ¤t_updated, &sub_agents);
|
|
}
|
|
// Periodic container rescan — catches state flips that
|
|
// happen outside our mutation surface (operator runs
|
|
// `nixos-container stop` over ssh, agent logs in via its
|
|
// own web UI, etc.) so the dashboard converges within one
|
|
// POLL_INTERVAL. Idempotent + cheap when nothing changed.
|
|
coord.rescan_containers_and_emit().await;
|
|
prev_running = current_running;
|
|
prev_logged_in = current_logged_in;
|
|
prev_updated = current_updated;
|
|
seeded = true;
|
|
|
|
tokio::time::sleep(POLL_INTERVAL).await;
|
|
}
|
|
});
|
|
}
|
|
|
|
fn emit_crash_transitions(coord: &Coordinator, prev: &HashSet<String>, current: &HashSet<String>) {
|
|
let transients = coord.transient_snapshot();
|
|
for stopped in prev.difference(current) {
|
|
let deliberate = transients.get(stopped).is_some_and(|st| {
|
|
matches!(
|
|
st.kind,
|
|
TransientKind::Stopping
|
|
| TransientKind::Restarting
|
|
| TransientKind::Destroying
|
|
| TransientKind::Rebuilding
|
|
)
|
|
});
|
|
if deliberate {
|
|
continue;
|
|
}
|
|
tracing::warn!(agent = %stopped, "container crash detected");
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::ContainerCrash {
|
|
agent: stopped.clone(),
|
|
note: Some("container stopped without an operator action".into()),
|
|
});
|
|
}
|
|
}
|
|
|
|
fn emit_login_transitions(
|
|
coord: &Coordinator,
|
|
prev: &HashSet<String>,
|
|
current: &HashSet<String>,
|
|
sub_agents: &[String],
|
|
) {
|
|
for agent in current.difference(prev) {
|
|
tracing::info!(%agent, "agent logged in");
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::LoggedIn {
|
|
agent: agent.clone(),
|
|
});
|
|
}
|
|
// Only count NeedsLogin transitions for agents that exist and
|
|
// are *not* logged in — the difference set above already gives
|
|
// us "was in prev, gone from current" but we also want to fire
|
|
// for agents that newly appeared as not-logged-in (post-spawn /
|
|
// post-purge). Treat sub_agents minus current as the
|
|
// currently-needs-login set; emit when an agent enters it.
|
|
let prev_needs: HashSet<&str> = sub_agents
|
|
.iter()
|
|
.map(String::as_str)
|
|
.filter(|n| !prev.contains(*n))
|
|
.collect();
|
|
let current_needs: HashSet<&str> = sub_agents
|
|
.iter()
|
|
.map(String::as_str)
|
|
.filter(|n| !current.contains(*n))
|
|
.collect();
|
|
for agent in current_needs.difference(&prev_needs) {
|
|
tracing::info!(%agent, "agent needs login");
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::NeedsLogin {
|
|
agent: (*agent).to_owned(),
|
|
});
|
|
}
|
|
}
|
|
|
|
fn emit_update_transitions(
|
|
coord: &Coordinator,
|
|
prev_updated: &HashSet<String>,
|
|
current_updated: &HashSet<String>,
|
|
sub_agents: &[String],
|
|
) {
|
|
// Fired on the "was up-to-date, now isn't" transition. The
|
|
// reverse (rebuilt) is already covered by HelperEvent::Rebuilt.
|
|
let prev_stale: HashSet<&str> = sub_agents
|
|
.iter()
|
|
.map(String::as_str)
|
|
.filter(|n| !prev_updated.contains(*n))
|
|
.collect();
|
|
let current_stale: HashSet<&str> = sub_agents
|
|
.iter()
|
|
.map(String::as_str)
|
|
.filter(|n| !current_updated.contains(*n))
|
|
.collect();
|
|
for agent in current_stale.difference(&prev_stale) {
|
|
tracing::info!(%agent, "agent needs update");
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::NeedsUpdate {
|
|
agent: (*agent).to_owned(),
|
|
});
|
|
}
|
|
}
|