container crash watcher → HelperEvent::ContainerCrash

new hive_c0re::crash_watch task polls every 10s, builds the set of
currently-running containers, and on running→stopped transitions
checks the transient snapshot: if no Stopping / Restarting /
Destroying / Rebuilding flag is set, the container exited
unexpectedly and we fire HelperEvent::ContainerCrash into the
manager's inbox so it can react (typically: start it again).

first poll is a seeding pass — no events on harness startup. dbus
subscription would be lower-latency but polling is honest and
debuggable, and a 10s delay on crash detection is fine for our
scale.

manager prompt + approvals doc updated to advertise the new
event variant. todo drops the entry (and the journald-viewer
entry that already shipped).
This commit is contained in:
müde 2026-05-15 21:02:05 +02:00
parent 6db38cf70c
commit 58c3cd853b
6 changed files with 92 additions and 7 deletions

View file

@ -0,0 +1,72 @@
//! Container crash watcher. Polls every managed container's running
//! state on a fixed interval; when a previously-running container is
//! suddenly stopped AND no operator-initiated transient (`Stopping`,
//! `Restarting`, `Destroying`, `Rebuilding`) was set, fire
//! `HelperEvent::ContainerCrash`
//! into the manager's inbox. The manager can then react — usually
//! a `start` or a config rebuild.
//!
//! D-Bus subscription would be lower-latency, but polling is far
//! simpler and the failure modes are honest (a crash discovered 10s
//! late is fine for our scale).
use std::collections::HashSet;
use std::sync::Arc;
use std::time::Duration;
use crate::coordinator::{Coordinator, TransientKind};
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
/// Pause between two consecutive polls of container running state; this is
/// also the upper bound on how late a crash is noticed.
const POLL_INTERVAL: Duration = Duration::from_secs(10);
/// Spawns the background task that watches for unexpected container exits.
///
/// Once per `POLL_INTERVAL` the task lists the managed containers, records
/// which of them are currently running, and compares that set with the one
/// from the previous successful poll. A container that was running before and
/// is not running now, and that has no `Stopping` / `Restarting` /
/// `Destroying` / `Rebuilding` transient in the coordinator's snapshot, is
/// reported to the manager as `HelperEvent::ContainerCrash`.
///
/// The first successful poll only seeds the baseline and emits nothing. A poll
/// whose container listing fails is skipped: the baseline is left untouched,
/// so a transient listing error is not mistaken for "every container
/// stopped".
///
/// The spawned task loops forever; its join handle is dropped (detached).
pub fn spawn(coord: Arc<Coordinator>) {
    tokio::spawn(async move {
        // Logical names of the containers seen running on the last successful
        // poll. Only running→stopped transitions relative to this set count.
        let mut prev_running: HashSet<String> = HashSet::new();
        // False until one poll has succeeded, so startup never emits a crash
        // for every agent.
        let mut seeded = false;
        loop {
            match lifecycle::list().await {
                Ok(raw) => {
                    let mut current_running = HashSet::new();
                    for c in &raw {
                        // Map the container name to the logical name used by
                        // `is_running` and the transient snapshot; anything
                        // that is neither the manager nor an agent is ignored.
                        let logical = if c == MANAGER_NAME {
                            MANAGER_NAME.to_owned()
                        } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
                            n.to_owned()
                        } else {
                            continue;
                        };
                        if lifecycle::is_running(&logical).await {
                            current_running.insert(logical);
                        }
                    }
                    if seeded {
                        // NOTE(review): the snapshot is taken after the poll. If
                        // a deliberate stop both sets and clears its transient
                        // between two polls, it would look like a crash here —
                        // confirm how long the coordinator keeps transients.
                        let transients = coord.transient_snapshot();
                        for stopped in prev_running.difference(&current_running) {
                            let deliberate = transients.get(stopped).is_some_and(|st| {
                                matches!(
                                    st.kind,
                                    TransientKind::Stopping
                                        | TransientKind::Restarting
                                        | TransientKind::Destroying
                                        | TransientKind::Rebuilding
                                )
                            });
                            if deliberate {
                                continue;
                            }
                            tracing::warn!(agent = %stopped, "container crash detected");
                            coord.notify_manager(&hive_sh4re::HelperEvent::ContainerCrash {
                                agent: stopped.clone(),
                                note: Some("container stopped without an operator action".into()),
                            });
                        }
                    }
                    prev_running = current_running;
                    seeded = true;
                }
                Err(err) => {
                    // Treating a failed listing as an empty one would make
                    // every previously-running container look crashed. Keep
                    // the old baseline and try again on the next tick.
                    tracing::warn!(
                        error = ?err,
                        "crash watch: listing containers failed; skipping this poll"
                    );
                }
            }
            tokio::time::sleep(POLL_INTERVAL).await;
        }
    });
}

View file

@ -12,6 +12,7 @@ mod auto_update;
mod broker;
mod client;
mod coordinator;
mod crash_watch;
mod dashboard;
mod events_vacuum;
mod lifecycle;
@ -130,6 +131,10 @@ async fn main() -> Result<()> {
// Per-agent events.sqlite vacuum: host-side so the harness
// doesn't need any retention wiring of its own.
events_vacuum::spawn(coord.clone());
// Container crash watcher: emits HelperEvent::ContainerCrash
// when a previously-running container goes away without an
// operator-initiated transient state.
crash_watch::spawn(coord.clone());
let dash_coord = coord.clone();
tokio::spawn(async move {
if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await {