container crash watcher → HelperEvent::ContainerCrash

new hive_c0re::crash_watch task polls every 10s, builds the set of currently-running containers, and on running→stopped transitions checks the transient snapshot: if no Stopping / Restarting / Destroying / Rebuilding flag is set, the container exited unexpectedly and we fire HelperEvent::ContainerCrash into the manager's inbox so it can react (typically: start it again). first poll is a seeding pass — no events on harness startup. dbus subscription would be lower-latency but polling is honest and debuggable, and a 10s delay on crash detection is fine for our scale. manager prompt + approvals doc updated to advertise the new event variant. todo drops the entry (and the journald-viewer entry that already shipped).
2026-05-15 21:02:05 +02:00 · 2026-05-15 21:02:05 +02:00 · 58c3cd853b
commit 58c3cd853b
parent 6db38cf70c
6 changed files with 92 additions and 7 deletions
--- a/hive-c0re/src/crash_watch.rs
+++ b/hive-c0re/src/crash_watch.rs
@ -0,0 +1,72 @@
+//! Container crash watcher. Polls every managed container's running
+//! state on a fixed interval; when a previously-running container is
+//! suddenly stopped AND no operator-initiated transient (`Stopping`,
+//! `Restarting`, `Destroying`, `Rebuilding`) was set, fire
+//! `HelperEvent::ContainerCrash` into the manager's inbox. The manager
+//! can then react — usually a `start` or a config rebuild.
+//!
+//! D-Bus subscription would be lower-latency, but polling is far
+//! simpler and the failure modes are honest (a crash discovered 10s
+//! late is fine for our scale).
+
+use std::collections::HashSet;
+use std::sync::Arc;
+use std::time::Duration;
+
+use crate::coordinator::{Coordinator, TransientKind};
+use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
+
+/// Delay between two consecutive polls of the containers' running state.
+const POLL_INTERVAL: Duration = Duration::from_secs(10);
+
+/// Spawns the background crash-watch task on the Tokio runtime.
+///
+/// The task loops forever. On every tick it lists the managed containers,
+/// records which of them are running, and for each container that was
+/// running on the previous successful tick but is not running now it sends
+/// `HelperEvent::ContainerCrash` through `coord.notify_manager` — unless the
+/// transient snapshot marks the stop as deliberate (`Stopping`,
+/// `Restarting`, `Destroying` or `Rebuilding`).
+///
+/// The first successful poll only seeds the running-set and emits nothing.
+/// A tick whose container listing fails is skipped entirely, so a listing
+/// error is never mistaken for every container having crashed.
+pub fn spawn(coord: Arc<Coordinator>) {
+    tokio::spawn(async move {
+        // Containers observed running on the last successful poll.
+        let mut prev_running: HashSet<String> = HashSet::new();
+        // Stays false until one poll has succeeded, so containers that are
+        // already down at startup are not reported as crashes.
+        let mut seeded = false;
+        loop {
+            let raw = match lifecycle::list().await {
+                Ok(raw) => raw,
+                Err(err) => {
+                    // Treating a failed listing as "nothing is running"
+                    // would report every previously-running container as
+                    // crashed. Keep the old running-set and retry next tick.
+                    tracing::warn!(
+                        error = ?err,
+                        "crash watch: failed to list containers; skipping this poll"
+                    );
+                    tokio::time::sleep(POLL_INTERVAL).await;
+                    continue;
+                }
+            };
+
+            let mut current_running = HashSet::new();
+            for c in &raw {
+                // Map the raw container name to its logical name; anything
+                // that is neither the manager nor an agent is ignored.
+                let logical = if c == MANAGER_NAME {
+                    MANAGER_NAME.to_owned()
+                } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
+                    n.to_owned()
+                } else {
+                    continue;
+                };
+                if lifecycle::is_running(&logical).await {
+                    current_running.insert(logical);
+                }
+            }
+
+            if seeded {
+                // NOTE(review): the snapshot is read after the poll, so a
+                // deliberate stop that began and finished between two ticks
+                // has presumably cleared its transient already and would be
+                // reported as a crash — confirm against the coordinator.
+                let transients = coord.transient_snapshot();
+                for stopped in prev_running.difference(&current_running) {
+                    let deliberate = transients.get(stopped).is_some_and(|st| {
+                        matches!(
+                            st.kind,
+                            TransientKind::Stopping
+                                | TransientKind::Restarting
+                                | TransientKind::Destroying
+                                | TransientKind::Rebuilding
+                        )
+                    });
+                    if deliberate {
+                        continue;
+                    }
+                    tracing::warn!(agent = %stopped, "container crash detected");
+                    coord.notify_manager(&hive_sh4re::HelperEvent::ContainerCrash {
+                        agent: stopped.clone(),
+                        note: Some("container stopped without an operator action".into()),
+                    });
+                }
+            }
+            prev_running = current_running;
+            seeded = true;
+
+            tokio::time::sleep(POLL_INTERVAL).await;
+        }
+    });
+}
--- a/hive-c0re/src/main.rs
+++ b/hive-c0re/src/main.rs
@ -12,6 +12,7 @@ mod auto_update;
 mod broker;
 mod client;
 mod coordinator;
+mod crash_watch;
 mod dashboard;
 mod events_vacuum;
 mod lifecycle;
@ -130,6 +131,10 @@ async fn main() -> Result<()> {
            // Per-agent events.sqlite vacuum: host-side so the harness
            // doesn't need any retention wiring of its own.
            events_vacuum::spawn(coord.clone());
+            // Container crash watcher: emits HelperEvent::ContainerCrash
+            // when a previously-running container goes away without an
+            // operator-initiated transient state.
+            crash_watch::spawn(coord.clone());
            let dash_coord = coord.clone();
            tokio::spawn(async move {
                if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await {