diff --git a/TODO.md b/TODO.md index 684c0d6..6f0ab2a 100644 --- a/TODO.md +++ b/TODO.md @@ -99,9 +99,3 @@ Pick anything from here when relevant. Cross-cutting design notes live in that takes the existing notes + a "compact this" prompt and rewrites them in place. Add when the notes start bloating. -## Lifecycle / reliability - -- **Container crash events.** Watch `container@*.service` via D-Bus, push - `HelperEvent::ContainerCrash` to the manager's inbox so the manager can - react (restart, escalate, etc.). - diff --git a/docs/approvals.md b/docs/approvals.md index af80169..88027d6 100644 --- a/docs/approvals.md +++ b/docs/approvals.md @@ -115,6 +115,10 @@ regular claude turn so the manager can react. Variants - `Killed { agent }` — admin `HostRequest::Kill` + dashboard `/kill` + manager `Kill` MCP tool. - `Destroyed { agent }` — `actions::destroy`. +- `ContainerCrash { agent, note }` — `crash_watch`: a previously- + running container went away with no operator-initiated transient + state (Stopping / Restarting / Destroying / Rebuilding). Manager + can `start` it again or escalate. - `OperatorAnswered { id, question, answer }` — dashboard `/answer-question/{id}` after the operator submits the answer form. diff --git a/hive-ag3nt/prompts/manager.md b/hive-ag3nt/prompts/manager.md index 3cf86dc..9a239f4 100644 --- a/hive-ag3nt/prompts/manager.md +++ b/hive-ag3nt/prompts/manager.md @@ -26,7 +26,7 @@ You're the policy gate between sub-agents and the operator's approval queue — Two ways to talk to the operator: `send(to: "operator", ...)` for fire-and-forget status / pointers (surfaces in the operator inbox), or `ask_operator(question, options?)` when you need a decision. `ask_operator` is non-blocking — it queues the question and returns an id immediately; the answer arrives on a future turn as an `operator_answered` system event. Prefer `ask_operator` over an open-ended `send` for anything you actually need to wait on. -Messages from sender `system` are hyperhive helper events (JSON body, `event` field discriminates): `approval_resolved`, `spawned`, `rebuilt`, `killed`, `destroyed`, `operator_answered`. Use these to react to lifecycle changes — e.g. greet a freshly-spawned agent, retry a failed rebuild, or pick up the operator's answer to a question you previously asked. +Messages from sender `system` are hyperhive helper events (JSON body, `event` field discriminates): `approval_resolved`, `spawned`, `rebuilt`, `killed`, `destroyed`, `container_crash`, `operator_answered`. Use these to react to lifecycle changes — e.g. greet a freshly-spawned agent, retry a failed rebuild, restart an agent whose container crashed, or pick up the operator's answer to a question you previously asked. Durable knowledge: diff --git a/hive-c0re/src/crash_watch.rs b/hive-c0re/src/crash_watch.rs new file mode 100644 index 0000000..a0b7dc0 --- /dev/null +++ b/hive-c0re/src/crash_watch.rs @@ -0,0 +1,72 @@ +//! Container crash watcher. Polls every managed container's running +//! state on a fixed interval; when a previously-running container is +//! suddenly stopped AND no operator-initiated transient (`Stopping`, +//! `Restarting`, `Destroying`) was set, fire `HelperEvent::ContainerCrash` +//! into the manager's inbox. The manager can then react — usually +//! a `start` or a config rebuild. +//! +//! D-Bus subscription would be lower-latency, but polling is far +//! simpler and the failure modes are honest (a crash discovered 10s +//! late is fine for our scale). + +use std::collections::HashSet; +use std::sync::Arc; +use std::time::Duration; + +use crate::coordinator::{Coordinator, TransientKind}; +use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME}; + +const POLL_INTERVAL: Duration = Duration::from_secs(10); + +pub fn spawn(coord: Arc) { + tokio::spawn(async move { + // Seed the running-set from the first poll so we don't emit a + // crash for every agent on startup. First tick fills it; only + // running→stopped transitions across subsequent ticks count. + let mut prev_running: HashSet = HashSet::new(); + let mut seeded = false; + loop { + let raw = lifecycle::list().await.unwrap_or_default(); + let mut current_running = HashSet::new(); + for c in &raw { + let logical = if c == MANAGER_NAME { + MANAGER_NAME.to_owned() + } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) { + n.to_owned() + } else { + continue; + }; + if lifecycle::is_running(&logical).await { + current_running.insert(logical); + } + } + + if seeded { + let transients = coord.transient_snapshot(); + for stopped in prev_running.difference(¤t_running) { + let deliberate = transients.get(stopped).is_some_and(|st| { + matches!( + st.kind, + TransientKind::Stopping + | TransientKind::Restarting + | TransientKind::Destroying + | TransientKind::Rebuilding + ) + }); + if deliberate { + continue; + } + tracing::warn!(agent = %stopped, "container crash detected"); + coord.notify_manager(&hive_sh4re::HelperEvent::ContainerCrash { + agent: stopped.clone(), + note: Some("container stopped without an operator action".into()), + }); + } + } + prev_running = current_running; + seeded = true; + + tokio::time::sleep(POLL_INTERVAL).await; + } + }); +} diff --git a/hive-c0re/src/main.rs b/hive-c0re/src/main.rs index fde97e0..c16410d 100644 --- a/hive-c0re/src/main.rs +++ b/hive-c0re/src/main.rs @@ -12,6 +12,7 @@ mod auto_update; mod broker; mod client; mod coordinator; +mod crash_watch; mod dashboard; mod events_vacuum; mod lifecycle; @@ -130,6 +131,10 @@ async fn main() -> Result<()> { // Per-agent events.sqlite vacuum: host-side so the harness // doesn't need any retention wiring of its own. events_vacuum::spawn(coord.clone()); + // Container crash watcher: emits HelperEvent::ContainerCrash + // when a previously-running container goes away without an + // operator-initiated transient state. + crash_watch::spawn(coord.clone()); let dash_coord = coord.clone(); tokio::spawn(async move { if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await { diff --git a/hive-sh4re/src/lib.rs b/hive-sh4re/src/lib.rs index 90949d8..e8558d9 100644 --- a/hive-sh4re/src/lib.rs +++ b/hive-sh4re/src/lib.rs @@ -259,6 +259,16 @@ pub enum HelperEvent { /// A sub-agent's container was torn down (container removed; state /// dirs preserved per `destroy` semantics). Destroyed { agent: String }, + /// Container exited without an operator-initiated stop. Fired by + /// the crash watcher when an agent's container transitions from + /// running → stopped and no `Stopping` / `Restarting` / + /// `Destroying` transient was set, so the operator (or the + /// manager) knows it crashed rather than was killed on purpose. + ContainerCrash { + agent: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + note: Option, + }, /// The operator answered a question that was queued via /// `AskOperator`. `id` matches the `QuestionQueued.id` returned to the /// asker; `question` echoes the original prompt so the manager can