hyperhive/hive-sh4re/src/lib.rs
müde 58c3cd853b container crash watcher → HelperEvent::ContainerCrash
new hive_c0re::crash_watch task polls every 10s, builds the set of
currently-running containers, and on running→stopped transitions
checks the transient snapshot: if no Stopping / Restarting /
Destroying / Rebuilding flag is set, the container exited
unexpectedly and we fire HelperEvent::ContainerCrash into the
manager's inbox so it can react (typically: start it again).

first poll is a seeding pass — no events on harness startup. dbus
subscription would be lower-latency but polling is honest and
debuggable, and a 10s delay on crash detection is fine for our
scale.

manager prompt + approvals doc updated to advertise the new
event variant. todo drops the entry (and the journald-viewer
entry that already shipped).
2026-05-15 21:02:05 +02:00

386 lines
14 KiB
Rust

//! Wire types shared between `hive-c0re` and the in-container harness.
use serde::{Deserialize, Serialize};
// -----------------------------------------------------------------------------
// Host admin socket — /run/hyperhive/host.sock
// -----------------------------------------------------------------------------
/// Requests on the host admin socket.
///
/// Wire format: one JSON object per line. Each object carries a `cmd`
/// field holding the snake_case variant name (e.g. `"request_spawn"`),
/// with the variant's own fields sitting next to it in the same object.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "cmd", rename_all = "snake_case")]
pub enum HostRequest {
/// Create and start a sub-agent container directly (no approval). Use
/// this from privileged contexts (operator on the host); it bypasses the
/// approval queue intentionally so test scripts and one-off recoveries
/// don't need a separate approve step.
Spawn { name: String },
/// Submit a spawn request for the user to approve. On approval the host
/// creates and starts the container. Mirrors the manager's
/// `RequestSpawn` — exposed on the admin socket so the dashboard and CLI
/// can also queue spawns through the approval flow.
RequestSpawn { name: String },
/// Stop a managed container (graceful).
Kill { name: String },
/// Tear down a sub-agent container: stop + remove + drop the systemd
/// drop-in, purge pending approvals. Persistent state (proposed/applied
/// repos, Claude credentials) is KEPT by default — recreating the agent
/// with the same name reuses prior config + login. With `purge=true`
/// the agent's `/var/lib/hyperhive/{agents,applied}/<name>/` trees are
/// also wiped (config history + creds + notes gone forever). The
/// manager itself cannot be destroyed.
Destroy {
name: String,
/// Opt-in wipe of persistent state. Decodes as `false` when the
/// field is omitted from the request.
#[serde(default)]
purge: bool,
},
/// Apply pending config to a managed container.
Rebuild { name: String },
/// List managed containers.
List,
/// List pending approval requests.
Pending,
/// Approve a pending request by id; the action runs immediately.
// NOTE(review): `id` presumably matches `Approval::id` as returned by
// `Pending` — confirm against the handler.
Approve { id: i64 },
/// Deny a pending request by id.
Deny { id: i64 },
}
/// Response counterpart to `HostRequest`.
///
/// A flat struct rather than an enum: `ok` is always present, and each
/// optional payload is left out of the serialized JSON entirely when it
/// is `None` (and decodes as `None` when absent). Build values through
/// the `success` / `error` / `list` / `pending` constructors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HostResponse {
/// `true` for every constructor except `HostResponse::error`.
pub ok: bool,
/// Error message; set only by `HostResponse::error`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
/// Agent names; set only by `HostResponse::list`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub agents: Option<Vec<String>>,
/// Approval records; set only by `HostResponse::pending`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub approvals: Option<Vec<Approval>>,
}
/// One approval request as it travels over the wire (e.g. inside
/// `HostResponse::approvals`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Approval {
/// Identifier of this approval request.
pub id: i64,
/// Name of the agent the request concerns.
pub agent: String,
/// Which action is being approved. Decodes as
/// `ApprovalKind::ApplyCommit` (the enum's default) when the field is
/// missing from the input.
#[serde(default)]
pub kind: ApprovalKind,
/// For `ApplyCommit`: the git sha to apply. For `Spawn`: empty.
pub commit_ref: String,
/// When the request was submitted.
// NOTE(review): presumably a unix timestamp; unit (s vs ms) is not
// visible from this file — confirm against the writer.
pub requested_at: i64,
/// Current lifecycle state of the request.
pub status: ApprovalStatus,
/// When the request was resolved, if it has been. Omitted from the
/// serialized form while `None`. Same unit caveat as `requested_at`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resolved_at: Option<i64>,
/// Optional free-text note attached to the request. Omitted from the
/// serialized form while `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub note: Option<String>,
}
/// What action the approval, when granted, will trigger.
///
/// Serialized as a snake_case string (`"apply_commit"`, `"spawn"`).
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ApprovalKind {
/// Apply a manager-proposed config commit (existing flow). This is the
/// `Default`, so an `Approval` whose `kind` field is absent decodes as
/// this variant.
#[default]
ApplyCommit,
/// Create + start a new sub-agent container with the given name.
Spawn,
}
/// Lifecycle state of an `Approval`.
///
/// Serialized as a snake_case string (`"pending"`, `"approved"`,
/// `"denied"`, `"failed"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ApprovalStatus {
/// Still waiting for a decision.
Pending,
/// The request was approved.
Approved,
/// The request was denied.
Denied,
// NOTE(review): presumably "approved, but the triggered action errored"
// (the `ManagerRequest::RequestSpawn` docs say an approval "will fail"
// on a name collision) — confirm against the approval handler.
/// The request failed.
Failed,
}
/// Constructors for the response shapes sent back on the host admin socket.
impl HostResponse {
    /// Bare success: `ok = true` and no payload of any kind.
    pub fn success() -> Self {
        Self {
            ok: true,
            error: None,
            agents: None,
            approvals: None,
        }
    }

    /// Failure carrying a human-readable `message`; `ok` is `false` and no
    /// other payload is set.
    pub fn error(message: impl Into<String>) -> Self {
        let error = Some(message.into());
        Self {
            ok: false,
            error,
            ..Self::success()
        }
    }

    /// Success carrying the given agent names in `agents`.
    pub fn list(agents: Vec<String>) -> Self {
        Self {
            agents: Some(agents),
            ..Self::success()
        }
    }

    /// Success carrying the given approval records in `approvals`.
    pub fn pending(approvals: Vec<Approval>) -> Self {
        Self {
            approvals: Some(approvals),
            ..Self::success()
        }
    }
}
// -----------------------------------------------------------------------------
// Per-agent socket — /run/hyperhive/agents/<name>/mcp.sock on the host,
// bind-mounted into the container at /run/hive/mcp.sock.
// -----------------------------------------------------------------------------
/// A logical message between agents.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Message {
/// Logical name of the sender.
pub from: String,
/// Logical name of the recipient.
pub to: String,
/// Message payload. For messages whose sender is `SYSTEM_SENDER` this
/// is a JSON-serialised `HelperEvent`.
pub body: String,
}
/// One row of a broker inbox query — what the dashboard renders in
/// its operator-inbox section and what a per-agent web UI returns
/// from a `Recent` request. Lives in `hive_sh4re` so it can travel
/// over both the dashboard's `/api/state` and the agent socket
/// without an internal-to-wire conversion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InboxRow {
/// Identifier of the message row.
pub id: i64,
/// Logical name of the sender.
pub from: String,
/// Message payload.
pub body: String,
// NOTE(review): presumably the time the message was recorded, as a
// unix timestamp — unit and meaning are not visible here; confirm
// against the broker.
pub at: i64,
}
/// Requests on a per-agent socket. The agent's identity is the socket
/// it came in on; `Send.from` is filled in by the server, not the client.
///
/// Serialized with a `cmd` field holding the snake_case variant name
/// (e.g. `"operator_msg"`), alongside the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "cmd", rename_all = "snake_case")]
pub enum AgentRequest {
/// Send a message to another agent.
Send { to: String, body: String },
/// Pop one pending message from this agent's inbox. Long-polls
/// up to `wait_seconds` (capped at 60s server-side, default 30s
/// when None) before returning `Empty`.
Recv {
/// May be omitted from the request; a missing field decodes as
/// `None`.
#[serde(default)]
wait_seconds: Option<u64>,
},
/// Non-mutating: how many pending messages are addressed to me?
/// Used by the harness to render a status line after each tool call.
Status,
/// Operator-injected message TO this agent (from this agent's own web
/// UI). Recipient is implicit — `from` is `"operator"`. Effectively the
/// per-agent equivalent of the old dashboard T4LK form, but scoped to
/// the agent whose page the operator is on.
OperatorMsg { body: String },
/// Last `limit` messages addressed to this agent, newest-first.
/// Non-mutating — pulls from the broker without delivering. The
/// per-agent web UI uses this to render its own inbox section.
/// Unlike `Recv.wait_seconds`, `limit` has no serde default and must be
/// present in the request.
Recent { limit: u64 },
}
/// Responses on a per-agent socket.
///
/// Serialized with a `kind` field holding the snake_case variant name
/// (e.g. `"message"`), alongside the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum AgentResponse {
/// `Send` succeeded.
Ok,
/// Either `Send` failed or `Recv` errored; `message` describes why.
Err { message: String },
/// `Recv` produced a message.
Message { from: String, body: String },
/// `Recv` found nothing pending.
Empty,
/// `Status` result: how many pending messages are in this agent's inbox.
Status { unread: u64 },
/// `Recent` result: newest-first inbox rows.
Recent { rows: Vec<InboxRow> },
}
// -----------------------------------------------------------------------------
// Manager socket — /run/hyperhive/manager/mcp.sock on the host, bind-mounted
// into the manager container at /run/hive/mcp.sock.
// -----------------------------------------------------------------------------
/// Logical name the broker uses for the manager.
pub const MANAGER_AGENT: &str = "manager";
/// Logical name the broker uses for the human operator. Messages with
/// `to = OPERATOR_RECIPIENT` accumulate in sqlite and surface on the
/// dashboard's inbox view — they are never `recv`'d by an agent harness.
/// The same string is what the `OperatorMsg` request docs give as the
/// `from` of operator-injected messages.
pub const OPERATOR_RECIPIENT: &str = "operator";
/// Sender hive-c0re uses for events it pushes into the manager's inbox.
/// The manager harness recognises this sender and parses the message body
/// as a JSON-serialised `HelperEvent`.
pub const SYSTEM_SENDER: &str = "system";
/// Out-of-band events the host-side daemon pushes to the manager's inbox.
/// Serialised as JSON in `Message::body` (sender = `SYSTEM_SENDER`).
///
/// The JSON object carries an `event` field holding the snake_case
/// variant name (e.g. `"container_crash"`), alongside the variant's own
/// fields. Every `note` field is optional: omitted from the output while
/// `None`, and decoded as `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "event", rename_all = "snake_case")]
pub enum HelperEvent {
/// An approval was approved/denied/failed; if approved, the underlying
/// action (rebuild or spawn) has already run by the time this lands.
ApprovalResolved {
id: i64,
agent: String,
commit_ref: String,
status: ApprovalStatus,
#[serde(default, skip_serializing_if = "Option::is_none")]
note: Option<String>,
},
/// A new container was spawned (post-approval or via the admin CLI
/// bypass path). `ok=false` means the spawn failed.
Spawned {
agent: String,
ok: bool,
#[serde(default, skip_serializing_if = "Option::is_none")]
note: Option<String>,
},
/// A container was rebuilt (auto-update on flake rev change, or a
/// manual rebuild from CLI/dashboard).
// NOTE(review): `ok=false` presumably means the rebuild failed, by
// analogy with `Spawned` — confirm against the emitter.
Rebuilt {
agent: String,
ok: bool,
#[serde(default, skip_serializing_if = "Option::is_none")]
note: Option<String>,
},
/// A sub-agent's container was stopped (the systemd unit is down;
/// persistent state is unchanged).
Killed { agent: String },
/// A sub-agent's container was torn down (container removed; state
/// dirs preserved per `destroy` semantics).
// NOTE(review): `HostRequest::Destroy { purge: true }` wipes the state
// dirs; this doc only describes the default path — confirm whether the
// event is also emitted for purging destroys.
Destroyed { agent: String },
/// Container exited without an operator-initiated stop. Fired by
/// the crash watcher when an agent's container transitions from
/// running → stopped and no `Stopping` / `Restarting` /
/// `Destroying` transient was set, so the operator (or the
/// manager) knows it crashed rather than was killed on purpose.
// NOTE(review): the commit description for the crash watcher also
// lists a `Rebuilding` transient among the flags that suppress this
// event — confirm and extend the list above if so.
ContainerCrash {
agent: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
note: Option<String>,
},
/// The operator answered a question that was queued via
/// `AskOperator`. `id` matches the `QuestionQueued.id` returned to the
/// asker; `question` echoes the original prompt so the manager can
/// stitch the answer back to context across compactions.
OperatorAnswered {
id: i64,
question: String,
answer: String,
},
}
/// Requests on the manager socket. Manager has the agent surface (send/recv)
/// plus privileged lifecycle verbs.
///
/// Serialized with a `cmd` field holding the snake_case variant name
/// (e.g. `"request_apply_commit"`), alongside the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "cmd", rename_all = "snake_case")]
pub enum ManagerRequest {
/// Send a message to `to` with the given `body`. Same shape as
/// `AgentRequest::Send`.
Send {
to: String,
body: String,
},
/// Same shape as `AgentRequest::Recv` — caller-tunable long-poll
/// duration, capped at 60s server-side, default 30s when None.
Recv {
#[serde(default)]
wait_seconds: Option<u64>,
},
/// Non-mutating: pending message count, used to render a status line
/// after each MCP tool call (mirrors `AgentRequest::Status`).
Status,
/// Operator-injected message TO the manager (from the manager's own web
/// UI). Same shape as `AgentRequest::OperatorMsg`.
OperatorMsg {
body: String,
},
/// Last `limit` messages addressed to the manager, newest-first.
/// Non-mutating; mirror of `AgentRequest::Recent`.
Recent {
limit: u64,
},
/// Submit a spawn request for the user to approve. On approval the host
/// creates and starts the container. Brand-new agent names only — if an
/// agent of the same name already exists, the approval will fail.
RequestSpawn {
name: String,
},
/// Stop a sub-agent (graceful).
Kill {
name: String,
},
/// Start a previously-stopped sub-agent container.
Start {
name: String,
},
/// Restart a sub-agent container (stop + start).
Restart {
name: String,
},
/// Submit a config commit for the user to approve. `commit_ref` is opaque
/// to the host (typically a git sha pointing into the agent's config repo).
/// On approval the host applies the change via `nixos-container update`.
RequestApplyCommit {
agent: String,
commit_ref: String,
},
/// Ask the operator a question. Returns immediately with the queued
/// question id; the operator's answer arrives later as a
/// `HelperEvent::OperatorAnswered` in the manager inbox.
///
/// - `options` is advisory: empty = free-text only; non-empty = the
/// dashboard renders the choices alongside a free-text fallback
/// ("Other…") so the operator is never trapped.
/// - `multi=true` lets the operator pick multiple options (rendered
/// as checkboxes). The answer is returned as a single string with
/// selections joined by ", ".
/// - `ttl_seconds`: optional auto-cancel after that many seconds. On
/// expiry the question is resolved with answer `[expired]` and the
/// manager gets the usual `OperatorAnswered` event. None = wait
/// forever for an operator answer (or manual cancel).
///
/// Only `question` is required on the wire; the other three fields
/// fall back to their defaults (empty list, `false`, `None`) when
/// omitted.
AskOperator {
question: String,
#[serde(default)]
options: Vec<String>,
#[serde(default)]
multi: bool,
#[serde(default)]
ttl_seconds: Option<u64>,
},
}
/// Responses on the manager socket.
///
/// Serialized with a `kind` field holding the snake_case variant name
/// (e.g. `"question_queued"`), alongside the variant's own fields. The
/// variants shared with `AgentResponse` have the same shape as their
/// counterparts there.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum ManagerResponse {
/// Success with no payload (same shape as `AgentResponse::Ok`).
Ok,
/// Failure; `message` describes why.
Err {
message: String,
},
/// A message handed to the manager (same shape as
/// `AgentResponse::Message`, the `Recv` result).
Message {
from: String,
body: String,
},
/// Nothing pending (same shape as `AgentResponse::Empty`).
Empty,
/// `Status` result: pending message count.
Status {
unread: u64,
},
/// Result of `AskOperator`: the queued question id. The actual answer
/// arrives later as a `HelperEvent::OperatorAnswered` in the manager
/// inbox, so this returns immediately rather than blocking the turn.
QuestionQueued {
id: i64,
},
/// `Recent` result: mirror of `AgentResponse::Recent`.
Recent {
rows: Vec<InboxRow>,
},
}