New MCP tool on the manager surface that queues a question on the
dashboard and returns the question id immediately. The operator submits an
answer via /answer-question/<id>; the dashboard then fires
HelperEvent::OperatorAnswered { id, question, answer } into the manager
inbox so the next turn picks it up.
Also: fix the async-form button staying stuck on its spinner after a
successful submit (refreshState skipped re-rendering, so the button was
never re-enabled).
197 lines
7.7 KiB
Rust
197 lines
7.7 KiB
Rust
//! Runtime state + config shared between the host admin socket, the manager
|
|
//! socket, and the per-agent sockets: the broker, configured `hyperhive_flake`,
|
|
//! and the map of registered agent sockets.
|
|
|
|
use std::collections::HashMap;
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::{Arc, Mutex};
|
|
|
|
use anyhow::{Context, Result};
|
|
|
|
use crate::agent_server::{self, AgentSocket};
|
|
use crate::approvals::Approvals;
|
|
use crate::broker::Broker;
|
|
use crate::operator_questions::OperatorQuestions;
|
|
|
|
/// Runtime root for sub-agents. Each agent gets a `<name>/` directory under
/// here, and its `mcp.sock` lives inside that directory.
const AGENT_RUNTIME_ROOT: &str = "/run/hyperhive/agents";

/// Runtime directory for the manager. The manager's `mcp.sock` lives
/// directly inside it.
const MANAGER_RUNTIME_ROOT: &str = "/run/hyperhive/manager";

/// Root of the per-agent config repos the manager may edit. Bind-mounted
/// read-write into the manager container as `/agents/<name>/`. Hive-c0re
/// writes here only on first spawn (the initial commit); from then on the
/// manager is the sole writer.
const AGENT_STATE_ROOT: &str = "/var/lib/hyperhive/agents";

/// Root of the authoritative per-agent config repos, owned by hive-c0re.
/// Containers are built from these. The manager has no filesystem access
/// to them; the only update path is `request_apply_commit` followed by
/// user approval.
const APPLIED_STATE_ROOT: &str = "/var/lib/hyperhive/applied";
|
|
|
|
pub struct Coordinator {
|
|
pub broker: Arc<Broker>,
|
|
pub approvals: Arc<Approvals>,
|
|
pub questions: Arc<OperatorQuestions>,
|
|
/// URL of the hyperhive flake (no fragment). Inlined into per-agent
|
|
/// `flake.nix` files as `inputs.hyperhive.url`.
|
|
pub hyperhive_flake: String,
|
|
/// TCP port the host's hive-c0re dashboard listens on. Inlined into
|
|
/// each per-agent flake so the agent's web UI can build the right
|
|
/// rebuild-button URL pointing back at the dashboard.
|
|
pub dashboard_port: u16,
|
|
agents: Mutex<HashMap<String, AgentSocket>>,
|
|
/// Agents whose lifecycle action (currently just spawn) is in flight.
|
|
/// Read by the dashboard to render a spinner; cleared when the action
|
|
/// resolves (success or failure).
|
|
transient: Mutex<HashMap<String, TransientState>>,
|
|
}
|
|
|
|
/// Per-agent in-progress state that the dashboard surfaces between approve
|
|
/// click and container ready.
|
|
#[derive(Debug, Clone)]
|
|
pub struct TransientState {
|
|
pub kind: TransientKind,
|
|
pub since: std::time::Instant,
|
|
}
|
|
|
|
/// The lifecycle action a [`TransientState`] refers to.
#[derive(Debug, Clone, Copy)]
pub enum TransientKind {
    /// `lifecycle::spawn` is in progress: nixos-container create, then
    /// update, then start.
    Spawning,
}
|
|
|
|
impl Coordinator {
|
|
pub fn open(db_path: &Path, hyperhive_flake: String, dashboard_port: u16) -> Result<Self> {
|
|
let broker = Broker::open(db_path).context("open broker")?;
|
|
let approvals = Approvals::open(db_path).context("open approvals")?;
|
|
let questions = OperatorQuestions::open(db_path).context("open operator_questions")?;
|
|
Ok(Self {
|
|
broker: Arc::new(broker),
|
|
approvals: Arc::new(approvals),
|
|
questions: Arc::new(questions),
|
|
hyperhive_flake,
|
|
dashboard_port,
|
|
agents: Mutex::new(HashMap::new()),
|
|
transient: Mutex::new(HashMap::new()),
|
|
})
|
|
}
|
|
|
|
pub fn register_agent(&self, name: &str) -> Result<PathBuf> {
|
|
// Idempotent: drop any existing listener so re-registration (e.g. on rebuild,
|
|
// or after a hive-c0re restart cleared /run/hyperhive) gets a fresh socket.
|
|
self.unregister_agent(name);
|
|
let agent_dir = Self::agent_dir(name);
|
|
std::fs::create_dir_all(&agent_dir)
|
|
.with_context(|| format!("create agent dir {}", agent_dir.display()))?;
|
|
let socket_path = Self::socket_path(name);
|
|
let socket = agent_server::start(name, &socket_path, self.broker.clone())?;
|
|
self.agents.lock().unwrap().insert(name.to_owned(), socket);
|
|
Ok(agent_dir)
|
|
}
|
|
|
|
pub fn unregister_agent(&self, name: &str) {
|
|
if let Some(socket) = self.agents.lock().unwrap().remove(name) {
|
|
socket.handle.abort();
|
|
let _ = std::fs::remove_file(&socket.path);
|
|
}
|
|
}
|
|
|
|
/// Mark an agent as in-progress (only one state per agent for now).
|
|
pub fn set_transient(&self, name: &str, kind: TransientKind) {
|
|
self.transient.lock().unwrap().insert(
|
|
name.to_owned(),
|
|
TransientState {
|
|
kind,
|
|
since: std::time::Instant::now(),
|
|
},
|
|
);
|
|
}
|
|
|
|
pub fn clear_transient(&self, name: &str) {
|
|
self.transient.lock().unwrap().remove(name);
|
|
}
|
|
|
|
pub fn transient_snapshot(&self) -> HashMap<String, TransientState> {
|
|
self.transient.lock().unwrap().clone()
|
|
}
|
|
|
|
/// Push a `HelperEvent` into the manager's inbox. Encoded as JSON in
|
|
/// `Message::body`; sender = `SYSTEM_SENDER`. The manager harness
|
|
/// recognises the sender and parses the body. Best-effort: a serde or
|
|
/// broker error is logged but does not propagate.
|
|
pub fn notify_manager(&self, event: &hive_sh4re::HelperEvent) {
|
|
let body = match serde_json::to_string(event) {
|
|
Ok(s) => s,
|
|
Err(e) => {
|
|
tracing::warn!(error = ?e, "failed to encode helper event");
|
|
return;
|
|
}
|
|
};
|
|
if let Err(e) = self.broker.send(&hive_sh4re::Message {
|
|
from: hive_sh4re::SYSTEM_SENDER.to_owned(),
|
|
to: hive_sh4re::MANAGER_AGENT.to_owned(),
|
|
body,
|
|
}) {
|
|
tracing::warn!(error = ?e, "failed to push helper event to manager");
|
|
}
|
|
}
|
|
|
|
pub fn agent_dir(name: &str) -> PathBuf {
|
|
PathBuf::from(format!("{AGENT_RUNTIME_ROOT}/{name}"))
|
|
}
|
|
|
|
pub fn socket_path(name: &str) -> PathBuf {
|
|
Self::agent_dir(name).join("mcp.sock")
|
|
}
|
|
|
|
pub fn manager_dir() -> PathBuf {
|
|
PathBuf::from(MANAGER_RUNTIME_ROOT)
|
|
}
|
|
|
|
pub fn manager_socket_path() -> PathBuf {
|
|
Self::manager_dir().join("mcp.sock")
|
|
}
|
|
|
|
/// Ensure a runtime dir + (for sub-agents) per-agent socket exists. For
|
|
/// the manager, `manager_server::start` owns the socket — just return
|
|
/// the dir. For sub-agents this is `register_agent` (creates a fresh
|
|
/// listener bound to `socket_path(name)`). Source directory of the
|
|
/// `/run/hive/mcp.sock` bind that ends up in `set_nspawn_flags`.
|
|
pub fn ensure_runtime(&self, name: &str) -> Result<PathBuf> {
|
|
if name == crate::lifecycle::MANAGER_NAME {
|
|
let dir = Self::manager_dir();
|
|
std::fs::create_dir_all(&dir)
|
|
.with_context(|| format!("create manager dir {}", dir.display()))?;
|
|
return Ok(dir);
|
|
}
|
|
self.register_agent(name)
|
|
}
|
|
|
|
/// Per-agent state root (parent of `config/`, future `prompts/`, etc.).
|
|
pub fn agent_state_root(name: &str) -> PathBuf {
|
|
PathBuf::from(format!("{AGENT_STATE_ROOT}/{name}"))
|
|
}
|
|
|
|
/// Manager-editable proposed config repo. Bind-mounted into the manager
|
|
/// container as `/agents/<name>/config/`.
|
|
pub fn agent_proposed_dir(name: &str) -> PathBuf {
|
|
Self::agent_state_root(name).join("config")
|
|
}
|
|
|
|
/// Per-agent Claude credentials dir. Bind-mounted RW into the agent
|
|
/// container at `/root/.claude` so OAuth state survives container
|
|
/// destroy/recreate. Each agent owns its own token lineage — sharing
|
|
/// would break on the first refresh-token rotation.
|
|
pub fn agent_claude_dir(name: &str) -> PathBuf {
|
|
Self::agent_state_root(name).join("claude")
|
|
}
|
|
|
|
/// Per-agent durable knowledge dir. Bind-mounted RW into the agent
|
|
/// container at `/state`. Survives destroy/recreate alongside the
|
|
/// claude dir. Agents are told (via the system prompt) to write
|
|
/// long-lived notes / scratch state here.
|
|
pub fn agent_notes_dir(name: &str) -> PathBuf {
|
|
Self::agent_state_root(name).join("state")
|
|
}
|
|
|
|
/// Authoritative applied config repo. Hive-c0re-only.
|
|
pub fn agent_applied_dir(name: &str) -> PathBuf {
|
|
PathBuf::from(format!("{APPLIED_STATE_ROOT}/{name}"))
|
|
}
|
|
}
|