// hyperhive/hive-ag3nt/src/bin/hive-ag3nt.rs
//
// 484 lines
// 20 KiB
// Rust

use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use hive_ag3nt::web_ui::TurnLock;
use anyhow::Result;
use clap::{Parser, Subcommand};
use hive_ag3nt::events::{Bus, LiveEvent, TurnState};
use hive_ag3nt::login::{self, LoginState};
use hive_ag3nt::turn_stats::{TurnStatRow, TurnStats};
use hive_ag3nt::{DEFAULT_SOCKET, DEFAULT_WEB_PORT, client, mcp, plugins, turn, web_ui};
use hive_sh4re::{AgentRequest, AgentResponse};
// Top-level command line for the `hive-ag3nt` binary, parsed with clap's
// derive API (`Cli::parse()` in `main`).
// NOTE: the `///` comments on the fields below are picked up by clap as
// `--help` text, so they are user-facing strings, not just documentation.
#[derive(Parser)]
#[command(name = "hive-ag3nt", about = "hyperhive sub-agent harness")]
struct Cli {
/// Path to the per-agent MCP socket (bind-mounted from the host).
// `global = true` makes `--socket` a clap global argument, so it is also
// accepted after the subcommand name; defaults to `DEFAULT_SOCKET`.
#[arg(long, global = true, default_value = DEFAULT_SOCKET)]
socket: PathBuf,
// Which mode to run in; see `Cmd`.
#[command(subcommand)]
cmd: Cmd,
}
// Subcommands of the `hive-ag3nt` binary; `main` dispatches on these.
// NOTE: the `///` comments on variants and fields are consumed by clap's
// derive macros as help text, so editing them changes `--help` output.
#[derive(Subcommand)]
enum Cmd {
// `main` handles this by spawning the web UI task and then running the
// `serve` loop (after waiting for login when no session exists yet).
/// Run the long-lived harness loop. Polls inbox; replies via `claude --print`
/// when available, falling back to a simple echo otherwise.
Serve {
/// Inbox poll interval in milliseconds.
#[arg(long, default_value_t = 1000)]
poll_ms: u64,
},
// `main` handles this by calling `mcp::serve_agent_stdio(cli.socket)`.
/// Run the agent's MCP server on stdio. Spawned by `claude` via
/// `--mcp-config`; tools dispatch through `/run/hive/mcp.sock` back into
/// the hyperhive broker.
Mcp,
// `main` handles this by sending a single `AgentRequest::Wake { from, body }`
// over the socket and mapping the response to the process result.
/// Inject a wake-up event into this agent's inbox so the next turn
/// fires with the given body. Intended for extra MCP servers /
/// helpers running inside the container (matrix bridge, scraper,
/// webhook listener) that need to nudge claude on external events.
/// `from` is the sender label that appears in the wake prompt
/// (claude sees "from: matrix" etc.).
Wake {
#[arg(long)]
from: String,
/// Body of the wake message. Pass `-` to read from stdin.
// The `-` convention is implemented in `main`, which reads all of stdin
// into the body before sending the request.
#[arg(long)]
body: String,
},
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
)
.init();
let cli = Cli::parse();
match cli.cmd {
Cmd::Serve { poll_ms } => {
let port = std::env::var("HIVE_PORT")
.ok()
.and_then(|s| s.parse::<u16>().ok())
.unwrap_or(DEFAULT_WEB_PORT);
let label = std::env::var("HIVE_LABEL").unwrap_or_else(|_| "hive-ag3nt".into());
let claude_dir = login::default_dir();
let initial = LoginState::from_dir(&claude_dir);
tracing::info!(state = ?initial, claude_dir = %claude_dir.display(), "harness boot");
let login_state = Arc::new(Mutex::new(initial));
let bus = Bus::new();
let stats = TurnStats::open_default();
if let Some(s) = &stats {
let (ctx, cost) = s.last_usage();
if ctx.is_some() || cost.is_some() {
bus.seed_usage(ctx, cost);
}
}
let files = turn::TurnFiles::prepare(&cli.socket, &label, mcp::Flavor::Agent).await?;
let turn_lock: TurnLock = Arc::new(tokio::sync::Mutex::new(()));
plugins::install_configured(&cli.socket, Some("manager")).await;
tokio::spawn(web_ui::serve(
label.clone(),
port,
login_state.clone(),
bus.clone(),
cli.socket.clone(),
files.clone(),
turn_lock.clone(),
));
match initial {
LoginState::Online => {
serve(
&cli.socket,
Duration::from_millis(poll_ms),
login_state,
bus,
stats,
&files,
turn_lock,
&label,
)
.await
}
LoginState::NeedsLogin => {
// Partial-run mode: keep the harness alive (so the web UI
// stays bound) but don't drive the turn loop. Poll the
// claude dir; once a session lands we enter `serve`.
turn::wait_for_login(&claude_dir, login_state.clone(), &bus, poll_ms).await;
serve(
&cli.socket,
Duration::from_millis(poll_ms),
login_state,
bus,
stats,
&files,
turn_lock,
&label,
)
.await
}
}
}
Cmd::Mcp => mcp::serve_agent_stdio(cli.socket).await,
Cmd::Wake { from, body } => {
// Read body from stdin if caller passed `-`. Same convention
// many CLI tools use; keeps multi-line / shell-quoting
// friction out of the body content.
let body = if body == "-" {
let mut buf = String::new();
std::io::Read::read_to_string(&mut std::io::stdin(), &mut buf)?;
buf
} else {
body
};
let resp: AgentResponse =
client::request(&cli.socket, &AgentRequest::Wake { from, body }).await?;
match resp {
AgentResponse::Ok => Ok(()),
AgentResponse::Err { message } => anyhow::bail!("wake: {message}"),
other => anyhow::bail!("wake: unexpected response {other:?}"),
}
}
}
}
/// Long-lived harness loop: long-polls the broker for inbox messages and
/// drives one claude turn per received message.
///
/// Per message it emits `TurnStart`, runs the turn while holding
/// `turn_lock`, emits the turn end, acks on success, parks and requeues on
/// rate-limit, notifies the manager on failure, and records a stats row
/// when `stats` is available.
///
/// * `socket` — per-agent socket used for every broker request.
/// * `interval` — pause applied after an empty or failed `Recv` before
///   polling again.
/// * `state` — currently unused; kept for future login-state transitions.
/// * `label` — agent label included in failure notifications.
///
/// The loop never exits on its own; the `Result` return type exists for
/// the caller's signature.
// `too_many_arguments`: the loop needs all of the harness's shared handles.
// `similar_names`: several locals are deliberately named after what they
// measure (`started_at` / `started_instant`, ...).
#[allow(clippy::too_many_arguments, clippy::similar_names)]
async fn serve(
    socket: &Path,
    interval: Duration,
    state: Arc<Mutex<LoginState>>,
    bus: Bus,
    stats: Option<TurnStats>,
    files: &turn::TurnFiles,
    turn_lock: TurnLock,
    label: &str,
) -> Result<()> {
    tracing::info!(socket = %socket.display(), "hive-ag3nt serve");
    let _ = state; // reserved for future state transitions (turn-loop -> needs-login)
    // Boot-time recovery: ask the broker to resurface anything we
    // popped in a previous harness session but never acked
    // (crashed mid-turn / OOM / container restart). The broker
    // resets `delivered_at = NULL` on those rows and remembers
    // their ids so the next `Recv` tags them `redelivered: true`;
    // we then prepend a "may already be handled" hint to the wake
    // prompt. Single shot before entering the serve loop; idempotent
    // when there's nothing inflight.
    requeue_inflight(socket).await;
    loop {
        // Explicit long-poll: the new agent_server semantics treat
        // `None` as "peek, don't wait", which would tight-loop on
        // sleep(interval). The harness wants to park until a
        // message arrives, so opt into the full 180s cap.
        // `max: None` (= 1) — the serve loop drives one turn per
        // wake; claude itself calls recv(max: N) in-turn to drain
        // a burst when the wake prompt mentions pending.
        let recv: Result<AgentResponse> = client::request(
            socket,
            &AgentRequest::Recv {
                wait_seconds: Some(180),
                max: None,
            },
        )
        .await;
        match recv {
            Ok(AgentResponse::Messages { messages }) if !messages.is_empty() => {
                let first = messages.into_iter().next().expect("checked non-empty");
                let from = first.from;
                let body = first.body;
                let redelivered = first.redelivered;
                tracing::info!(%from, %body, %redelivered, "inbox");
                let unread = inbox_unread(socket).await;
                bus.emit(LiveEvent::TurnStart {
                    from: from.clone(),
                    body: body.clone(),
                    unread,
                });
                bus.set_state(TurnState::Thinking);
                let started_at = now_unix();
                let started_instant = std::time::Instant::now();
                let model_at_start = bus.model();
                let prompt = format_wake_prompt(&from, &body, unread, redelivered);
                let outcome = {
                    // Hold the turn lock only for the duration of the turn itself.
                    let _guard = turn_lock.lock().await;
                    turn::drive_turn(&prompt, files, &bus).await
                };
                // Sample the end of the turn right here, before any
                // post-turn work. Sampling later would fold the rate-limit
                // park and the ack / notify round-trips below into the
                // recorded turn duration.
                let ended_at = now_unix();
                let duration_ms =
                    i64::try_from(started_instant.elapsed().as_millis()).unwrap_or(i64::MAX);
                turn::emit_turn_end(&bus, &outcome);
                bus.set_state(TurnState::Idle);
                // Ack only on a clean turn-end. `Failed` leaves every
                // message popped during the turn in the unacked list;
                // next harness boot's `RequeueInflight` will reset
                // `delivered_at = NULL` and tag them `redelivered`.
                // `PromptTooLong` is absorbed inside `drive_turn` via
                // compaction so it shouldn't reach here, but if it
                // does we also skip the ack (safer to redeliver than
                // to lose the message).
                if matches!(outcome, turn::TurnOutcome::Ok) {
                    ack_turn(socket).await;
                }
                // Rate-limited: park until the quota resets, then requeue
                // the unacked message so it resurfaces in the same session.
                if matches!(outcome, turn::TurnOutcome::RateLimited) {
                    let secs = turn::rate_limit_sleep_secs();
                    bus.emit_status("rate_limited");
                    bus.emit(LiveEvent::Note {
                        text: format!("API rate-limited — sleeping {secs}s before retry"),
                    });
                    tracing::warn!(sleep_secs = secs, "rate-limited; parking");
                    tokio::time::sleep(Duration::from_secs(secs)).await;
                    requeue_inflight(socket).await;
                    bus.emit_status("online");
                }
                // Failures are unhandled by definition — PromptTooLong is
                // absorbed inside drive_turn via compaction, so anything
                // that reaches Failed here is a real crash. Notify the
                // manager so it can investigate / restart / page the
                // operator; best-effort, swallow the send error.
                if let turn::TurnOutcome::Failed(e) = &outcome {
                    notify_manager_of_failure(socket, label, e).await;
                }
                if let Some(s) = &stats {
                    let (open_threads, open_reminders) =
                        fetch_agent_post_turn_counts(socket).await;
                    // `from` is not needed after this point, so hand it to
                    // the row instead of cloning it.
                    let row = build_row(
                        started_at,
                        ended_at,
                        duration_ms,
                        model_at_start,
                        from,
                        &outcome,
                        &bus,
                        open_threads,
                        open_reminders,
                    );
                    s.record(&row);
                }
                // After turn completes, log whether messages arrived during
                // the turn — the outer loop will iterate back to recv() on
                // its own (the sleeps below only fire when recv returned
                // nothing usable), so no explicit continue needed.
                let pending = inbox_unread(socket).await;
                if pending > 0 {
                    tracing::info!(%pending, "pending messages after turn; fetching next");
                }
            }
            Ok(AgentResponse::Messages { .. }) => {
                // Idle: empty list = nothing pending. Brief sleep
                // before next poll so a stretch of empty long-poll
                // returns doesn't tight-loop.
                tokio::time::sleep(interval).await;
            }
            // Listed explicitly (no wildcard) so a new response variant
            // forces a decision here at compile time.
            Ok(
                AgentResponse::Ok
                | AgentResponse::Status { .. }
                | AgentResponse::Recent { .. }
                | AgentResponse::QuestionQueued { .. }
                | AgentResponse::LooseEnds { .. }
                | AgentResponse::PendingRemindersCount { .. }
                | AgentResponse::ReminderRollup { .. }
                | AgentResponse::Whoami { .. },
            ) => {
                tracing::warn!("recv produced unexpected response kind");
                // Back off: these replies return immediately, so retrying
                // without a pause would spin the loop and flood the log.
                tokio::time::sleep(interval).await;
            }
            Ok(AgentResponse::Err { message }) => {
                tracing::warn!(%message, "recv error");
                // Same back-off as above for broker-side errors.
                tokio::time::sleep(interval).await;
            }
            Err(e) => {
                tracing::warn!(error = ?e, "recv failed; retrying");
                // A transport failure (e.g. the socket is gone while the
                // broker restarts) fails fast; pause before retrying so we
                // don't hot-spin until it comes back.
                tokio::time::sleep(interval).await;
            }
        }
    }
}
/// Builds the per-turn user prompt — the wake signal claude reacts to.
/// Role, tools and the rest live in the system prompt
/// (`prompts/agent.md` → `claude --system-prompt-file`), not here.
///
/// * `from` / `body` — sender label and text of the message that woke us.
/// * `unread` — number of *other* messages still in the inbox right after
///   this one was popped; when non-zero a trailing note tells claude how
///   to drain them in one call.
/// * `redelivered` — set for messages popped in a prior harness session,
///   never acked, and resurfaced after a restart. The redelivery hint is
///   then placed at the very top of the prompt to warn that side-effects
///   of earlier handling may already have happened.
fn format_wake_prompt(from: &str, body: &str, unread: u64, redelivered: bool) -> String {
    // Trailing note about the rest of the inbox; empty when nothing waits.
    let pending_hint = match unread {
        0 => String::new(),
        _ => format!(
            "\n\n({unread} more message(s) pending in your inbox — call `mcp__hyperhive__recv` \
             with `max: {unread}` to drain them all in one round-trip before acting.)"
        ),
    };
    let message = format!("Incoming message from `{from}`:\n---\n{body}\n---{pending_hint}");
    if redelivered {
        format!("{}{message}", hive_ag3nt::mcp::REDELIVERY_HINT)
    } else {
        message
    }
}
/// Best-effort: tell the broker every message we popped during the
/// turn is now fully handled (turn-end-OK). Swallows transport
/// errors — the worst case is a redundant requeue on next boot.
async fn ack_turn(socket: &Path) {
match client::request::<_, AgentResponse>(socket, &AgentRequest::AckTurn).await {
Ok(AgentResponse::Ok) => {}
Ok(AgentResponse::Err { message }) => {
tracing::warn!(%message, "ack_turn rejected by broker");
}
Ok(other) => {
tracing::warn!(?other, "ack_turn unexpected response");
}
Err(e) => tracing::warn!(error = ?e, "ack_turn transport error"),
}
}
/// Boot-time recovery: ask the broker to resurface anything we
/// popped in a previous harness session but never acked. The broker
/// resets `delivered_at = NULL` on those rows and remembers their
/// ids so the next `Recv` carries `redelivered: true`. Swallows
/// transport errors — they degrade to "no recovery this boot",
/// which is no worse than the pre-feature behaviour (silent drop).
async fn requeue_inflight(socket: &Path) {
match client::request::<_, AgentResponse>(socket, &AgentRequest::RequeueInflight).await {
Ok(AgentResponse::Ok) => {}
Ok(AgentResponse::Err { message }) => {
tracing::warn!(%message, "requeue_inflight rejected by broker");
}
Ok(other) => {
tracing::warn!(?other, "requeue_inflight unexpected response");
}
Err(e) => tracing::warn!(error = ?e, "requeue_inflight transport error"),
}
}
/// Best-effort: tell the manager that this agent's last turn crashed
/// (claude exited non-zero, compaction didn't help, etc.). Routed
/// through the normal send path so the manager's inbox surfaces it
/// as a system-style event; `label` is included explicitly in the
/// body so the manager can identify the failing agent without having
/// to look at the `from` field (which is broker-stamped and may
/// differ from what the operator sees in the dashboard). Swallows
/// transport errors — we just logged the failure, the worst case is
/// the manager learns about the crash from the dashboard instead of
/// inbox.
async fn notify_manager_of_failure(socket: &Path, label: &str, err: &anyhow::Error) {
let body = format!("[system] agent `{label}` claude turn failed:\n{err:#}");
let res = client::request::<_, AgentResponse>(
socket,
&AgentRequest::Send {
to: "manager".into(),
body,
},
)
.await;
if let Err(e) = res {
tracing::warn!(error = ?e, "failed to notify manager of turn failure");
}
}
/// Best-effort unread count: sends a `Status` request over our own
/// per-agent socket and returns the `unread` value it reports.
///
/// Any failure — transport error or a reply that is not `Status` — is
/// reported as `0`.
async fn inbox_unread(socket: &Path) -> u64 {
    let reply = client::request::<_, AgentResponse>(socket, &AgentRequest::Status).await;
    if let Ok(AgentResponse::Status { unread }) = reply {
        unread
    } else {
        0
    }
}
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns `0` when the system clock reads earlier than the epoch or the
/// second count does not fit in an `i64`.
fn now_unix() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => i64::try_from(since_epoch.as_secs()).unwrap_or(0),
        Err(_) => 0,
    }
}
/// Best-effort post-turn snapshot: asks the broker for this agent's open
/// thread count (`GetLooseEnds`) and pending reminder count
/// (`CountPendingReminders`), in that order.
///
/// Returns `(open_threads, pending_reminders)`. Each round-trip can fail
/// independently (transport hiccup, race with a hive-c0re restart, or an
/// unexpected reply); a failed one yields `None` for its slot instead of
/// blocking the loop.
async fn fetch_agent_post_turn_counts(socket: &Path) -> (Option<u64>, Option<u64>) {
    let loose_ends_reply =
        client::request::<_, AgentResponse>(socket, &AgentRequest::GetLooseEnds).await;
    let threads = if let Ok(AgentResponse::LooseEnds { loose_ends }) = loose_ends_reply {
        u64::try_from(loose_ends.len()).ok()
    } else {
        None
    };

    let reminders_reply =
        client::request::<_, AgentResponse>(socket, &AgentRequest::CountPendingReminders).await;
    let reminders = if let Ok(AgentResponse::PendingRemindersCount { count }) = reminders_reply {
        Some(count)
    } else {
        None
    };

    (threads, reminders)
}
/// Packs the harness's per-turn state into a `TurnStatRow`.
///
/// Token counters come from the bus: the cost usage (defaulted when the
/// bus has none) fills the plain `*_tokens` columns, and the ctx usage —
/// falling back to the cost usage — fills the `last_*` columns. Tool-call
/// counts are *taken* from the bus, so calling this consumes them. The
/// `note` column is only set for a failed turn and carries the rendered
/// error chain.
///
/// The agent and manager bin loops each carry their own copy of this
/// helper (each lives in its own crate root); at this size a shared
/// module isn't worth the cross-crate ceremony.
// `too_many_arguments`: a flat parameter list mirrors the flat row.
#[allow(clippy::too_many_arguments)]
fn build_row(
    started_at: i64,
    ended_at: i64,
    duration_ms: i64,
    model: String,
    wake_from: String,
    outcome: &turn::TurnOutcome,
    bus: &Bus,
    open_threads_count: Option<u64>,
    open_reminders_count: Option<u64>,
) -> TurnStatRow {
    let turn_usage = bus.last_cost_usage().unwrap_or_default();
    let last_usage = bus.last_ctx_usage().unwrap_or(turn_usage);

    let per_tool = bus.take_tool_calls();
    let tool_call_count: u64 = per_tool.values().copied().sum();
    // Store no breakdown at all (rather than an empty JSON object) when
    // the turn made no tool calls.
    let tool_call_breakdown_json = (!per_tool.is_empty())
        .then(|| serde_json::to_string(&per_tool).ok())
        .flatten();

    let result_kind = match outcome {
        turn::TurnOutcome::Ok => "ok",
        turn::TurnOutcome::PromptTooLong => "prompt_too_long",
        turn::TurnOutcome::RateLimited => "rate_limited",
        turn::TurnOutcome::Failed(_) => "failed",
    };
    let note = if let turn::TurnOutcome::Failed(e) = outcome {
        Some(format!("{e:#}"))
    } else {
        None
    };

    TurnStatRow {
        started_at,
        ended_at,
        duration_ms,
        model,
        wake_from,
        input_tokens: turn_usage.input_tokens,
        output_tokens: turn_usage.output_tokens,
        cache_read_input_tokens: turn_usage.cache_read_input_tokens,
        cache_creation_input_tokens: turn_usage.cache_creation_input_tokens,
        last_input_tokens: last_usage.input_tokens,
        last_output_tokens: last_usage.output_tokens,
        last_cache_read_input_tokens: last_usage.cache_read_input_tokens,
        last_cache_creation_input_tokens: last_usage.cache_creation_input_tokens,
        tool_call_count,
        tool_call_breakdown_json,
        open_threads_count,
        open_reminders_count,
        result_kind,
        note,
    }
}