hyperhive/hive-ag3nt/src/bin/hive-ag3nt.rs

use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use std::time::Duration;

use hive_ag3nt::web_ui::TurnLock;

use anyhow::Result;
use clap::{Parser, Subcommand};
use hive_ag3nt::events::{Bus, LiveEvent, TurnState};
use hive_ag3nt::login::{self, LoginState};
use hive_ag3nt::turn_stats::{TurnStatRow, TurnStats};
use hive_ag3nt::{DEFAULT_SOCKET, DEFAULT_WEB_PORT, client, mcp, plugins, turn, web_ui};
use hive_sh4re::{AgentRequest, AgentResponse};

#[derive(Parser)]
#[command(name = "hive-ag3nt", about = "hyperhive sub-agent harness")]
struct Cli {
    /// Path to the per-agent MCP socket (bind-mounted from the host).
    #[arg(long, global = true, default_value = DEFAULT_SOCKET)]
    socket: PathBuf,

    #[command(subcommand)]
    cmd: Cmd,
}

#[derive(Subcommand)]
enum Cmd {
    /// Run the long-lived harness loop. Polls inbox; replies via `claude --print`
    /// when available, falling back to a simple echo otherwise.
    Serve {
        /// Inbox poll interval in milliseconds.
        #[arg(long, default_value_t = 1000)]
        poll_ms: u64,
    },
    /// Run the agent's MCP server on stdio. Spawned by `claude` via
    /// `--mcp-config`; tools dispatch through `/run/hive/mcp.sock` back into
    /// the hyperhive broker.
    Mcp,
    /// Inject a wake-up event into this agent's inbox so the next turn
    /// fires with the given body. Intended for extra MCP servers /
    /// helpers running inside the container (matrix bridge, scraper,
    /// webhook listener) that need to nudge claude on external events.
    /// `from` is the sender label that appears in the wake prompt
    /// (claude sees "from: matrix" etc.).
    Wake {
        #[arg(long)]
        from: String,
        /// Body of the wake message. Pass `-` to read from stdin.
        #[arg(long)]
        body: String,
    },
}

#[tokio::main]
async fn main() -> Result<()> {
    tracing_subscriber::fmt()
        .with_env_filter(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
        )
        .init();

    let cli = Cli::parse();
    match cli.cmd {
        Cmd::Serve { poll_ms } => {
            let port = std::env::var("HIVE_PORT")
                .ok()
                .and_then(|s| s.parse::<u16>().ok())
                .unwrap_or(DEFAULT_WEB_PORT);
            let label = std::env::var("HIVE_LABEL").unwrap_or_else(|_| "hive-ag3nt".into());
            let claude_dir = login::default_dir();
            let initial = LoginState::from_dir(&claude_dir);
            tracing::info!(state = ?initial, claude_dir = %claude_dir.display(), "harness boot");
            let login_state = Arc::new(Mutex::new(initial));
            let bus = Bus::new();
            let stats = TurnStats::open_default();
            if let Some(s) = &stats {
                let (ctx, cost) = s.last_usage();
                if ctx.is_some() || cost.is_some() {
                    bus.seed_usage(ctx, cost);
                }
            }
            let files = turn::TurnFiles::prepare(&cli.socket, &label, mcp::Flavor::Agent).await?;
            let turn_lock: TurnLock = Arc::new(tokio::sync::Mutex::new(()));
            plugins::install_configured(&cli.socket, Some("manager")).await;
            tokio::spawn(web_ui::serve(
                label.clone(),
                port,
                login_state.clone(),
                bus.clone(),
                cli.socket.clone(),
                files.clone(),
                turn_lock.clone(),
            ));
            match initial {
                LoginState::Online => {
                    serve(
                        &cli.socket,
                        Duration::from_millis(poll_ms),
                        login_state,
                        bus,
                        stats,
                        &files,
                        turn_lock,
                        &label,
                    )
                    .await
                }
                LoginState::NeedsLogin => {
                    // Partial-run mode: keep the harness alive (so the web UI
                    // stays bound) but don't drive the turn loop. Poll the
                    // claude dir; once a session lands we enter `serve`.
                    turn::wait_for_login(&claude_dir, login_state.clone(), &bus, poll_ms).await;
                    serve(
                        &cli.socket,
                        Duration::from_millis(poll_ms),
                        login_state,
                        bus,
                        stats,
                        &files,
                        turn_lock,
                        &label,
                    )
                    .await
                }
            }
        }
        Cmd::Mcp => mcp::serve_agent_stdio(cli.socket).await,
        Cmd::Wake { from, body } => {
            // Read body from stdin if caller passed `-`. Same convention
            // many CLI tools use; keeps multi-line / shell-quoting
            // friction out of the body content.
            let body = if body == "-" {
                let mut buf = String::new();
                std::io::Read::read_to_string(&mut std::io::stdin(), &mut buf)?;
                buf
            } else {
                body
            };
            let resp: AgentResponse =
                client::request(&cli.socket, &AgentRequest::Wake { from, body }).await?;
            match resp {
                AgentResponse::Ok => Ok(()),
                AgentResponse::Err { message } => anyhow::bail!("wake: {message}"),
                other => anyhow::bail!("wake: unexpected response {other:?}"),
            }
        }
    }
}

#[allow(clippy::too_many_arguments, clippy::similar_names)]
async fn serve(
    socket: &Path,
    interval: Duration,
    state: Arc<Mutex<LoginState>>,
    bus: Bus,
    stats: Option<TurnStats>,
    files: &turn::TurnFiles,
    turn_lock: TurnLock,
    label: &str,
) -> Result<()> {
    tracing::info!(socket = %socket.display(), "hive-ag3nt serve");
    let _ = state; // reserved for future state transitions (turn-loop -> needs-login)
    // Boot-time recovery: ask the broker to resurface anything we
    // popped in a previous harness session but never acked
    // (crashed mid-turn / OOM / container restart). The broker
    // resets `delivered_at = NULL` on those rows and remembers
    // their ids so the next `Recv` tags them `redelivered: true`;
    // we then prepend a "may already be handled" hint to the wake
    // prompt. Single shot before entering the serve loop; idempotent
    // when there's nothing inflight.
    requeue_inflight(socket).await;
    loop {
        let recv: Result<AgentResponse> =
            // Explicit long-poll: the new agent_server semantics treat
            // `None` as "peek, don't wait", which would tight-loop on
            // sleep(interval). The harness wants to park until a
            // message arrives, so opt into the full 180s cap.
            // `max: None` (= 1) — the serve loop drives one turn per
            // wake; claude itself calls recv(max: N) in-turn to drain
            // a burst when the wake prompt mentions pending.
            client::request(
                socket,
                &AgentRequest::Recv {
                    wait_seconds: Some(180),
                    max: None,
                },
            )
            .await;
        match recv {
            Ok(AgentResponse::Messages { messages }) if !messages.is_empty() => {
                let first = messages.into_iter().next().expect("checked non-empty");
                let from = first.from;
                let body = first.body;
                let redelivered = first.redelivered;
                tracing::info!(%from, %body, %redelivered, "inbox");
                let unread = inbox_unread(socket).await;
                bus.emit(LiveEvent::TurnStart {
                    from: from.clone(),
                    body: body.clone(),
                    unread,
                });
                bus.set_state(TurnState::Thinking);
                let started_at = now_unix();
                let started_instant = std::time::Instant::now();
                let model_at_start = bus.model();
                let prompt = format_wake_prompt(&from, &body, unread, redelivered);
                let outcome = {
                    let _guard = turn_lock.lock().await;
                    turn::drive_turn(&prompt, files, &bus).await
                };
                turn::emit_turn_end(&bus, &outcome);
                bus.set_state(TurnState::Idle);
                // Ack only on a clean turn-end. `Failed` leaves every
                // message popped during the turn in the unacked list;
                // next harness boot's `RequeueInflight` will reset
                // `delivered_at = NULL` and tag them `redelivered`.
                // `PromptTooLong` is absorbed inside `drive_turn` via
                // compaction so it shouldn't reach here, but if it
                // does we also skip the ack (safer to redeliver than
                // to lose the message).
                if matches!(outcome, turn::TurnOutcome::Ok) {
                    ack_turn(socket).await;
                }
                // Rate-limited: park until the quota resets, then requeue
                // the unacked message so it resurfaces in the same session.
                if matches!(outcome, turn::TurnOutcome::RateLimited) {
                    let secs = turn::rate_limit_sleep_secs();
                    bus.emit_status("rate_limited");
                    bus.emit(LiveEvent::Note {
                        text: format!(
                            "API rate-limited — sleeping {secs}s before retry"
                        ),
                    });
                    tracing::warn!(sleep_secs = secs, "rate-limited; parking");
                    tokio::time::sleep(Duration::from_secs(secs)).await;
                    requeue_inflight(socket).await;
                    bus.emit_status("online");
                }
                // Failures are unhandled by definition — PromptTooLong is
                // absorbed inside drive_turn via compaction, so anything
                // that reaches Failed here is a real crash. Notify the
                // manager so it can investigate / restart / page the
                // operator; best-effort, swallow the send error.
                if let turn::TurnOutcome::Failed(e) = &outcome {
                    notify_manager_of_failure(socket, label, e).await;
                }
                if let Some(s) = &stats {
                    let ended_at = now_unix();
                    let duration_ms =
                        i64::try_from(started_instant.elapsed().as_millis()).unwrap_or(i64::MAX);
                    let (open_threads, open_reminders) = fetch_agent_post_turn_counts(socket).await;
                    let row = build_row(
                        started_at,
                        ended_at,
                        duration_ms,
                        model_at_start,
                        from.clone(),
                        &outcome,
                        &bus,
                        open_threads,
                        open_reminders,
                    );
                    s.record(&row);
                }

                // After turn completes, log whether messages arrived during
                // the turn — the outer loop will iterate back to recv() on
                // its own (the Empty-arm sleep only fires when recv
                // actually returned Empty), so no explicit continue needed.
                let pending = inbox_unread(socket).await;
                if pending > 0 {
                    tracing::info!(%pending, "pending messages after turn; fetching next");
                }
            }
            Ok(AgentResponse::Messages { .. }) => {
                // Idle: empty list = nothing pending. Brief sleep
                // before next poll so a stretch of empty long-poll
                // returns doesn't tight-loop.
                tokio::time::sleep(interval).await;
            }
            Ok(
                AgentResponse::Ok
                | AgentResponse::Status { .. }
                | AgentResponse::Recent { .. }
                | AgentResponse::QuestionQueued { .. }
                | AgentResponse::LooseEnds { .. }
                | AgentResponse::PendingRemindersCount { .. }
                | AgentResponse::ReminderRollup { .. }
                | AgentResponse::Whoami { .. },
            ) => {
                tracing::warn!("recv produced unexpected response kind");
            }
            Ok(AgentResponse::Err { message }) => {
                tracing::warn!(%message, "recv error");
            }
            Err(e) => {
                tracing::warn!(error = ?e, "recv failed; retrying");
            }
        }
    }
}

/// Per-turn user prompt. The role/tools/etc. is in the system prompt
/// (`prompts/agent.md` → `claude --system-prompt-file`); this is just the
/// wake signal claude reacts to. `unread` is the count of *other*
/// messages in the inbox right after this one was popped.
/// `redelivered` flags messages that were popped in a prior harness
/// session, never acked, and resurfaced after a restart — a banner
/// at the top of the wake prompt warns that any side-effects of
/// previous handling may already have happened.
fn format_wake_prompt(from: &str, body: &str, unread: u64, redelivered: bool) -> String {
    let banner = if redelivered {
        hive_ag3nt::mcp::REDELIVERY_HINT
    } else {
        ""
    };
    let pending = if unread == 0 {
        String::new()
    } else {
        format!(
            "\n\n({unread} more message(s) pending in your inbox — call `mcp__hyperhive__recv` \
             with `max: {unread}` to drain them all in one round-trip before acting.)"
        )
    };
    format!("{banner}Incoming message from `{from}`:\n---\n{body}\n---{pending}")
}

/// Best-effort: tell the broker every message we popped during the
/// turn is now fully handled (turn-end-OK). Swallows transport
/// errors — the worst case is a redundant requeue on next boot.
async fn ack_turn(socket: &Path) {
    match client::request::<_, AgentResponse>(socket, &AgentRequest::AckTurn).await {
        Ok(AgentResponse::Ok) => {}
        Ok(AgentResponse::Err { message }) => {
            tracing::warn!(%message, "ack_turn rejected by broker");
        }
        Ok(other) => {
            tracing::warn!(?other, "ack_turn unexpected response");
        }
        Err(e) => tracing::warn!(error = ?e, "ack_turn transport error"),
    }
}

/// Boot-time recovery: ask the broker to resurface anything we
/// popped in a previous harness session but never acked. The broker
/// resets `delivered_at = NULL` on those rows and remembers their
/// ids so the next `Recv` carries `redelivered: true`. Swallows
/// transport errors — they degrade to "no recovery this boot",
/// which is no worse than the pre-feature behaviour (silent drop).
async fn requeue_inflight(socket: &Path) {
    match client::request::<_, AgentResponse>(socket, &AgentRequest::RequeueInflight).await {
        Ok(AgentResponse::Ok) => {}
        Ok(AgentResponse::Err { message }) => {
            tracing::warn!(%message, "requeue_inflight rejected by broker");
        }
        Ok(other) => {
            tracing::warn!(?other, "requeue_inflight unexpected response");
        }
        Err(e) => tracing::warn!(error = ?e, "requeue_inflight transport error"),
    }
}

/// Best-effort: tell the manager that this agent's last turn crashed
/// (claude exited non-zero, compaction didn't help, etc.). Routed
/// through the normal send path so the manager's inbox surfaces it
/// as a system-style event; `label` is included explicitly in the
/// body so the manager can identify the failing agent without having
/// to look at the `from` field (which is broker-stamped and may
/// differ from what the operator sees in the dashboard). Swallows
/// transport errors — we just logged the failure, the worst case is
/// the manager learns about the crash from the dashboard instead of
/// inbox.
async fn notify_manager_of_failure(socket: &Path, label: &str, err: &anyhow::Error) {
    let body = format!("[system] agent `{label}` claude turn failed:\n{err:#}");
    let res = client::request::<_, AgentResponse>(
        socket,
        &AgentRequest::Send {
            to: "manager".into(),
            body,
        },
    )
    .await;
    if let Err(e) = res {
        tracing::warn!(error = ?e, "failed to notify manager of turn failure");
    }
}

/// Best-effort: ask our own per-agent socket how many messages are still
/// pending after the wake-up Recv. Returns 0 if anything goes wrong.
async fn inbox_unread(socket: &Path) -> u64 {
    match client::request::<_, AgentResponse>(socket, &AgentRequest::Status).await {
        Ok(AgentResponse::Status { unread }) => unread,
        _ => 0,
    }
}

fn now_unix() -> i64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .ok()
        .and_then(|d| i64::try_from(d.as_secs()).ok())
        .unwrap_or(0)
}

/// Best-effort: ask hive-c0re for this agent's open thread count + pending
/// reminder count, after the turn finishes. Either roundtrip can fail
/// (transport hiccup, race with hive-c0re restart) — in those cases we
/// just drop a `None` into the stats row rather than blocking the loop.
async fn fetch_agent_post_turn_counts(socket: &Path) -> (Option<u64>, Option<u64>) {
    let threads = match client::request::<_, AgentResponse>(
        socket,
        &AgentRequest::GetLooseEnds,
    )
    .await
    {
        Ok(AgentResponse::LooseEnds { loose_ends }) => u64::try_from(loose_ends.len()).ok(),
        _ => None,
    };
    let reminders = match client::request::<_, AgentResponse>(
        socket,
        &AgentRequest::CountPendingReminders,
    )
    .await
    {
        Ok(AgentResponse::PendingRemindersCount { count }) => Some(count),
        _ => None,
    };
    (threads, reminders)
}

/// Assemble a `TurnStatRow` from the harness's per-turn state. Shared
/// shape between the agent + manager bin loops (each lives in its own
/// crate root so this helper is duplicated; the savings of a shared
/// module aren't worth the cross-crate ceremony at this size).
#[allow(clippy::too_many_arguments)]
fn build_row(
    started_at: i64,
    ended_at: i64,
    duration_ms: i64,
    model: String,
    wake_from: String,
    outcome: &turn::TurnOutcome,
    bus: &Bus,
    open_threads_count: Option<u64>,
    open_reminders_count: Option<u64>,
) -> TurnStatRow {
    let cost = bus.last_cost_usage().unwrap_or_default();
    let ctx = bus.last_ctx_usage().unwrap_or(cost);
    let tool_calls = bus.take_tool_calls();
    let tool_call_count: u64 = tool_calls.values().copied().sum();
    let tool_call_breakdown_json = if tool_calls.is_empty() {
        None
    } else {
        serde_json::to_string(&tool_calls).ok()
    };
    let (result_kind, note) = match outcome {
        turn::TurnOutcome::Ok => ("ok", None),
        turn::TurnOutcome::PromptTooLong => ("prompt_too_long", None),
        turn::TurnOutcome::RateLimited => ("rate_limited", None),
        turn::TurnOutcome::Failed(e) => ("failed", Some(format!("{e:#}"))),
    };
    TurnStatRow {
        started_at,
        ended_at,
        duration_ms,
        model,
        wake_from,
        input_tokens: cost.input_tokens,
        output_tokens: cost.output_tokens,
        cache_read_input_tokens: cost.cache_read_input_tokens,
        cache_creation_input_tokens: cost.cache_creation_input_tokens,
        last_input_tokens: ctx.input_tokens,
        last_output_tokens: ctx.output_tokens,
        last_cache_read_input_tokens: ctx.cache_read_input_tokens,
        last_cache_creation_input_tokens: ctx.cache_creation_input_tokens,
        tool_call_count,
        tool_call_breakdown_json,
        open_threads_count,
        open_reminders_count,
        result_kind,
        note,
    }
}