hyperhive/hive-ag3nt/src/turn.rs

//! Per-turn claude invocation shared by `hive-ag3nt` and `hive-m1nd`. The
//! two binaries differ only in their MCP `Flavor` (agent surface vs.
//! manager surface) and their wake-prompt wording; the spawn shape,
//! arg-vector, stdin plumbing, and stream-json pumping are identical.

use std::collections::VecDeque;
use std::path::{Path, PathBuf};
use std::process::Stdio;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;

use anyhow::{Result, bail};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use tokio::process::Command;

use crate::events::{Bus, LiveEvent};
use crate::login::{self, LoginState};
use crate::mcp;

/// `--settings` JSON applied to every claude invocation. Lives as a
/// properly-formatted file in `prompts/claude-settings.json` so it's easy
/// to read and edit; the text is embedded at compile time via
/// `include_str!` and written back out to disk by `write_settings`.
/// We turn off claude's in-session auto-compaction and its cross-session
/// auto-memory because hyperhive owns those concerns (`/compact` on
/// overflow, notes persistence under `/state`). Unknown keys are silently
/// ignored by claude-code; if a key gets renamed we'll spot it because
/// the corresponding behavior will start firing mid-turn again.
const CLAUDE_SETTINGS: &str = include_str!("../prompts/claude-settings.json");

/// Literal substring claude-code emits when context overflows. It is
/// matched with `str::contains` against every stdout and stderr line of
/// the child (a plain substring test, not a regex). Same string
/// bitburner-agent watches for. Empirically reliable across claude-code
/// versions; if it ever changes, compaction won't fire and we'll see a
/// claude exit with a useful error in the live view.
const PROMPT_TOO_LONG_MARKER: &str = "Prompt is too long";

/// The set of files claude reads on every invocation: the MCP server
/// config (`--mcp-config`), static settings (`--settings`), and the
/// pre-rendered role/tools system prompt (`--system-prompt-file`).
/// Materialised once at harness startup; shared between the turn loop
/// and the operator-driven `/compact` path so both invocations look
/// identical to claude (same MCP surface, same allowed tools, same
/// role prompt — only the stdin payload differs).
#[derive(Clone)]
pub struct TurnFiles {
    /// Path handed to claude as the `--mcp-config` argument.
    pub mcp_config: PathBuf,
    /// Path handed to claude as the `--settings` argument.
    pub settings: PathBuf,
    /// Path handed to claude as the `--system-prompt-file` argument.
    pub system_prompt: PathBuf,
    /// Agent vs. manager surface. Selects the prompt template when the
    /// system prompt is rendered and the `--allowedTools` list when
    /// claude is spawned.
    pub flavor: mcp::Flavor,
}

impl TurnFiles {
    /// Materialise the MCP config, the settings file and the system
    /// prompt in the directory that holds `socket`, and bundle their
    /// paths together with `flavor`.
    ///
    /// Safe to call repeatedly: each file is simply rewritten.
    ///
    /// # Errors
    /// Returns the first error reported by any of the three writers; the
    /// remaining files are not attempted after a failure.
    pub async fn prepare(socket: &Path, label: &str, flavor: mcp::Flavor) -> Result<Self> {
        let mcp_config = write_mcp_config(socket).await?;
        let settings = write_settings(socket).await?;
        let system_prompt = write_system_prompt(socket, label, flavor).await?;
        Ok(Self {
            mcp_config,
            settings,
            system_prompt,
            flavor,
        })
    }
}

/// Drop the MCP config blob claude reads from `--mcp-config <path>`.
/// `socket` is the hyperhive per-container socket (forwarded to the child
/// as `--socket <path>`); `binary_subcommand` is e.g. `"mcp"` for sub-agents
/// or `"mcp"` for the manager (both binaries name their MCP subcommand the
/// same — the differentiator is which binary `/proc/self/exe` resolves to).
pub async fn write_mcp_config(socket: &Path) -> Result<PathBuf> {
    let parent = socket.parent().unwrap_or_else(|| Path::new("/run/hive"));
    tokio::fs::create_dir_all(parent).await.ok();
    let path = parent.join("claude-mcp-config.json");
    let exe = std::env::current_exe()
        .ok()
        .map_or_else(|| "hive-ag3nt".into(), |p| p.display().to_string());
    let body = mcp::render_claude_config(&exe, socket);
    tokio::fs::write(&path, body).await?;
    tracing::info!(path = %path.display(), "wrote claude MCP config");
    Ok(path)
}

/// Drop the static `--settings` JSON next to the MCP config so we can
/// pass a path (`--settings <file>`) instead of an ever-growing inline
/// blob — the CLI argv has a finite length budget.
pub async fn write_settings(socket: &Path) -> Result<PathBuf> {
    let parent = socket.parent().unwrap_or_else(|| Path::new("/run/hive"));
    tokio::fs::create_dir_all(parent).await.ok();
    let path = parent.join("claude-settings.json");
    tokio::fs::write(&path, CLAUDE_SETTINGS).await?;
    tracing::info!(path = %path.display(), "wrote claude settings");
    Ok(path)
}

/// Write the agent's / manager's static system prompt to a file next to
/// the MCP config and return the path. Passed to claude via
/// `--system-prompt-file`, replacing claude's default system prompt with
/// the role + tools instructions. Per-turn prompts become much smaller
/// (just the wake message body).
pub async fn write_system_prompt(
    socket: &Path,
    label: &str,
    flavor: mcp::Flavor,
) -> Result<PathBuf> {
    let parent = socket.parent().unwrap_or_else(|| Path::new("/run/hive"));
    tokio::fs::create_dir_all(parent).await.ok();
    let template = match flavor {
        mcp::Flavor::Agent => include_str!("../prompts/agent.md"),
        mcp::Flavor::Manager => include_str!("../prompts/manager.md"),
    };
    let pronouns = std::env::var("HIVE_OPERATOR_PRONOUNS").unwrap_or_else(|_| "she/her".to_owned());
    let body = template
        .replace("{label}", label)
        .replace("{operator_pronouns}", &pronouns);
    let path = parent.join("claude-system-prompt.md");
    tokio::fs::write(&path, body).await?;
    tracing::info!(path = %path.display(), "wrote claude system prompt");
    Ok(path)
}

/// One claude turn's outcome. The harness uses this to decide whether to
/// transparently kick off a compaction and retry.
#[derive(Debug)]
pub enum TurnOutcome {
    /// claude ran to completion and the overflow marker was not seen.
    Ok,
    /// claude saw "Prompt is too long" — the session needs compacting.
    /// Run `compact_session()` then retry the same wake-up prompt.
    PromptTooLong,
    /// The invocation (or the compaction preceding a retry) returned an
    /// error; the payload carries the full error chain.
    Failed(anyhow::Error),
}

/// Run a single turn to completion on behalf of the sub-agent or manager
/// loop, recovering from context overflow at most once.
///
/// Any outcome other than `PromptTooLong` from the first attempt is
/// returned untouched. On `PromptTooLong` the session is compacted and
/// the same `prompt` is replayed exactly once; whatever that second
/// attempt yields is returned as-is. A compaction error is logged and
/// surfaced as `TurnOutcome::Failed`.
pub async fn drive_turn(prompt: &str, files: &TurnFiles, bus: &Bus) -> TurnOutcome {
    let first_attempt = run_turn(prompt, files, bus).await;
    if !matches!(first_attempt, TurnOutcome::PromptTooLong) {
        return first_attempt;
    }
    match compact_session(files, bus).await {
        Ok(()) => run_turn(prompt, files, bus).await,
        Err(e) => {
            tracing::warn!(error = %format!("{e:#}"), "compact failed");
            TurnOutcome::Failed(e)
        }
    }
}

/// Emit the per-turn `TurnEnd` event + log line. Single owner so the
/// agent and manager loops agree on outcome semantics.
pub fn emit_turn_end(bus: &Bus, outcome: &TurnOutcome) {
    match outcome {
        TurnOutcome::Ok | TurnOutcome::PromptTooLong => {
            bus.emit(LiveEvent::TurnEnd {
                ok: true,
                note: None,
            });
            tracing::info!("turn finished");
        }
        TurnOutcome::Failed(e) => {
            let note = format!("{e:#}");
            bus.emit(LiveEvent::TurnEnd {
                ok: false,
                note: Some(note.clone()),
            });
            tracing::warn!(error = %note, "turn failed");
        }
    }
}

/// Park the caller until `login::has_session(claude_dir)` reports a
/// session, then mark `state` as `LoginState::Online` and return so the
/// caller can resume its serve loop.
///
/// The directory is probed every `poll_ms` milliseconds, clamped to a
/// minimum of two seconds. The first probe happens only after one full
/// interval has elapsed.
///
/// # Panics
/// Panics if the `state` mutex is poisoned.
pub async fn wait_for_login(claude_dir: &Path, state: Arc<Mutex<LoginState>>, poll_ms: u64) {
    tracing::warn!(
        claude_dir = %claude_dir.display(),
        "no claude session — staying in partial-run mode (web UI only)"
    );
    let interval = Duration::from_millis(poll_ms.max(2000));
    // Sleep-then-check, repeated until a session shows up.
    tokio::time::sleep(interval).await;
    while !login::has_session(claude_dir) {
        tokio::time::sleep(interval).await;
    }
    tracing::info!("claude session detected — entering turn loop");
    *state.lock().unwrap() = LoginState::Online;
}

/// Execute one claude turn with `prompt` and classify the result.
///
/// This is a thin adapter over `run_claude`, which does the actual
/// spawning, stdin delivery of the prompt and `stream-json` pumping into
/// the live event bus:
///
/// * an error becomes `TurnOutcome::Failed`,
/// * a run that saw the overflow marker becomes
///   `TurnOutcome::PromptTooLong`,
/// * anything else is `TurnOutcome::Ok`.
///
/// No compaction or retry happens at this level.
pub async fn run_turn(prompt: &str, files: &TurnFiles, bus: &Bus) -> TurnOutcome {
    match run_claude(prompt, files, bus).await {
        Err(e) => TurnOutcome::Failed(e),
        Ok(true) => TurnOutcome::PromptTooLong,
        Ok(false) => TurnOutcome::Ok,
    }
}

/// Run claude's built-in `/compact` slash command on the persistent
/// session. Takes the *same* params as `run_turn` because compact
/// re-initialises claude with the full session shape — same MCP
/// surface, same system prompt, same allowed-tools — so the post-
/// compact state matches a normal turn's. Only the prompt over stdin
/// differs (`/compact` vs the wake-up payload).
pub async fn compact_session(files: &TurnFiles, bus: &Bus) -> Result<()> {
    bus.emit(LiveEvent::Note(
        "context overflow — running /compact on the persistent session".into(),
    ));
    let _ = run_claude("/compact", files, bus).await?;
    bus.emit(LiveEvent::Note("/compact done".into()));
    Ok(())
}

async fn run_claude(prompt: &str, files: &TurnFiles, bus: &Bus) -> Result<bool> {
    let model = bus.model();
    let resume = !bus.take_skip_continue();
    if !resume {
        bus.emit(LiveEvent::Note(
            "fresh session (--continue suppressed for this turn)".into(),
        ));
    }
    let mut cmd = Command::new("claude");
    // Spawn inside the agent's state dir so relative paths in tool calls
    // (Read foo.md, Bash ls, Write notes.md) land in the durable dir
    // instead of wherever the harness systemd unit started. Falls back
    // silently if the dir is missing (dev / test without the bind mount).
    let state_dir = crate::paths::state_dir();
    if state_dir.is_dir() {
        cmd.current_dir(&state_dir);
    }
    cmd.arg("--print")
        .arg("--verbose")
        .arg("--output-format")
        .arg("stream-json")
        .arg("--model")
        .arg(&model)
        .arg("--settings")
        .arg(&files.settings);
    if resume {
        cmd.arg("--continue");
    }
    cmd.arg("--system-prompt-file").arg(&files.system_prompt);
    cmd.arg("--mcp-config")
        .arg(&files.mcp_config)
        .arg("--strict-mcp-config")
        .arg("--tools")
        .arg(mcp::builtin_tools_arg())
        .arg("--allowedTools")
        .arg(mcp::allowed_tools_arg(files.flavor));
    let mut child = cmd
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()?;

    if let Some(mut stdin) = child.stdin.take() {
        stdin.write_all(prompt.as_bytes()).await?;
        stdin.shutdown().await.ok();
        drop(stdin);
    }
    let stdout = child.stdout.take().expect("piped stdout");
    let stderr = child.stderr.take().expect("piped stderr");

    let prompt_too_long = Arc::new(AtomicBool::new(false));
    let flag_out = prompt_too_long.clone();
    let flag_err = prompt_too_long.clone();
    let bus_out = bus.clone();
    let bus_err = bus.clone();
    let pump_stdout = tokio::spawn(async move {
        let mut reader = BufReader::new(stdout).lines();
        while let Ok(Some(line)) = reader.next_line().await {
            if line.contains(PROMPT_TOO_LONG_MARKER) {
                flag_out.store(true, Ordering::Relaxed);
            }
            match serde_json::from_str::<serde_json::Value>(&line) {
                Ok(v) => {
                    // Extract token usage from the final `result` event and
                    // store it in the bus for the web UI to surface.
                    if v.get("type").and_then(|t| t.as_str()) == Some("result") {
                        if let Some(u) = v.get("usage") {
                            let usage = crate::events::TokenUsage {
                                input_tokens: u
                                    .get("input_tokens")
                                    .and_then(|v| v.as_u64())
                                    .unwrap_or(0),
                                output_tokens: u
                                    .get("output_tokens")
                                    .and_then(|v| v.as_u64())
                                    .unwrap_or(0),
                                cache_read_input_tokens: u
                                    .get("cache_read_input_tokens")
                                    .and_then(|v| v.as_u64())
                                    .unwrap_or(0),
                                cache_creation_input_tokens: u
                                    .get("cache_creation_input_tokens")
                                    .and_then(|v| v.as_u64())
                                    .unwrap_or(0),
                            };
                            bus_out.record_usage(usage);
                        }
                    }
                    bus_out.emit(LiveEvent::Stream(v));
                }
                Err(_) => bus_out.emit(LiveEvent::Note(format!("(non-json) {line}"))),
            }
        }
    });
    // Keep the last STDERR_TAIL_LINES of stderr so a non-zero exit can
    // include real context in the bail message (and downstream in the
    // failure notification to the manager) instead of just "exit 1".
    const STDERR_TAIL_LINES: usize = 20;
    let stderr_tail: Arc<Mutex<VecDeque<String>>> =
        Arc::new(Mutex::new(VecDeque::with_capacity(STDERR_TAIL_LINES)));
    let tail_clone = stderr_tail.clone();
    let pump_stderr = tokio::spawn(async move {
        let mut reader = BufReader::new(stderr).lines();
        while let Ok(Some(line)) = reader.next_line().await {
            if line.contains(PROMPT_TOO_LONG_MARKER) {
                flag_err.store(true, Ordering::Relaxed);
            }
            // Mirror to journald so post-mortems work without the web UI
            // or the events sqlite. The bus event is what the dashboard
            // renders; the tracing line is what `journalctl -M <c> -b`
            // surfaces when claude exits non-zero.
            tracing::warn!(line = %line, "claude stderr");
            bus_err.emit(LiveEvent::Note(format!("stderr: {line}")));
            let mut t = tail_clone.lock().unwrap();
            if t.len() >= STDERR_TAIL_LINES {
                t.pop_front();
            }
            t.push_back(line);
        }
    });

    let status = child.wait().await?;
    let _ = pump_stdout.await;
    let _ = pump_stderr.await;
    let too_long = prompt_too_long.load(Ordering::Relaxed);
    if !status.success() && !too_long {
        let tail = stderr_tail.lock().unwrap();
        if tail.is_empty() {
            bail!("claude exited {status} (no stderr)");
        }
        let tail_str = tail.iter().cloned().collect::<Vec<_>>().join("\n");
        bail!("claude exited {status}\nstderr tail:\n{tail_str}");
    }
    Ok(too_long)
}