model: runtime override via /model slash; fixes for port + bind

- runtime model override: Bus::{model,set_model} + POST /api/model (form-encoded {model: name}). turn.rs reads bus.model() per turn so a flip lands on the next claude invocation. /api/state grows a model field; agent page shows a 'model · <name>' chip in the state row. '/model <name>' slash command POSTs to the endpoint and refreshes state. - port regression fix: agent_web_port no longer probes forward for *existing* agents (the previous fix shifted ports for any agent without a port file, including legacy ones whose container was already bound to the bare hashed port — dashboard rendered the new port, container was still on the old one, conn errors). new rule: port file exists → use it; absent + applied flake present → legacy, persist port_hash without probing; absent + no applied flake → fresh spawn, probe forward. - SO_REUSEADDR on both the dashboard and per-agent web UI binds via tokio::net::TcpSocket. operator hit 12 retries failing on manager :8000 — REUSEADDR handles the TIME_WAIT case cleanly without a new dep; retry still covers the genuine process-still-alive overlap. todo: drops the model-override entry (shipped); adds two new items — model persistence (optional, future), and custom per-agent MCP tools (groundwork for moving bitburner-agent into hyperhive).
2026-05-15 20:59:45 +02:00 · 2026-05-15 20:59:45 +02:00 · 6db38cf70c
commit 6db38cf70c
parent 7d93dd9db4
9 changed files with 196 additions and 39 deletions
--- a/TODO.md
+++ b/TODO.md
@ -16,14 +16,29 @@ Pick anything from here when relevant. Cross-cutting design notes live in
  claude-code's `--allowedTools` extended grammar. Likely lives in
  `agent.nix` so each agent can scope its own shell surface.

+## Per-agent extension
+
+- **Custom per-agent MCP tools.** Today every sub-agent gets the
+  same fixed MCP surface (`send`, `recv`). To move bitburner-agent
+  (and anything else with rich domain tooling) into hyperhive, an
+  agent needs a way to ship its own tools alongside hyperhive's.
+  Sketch: `agent.nix` declares a list of extra MCP servers
+  (command + args + env), each registered into the agent's
+  `--mcp-config` blob at flake-render time. The harness MCP server
+  remains the hyperhive surface; new servers slot in as additional
+  entries under `mcpServers.<name>` so claude sees them as
+  `mcp__<name>__<tool>`. Per-agent tool whitelist (`allowedTools`)
+  derived from the same config so the operator stays in control of
+  what's exposed.
+
 ## Per-agent settings

- **Model override.** Hard-coded to `haiku` in the turn loop right now.
-  Surface as a per-agent override: operator via dashboard, manager via
-  `request_apply_commit` setting an attr on the agent's flake (most natural
-  place since the flake already carries per-agent env/identity). Pair with
-  a **model status** indicator on the agent page (active / queued / last
-  switched) once the override is in place.
+- **Model override persistence.** `/model <name>` already switches
+  the model at runtime via `Bus::set_model`; the chip on the agent
+  page reflects the current value. Override is in-memory only and
+  resets on harness restart — by design for now, but consider
+  optional persistence (`/state/model` file?) so an operator-set
+  model survives a rebuild.

 ## UI / UX

--- a/hive-ag3nt/assets/agent.css
+++ b/hive-ag3nt/assets/agent.css
@ -171,6 +171,15 @@ pre.diff {
  font-size: 0.8em;
  letter-spacing: 0.05em;
 }
+.model-chip {
+  display: inline-block;
+  padding: 0.1em 0.6em;
+  border: 1px solid var(--purple-dim);
+  border-radius: 999px;
+  color: var(--cyan);
+  font-size: 0.78em;
+  letter-spacing: 0.04em;
+}
 .btn-dashlink {
  color: var(--cyan);
  border: 1px solid var(--cyan);
--- a/hive-ag3nt/assets/app.js
+++ b/hive-ag3nt/assets/app.js
@ -174,8 +174,31 @@
    { name: '/clear',   desc: 'wipe the terminal panel (local-only)' },
    { name: '/cancel',  desc: 'SIGINT the in-flight claude turn' },
    { name: '/compact', desc: 'compact the persistent claude session' },
+    { name: '/model',   desc: '/model <name>  — switch claude model for future turns' },
  ];

+  async function postModel(name) {
+    try {
+      const resp = await fetch('/api/model', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
+        body: new URLSearchParams({ model: name }),
+        redirect: 'manual',
+      });
+      const ok = resp.ok || resp.type === 'opaqueredirect'
+        || (resp.status >= 200 && resp.status < 400);
+      if (!ok && termAPI) {
+        const text = await resp.text().catch(() => '');
+        termAPI.row('turn-end-fail', '✗ /model failed: ' + resp.status
+          + (text ? ' — ' + text : ''));
+      } else {
+        refreshState();
+      }
+    } catch (err) {
+      if (termAPI) termAPI.row('turn-end-fail', '✗ /model failed: ' + err);
+    }
+  }
+
  async function postSimple(url, label) {
    try {
      const resp = await fetch(url, { method: 'POST', redirect: 'manual' });
@ -213,6 +236,16 @@
      case '/compact':
        postCompact();
        return true;
+      case '/model': {
+        const parts = trimmed.split(/\s+/);
+        if (parts.length < 2 || !parts[1]) {
+          termAPI.row('turn-end-fail',
+            '✗ /model needs a name (e.g. /model haiku, /model sonnet, /model opus)');
+        } else {
+          postModel(parts[1]);
+        }
+        return true;
+      }
      default:
        termAPI.row('turn-end-fail', '✗ unknown slash command: ' + cmd + ' — try /help');
        return true;
@ -365,6 +398,13 @@
      list.append(li);
    }
  }
+  function renderModelChip(model) {
+    const el_ = $('model-chip');
+    if (!el_) return;
+    if (!model) { el_.hidden = true; return; }
+    el_.hidden = false;
+    el_.textContent = 'model · ' + model;
+  }
  function renderLastTurn(ms) {
    const el_ = $('last-turn');
    if (!el_) return;
@ -424,6 +464,7 @@
      } else if (s.turn_state) {
        setStateAbs(s.turn_state, s.turn_state_since);
      }
+      renderModelChip(s.model);
      // Skip the re-render if nothing structurally changed. The most
      // common case is `online` polling itself — without this guard, the
      // operator's <input value> gets clobbered every cycle.
--- a/hive-ag3nt/assets/index.html
+++ b/hive-ag3nt/assets/index.html
@ -15,6 +15,7 @@

  <div id="state-row">
    <span id="state-badge" class="state-badge state-loading">… booting</span>
+    <span id="model-chip" class="model-chip" hidden></span>
    <span id="last-turn" class="last-turn" hidden></span>
    <button type="button" id="cancel-btn" class="btn-cancel-turn" hidden>■ cancel turn</button>
  </div>
--- a/hive-ag3nt/src/events.rs
+++ b/hive-ag3nt/src/events.rs
@ -140,6 +140,13 @@ pub enum TurnState {
    Compacting,
 }

+/// Default claude model when nothing's been set at runtime. The
+/// operator can switch via `/model <name>` in the web terminal; the
+/// chosen model lives in `Bus::model` for the rest of the harness
+/// process's life (resets on restart, by design — operator overrides
+/// shouldn't survive accidentally).
+pub const DEFAULT_MODEL: &str = "haiku";
+
 #[derive(Clone)]
 pub struct Bus {
    tx: Arc<broadcast::Sender<LiveEvent>>,
@ -149,6 +156,9 @@ pub struct Bus {
    store: Option<Arc<EventStore>>,
    /// Current turn-loop state + since-when (unix seconds).
    state: Arc<Mutex<(TurnState, i64)>>,
+    /// Model name passed to `claude --model`. Default `haiku`; the
+    /// operator can override at runtime via `POST /api/model`.
+    model: Arc<Mutex<String>>,
 }

 impl Bus {
@ -171,9 +181,23 @@ impl Bus {
            tx: Arc::new(tx),
            store,
            state: Arc::new(Mutex::new((TurnState::Idle, now_unix()))),
+            model: Arc::new(Mutex::new(DEFAULT_MODEL.to_owned())),
        }
    }

+    /// Currently-selected claude model name. Read on every turn so a
+    /// `/model <name>` flip takes effect on the next turn.
+    #[must_use]
+    pub fn model(&self) -> String {
+        self.model.lock().unwrap().clone()
+    }
+
+    /// Switch the model for future turns. The current turn (if any)
+    /// keeps the model it was already running.
+    pub fn set_model(&self, name: impl Into<String>) {
+        *self.model.lock().unwrap() = name.into();
+    }
+
    /// Update the harness's authoritative turn-loop state. Records
    /// the transition time so `state_snapshot` can return a since-age.
    pub fn set_state(&self, next: TurnState) {
--- a/hive-ag3nt/src/turn.rs
+++ b/hive-ag3nt/src/turn.rs
@ -227,13 +227,14 @@ async fn run_claude(
    flavor: mcp::Flavor,
    mode: ClaudeMode,
 ) -> Result<bool> {
+    let model = bus.model();
    let mut cmd = Command::new("claude");
    cmd.arg("--print")
        .arg("--verbose")
        .arg("--output-format")
        .arg("stream-json")
        .arg("--model")
-        .arg("haiku")
+        .arg(&model)
        .arg("--continue")
        .arg("--settings")
        .arg(settings);
--- a/hive-ag3nt/src/web_ui.rs
+++ b/hive-ag3nt/src/web_ui.rs
@ -81,6 +81,7 @@ pub async fn serve(
        .route("/login/cancel", post(post_login_cancel))
        .route("/api/cancel", post(post_cancel_turn))
        .route("/api/compact", post(post_compact))
+        .route("/api/model", post(post_set_model))
        .with_state(state);
    let addr = SocketAddr::from(([0, 0, 0, 0], port));
    let listener = bind_with_retry(addr, "web UI").await?;
@ -93,16 +94,17 @@ pub async fn serve(
 // Static assets + state snapshot
 // ---------------------------------------------------------------------------

-/// Bind a TCP listener, retrying on `AddrInUse` for up to ~20s.
-/// nspawn restarts can race the previous harness's socket release;
-/// without retry the new harness fails to bind and systemd just
-/// keeps restarting it. `SO_REUSEADDR` would be the proper fix but
-/// would require socket2; retry is good enough here.
+/// Bind a TCP listener with `SO_REUSEADDR` set, retrying on
+/// `AddrInUse` for up to ~20s. nspawn restarts can race the previous
+/// harness's socket release; `SO_REUSEADDR` lets us reclaim a port
+/// still in `TIME_WAIT` from a clean previous exit, and the retry
+/// covers the case where the previous process is genuinely still
+/// alive (systemd restart-delay overlap).
 async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::TcpListener> {
    let mut delay_ms = 250u64;
    let mut attempts = 0u32;
    loop {
-        match tokio::net::TcpListener::bind(addr).await {
+        match try_bind(addr) {
            Ok(l) => return Ok(l),
            Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
                tracing::warn!(
@ -120,6 +122,16 @@ async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::Tc
    }
 }

+fn try_bind(addr: SocketAddr) -> std::io::Result<tokio::net::TcpListener> {
+    let sock = match addr {
+        SocketAddr::V4(_) => tokio::net::TcpSocket::new_v4()?,
+        SocketAddr::V6(_) => tokio::net::TcpSocket::new_v6()?,
+    };
+    sock.set_reuseaddr(true)?;
+    sock.bind(addr)?;
+    sock.listen(1024)
+}
+
 async fn serve_index() -> impl IntoResponse {
    (
        [("content-type", "text/html; charset=utf-8")],
@ -158,6 +170,10 @@ struct StateSnapshot {
    /// client-side off this rather than tracking it from SSE events.
    turn_state: crate::events::TurnState,
    turn_state_since: i64,
+    /// Currently-active claude model name. Reflected on the page so
+    /// the operator can see what they just switched to (and what's
+    /// in flight). Mutable at runtime via `POST /api/model`.
+    model: String,
 }

 #[derive(Serialize)]
@ -193,6 +209,7 @@ async fn api_state(State(state): State<AppState>) -> axum::Json<StateSnapshot> {
        .unwrap_or(7000);
    let inbox = recent_inbox(&state.socket, state.flavor).await;
    let (turn_state, turn_state_since) = state.bus.state_snapshot();
+    let model = state.bus.model();
    axum::Json(StateSnapshot {
        label: state.label.clone(),
        dashboard_port,
@ -201,6 +218,7 @@ async fn api_state(State(state): State<AppState>) -> axum::Json<StateSnapshot> {
        inbox,
        turn_state,
        turn_state_since,
+        model,
    })
 }

@ -351,6 +369,30 @@ async fn post_login_cancel(State(state): State<AppState>) -> Response {
 /// the "/compact done" note) lands in the live event panel like any
 /// other turn. If a regular turn is in flight, claude's own session
 /// lock will reject this one and we surface the error as a Note.
+#[derive(Deserialize)]
+struct ModelForm {
+    model: String,
+}
+
+/// Switch the model for future turns. The current turn (if any)
+/// keeps its model; `/model <name>` applies starting with the next
+/// `recv` cycle. Empty / whitespace-only inputs are rejected. No
+/// claude-side validation — we just hand the string through to
+/// `claude --model <name>`; an unknown model surfaces as a turn
+/// failure in the live panel and the operator can revert.
+async fn post_set_model(State(state): State<AppState>, Form(form): Form<ModelForm>) -> Response {
+    let name = form.model.trim();
+    if name.is_empty() {
+        return error_response("model: name required");
+    }
+    state.bus.set_model(name);
+    state.bus.emit(crate::events::LiveEvent::Note(format!(
+        "operator: /model — claude model set to '{name}' for future turns"
+    )));
+    tracing::info!(%name, "operator set model");
+    Redirect::to("/").into_response()
+}
+
 async fn post_compact(State(state): State<AppState>) -> Response {
    let bus = state.bus.clone();
    let socket = state.socket.clone();
--- a/hive-c0re/src/dashboard.rs
+++ b/hive-c0re/src/dashboard.rs
@ -72,13 +72,13 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
 // `/messages/stream` for broker traffic.
 // ---------------------------------------------------------------------------

-/// Retry-on-AddrInUse bind. Same shape as the per-agent variant —
+/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant —
 /// hive-c0re restarts also race the previous process's socket release.
 async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
    let mut delay_ms = 250u64;
    let mut attempts = 0u32;
    loop {
-        match tokio::net::TcpListener::bind(addr).await {
+        match try_bind(addr) {
            Ok(l) => return Ok(l),
            Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
                tracing::warn!(
@ -96,6 +96,16 @@ async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
    }
 }

+fn try_bind(addr: SocketAddr) -> std::io::Result<tokio::net::TcpListener> {
+    let sock = match addr {
+        SocketAddr::V4(_) => tokio::net::TcpSocket::new_v4()?,
+        SocketAddr::V6(_) => tokio::net::TcpSocket::new_v6()?,
+    };
+    sock.set_reuseaddr(true)?;
+    sock.bind(addr)?;
+    sock.listen(1024)
+}
+
 async fn serve_index() -> impl IntoResponse {
    Html(include_str!("../assets/index.html"))
 }
--- a/hive-c0re/src/lifecycle.rs
+++ b/hive-c0re/src/lifecycle.rs
@ -46,13 +46,18 @@ const DEFAULT_MEMORY_MAX: &str = "2G";
 const DEFAULT_CPU_QUOTA: &str = "50%";

 /// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`.
-/// For sub-agents the port is sticky once chosen: looked up from
-/// `agent_state_root(name)/port` if present, otherwise derived from
-/// the FNV-1a hash of the name and *probed forward* through the
-/// allocated range to skip any port another sub-agent has already
-/// claimed (birthday-paradox collisions are real even at 2–3
-/// agents). The chosen port is written back so subsequent calls
-/// resolve to the same value without re-probing.
+/// For sub-agents the port is sticky once chosen:
+///
+/// - **Port file present** (`state_root/port`): use it. End of story.
+/// - **Port file absent, applied flake present**: this is a legacy
+///   agent whose container is already bound to the bare
+///   `port_hash(name)`. Don't probe; just migrate by writing that
+///   value to the port file. The container stays where it is and
+///   subsequent renders agree with it.
+/// - **Port file absent, no applied flake**: this is a fresh spawn.
+///   Probe forward from `port_hash(name)` to skip any port another
+///   sub-agent has already claimed (via port file or legacy hash).
+///   Write the chosen port back.
 #[must_use]
 pub fn agent_web_port(name: &str) -> u16 {
    if name == MANAGER_NAME {
@ -66,27 +71,36 @@ pub fn agent_web_port(name: &str) -> u16 {
    {
        return port;
    }
-    let taken = scan_taken_ports(name);
-    let start = port_hash(name);
-    let mut port = start;
-    for _ in 0..WEB_PORT_RANGE {
-        if !taken.contains(&port) {
-            break;
+    let applied_exists = crate::coordinator::Coordinator::agent_applied_dir(name).exists();
+    let chosen = if applied_exists {
+        // Legacy agent — container already running on the hashed
+        // port. Don't move it; just persist the value so future
+        // calls bypass this path.
+        port_hash(name)
+    } else {
+        let taken = scan_taken_ports(name);
+        let start = port_hash(name);
+        let mut port = start;
+        for _ in 0..WEB_PORT_RANGE {
+            if !taken.contains(&port) {
+                break;
+            }
+            port = next_port(port);
+            if port == start {
+                // Range fully exhausted (very unlikely — 900 slots) —
+                // give up and use the hashed value; collisions are
+                // surfaced as bind errors by the harness retry loop.
+                tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
+                break;
+            }
        }
-        port = next_port(port);
-        if port == start {
-            // Range fully exhausted (very unlikely — 900 slots) —
-            // give up and just use the hashed value; collisions are
-            // surfaced as bind errors by the harness retry loop.
-            tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
-            return start;
-        }
-    }
+        port
+    };
    let _ = std::fs::create_dir_all(&state_root);
-    if let Err(e) = std::fs::write(&port_file, format!("{port}\n")) {
+    if let Err(e) = std::fs::write(&port_file, format!("{chosen}\n")) {
        tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed");
    }
-    port
+    chosen
 }

 fn port_hash(name: &str) -> u16 {