model: runtime override via /model slash; fixes for port + bind

- runtime model override: Bus::{model,set_model} + POST /api/model
  (form-encoded {model: name}). turn.rs reads bus.model() per turn
  so a flip lands on the next claude invocation. /api/state grows
  a model field; agent page shows a 'model · <name>' chip in the
  state row. '/model <name>' slash command POSTs to the endpoint
  and refreshes state.

- port regression fix: agent_web_port no longer probes forward for
  *existing* agents (the previous fix shifted ports for any agent
  without a port file, including legacy ones whose container was
  already bound to the bare hashed port — the dashboard rendered the
  new port while the container was still on the old one, causing
  connection errors). new
  rule: port file exists → use it; absent + applied flake present
  → legacy, persist port_hash without probing; absent + no applied
  flake → fresh spawn, probe forward.

- SO_REUSEADDR on both the dashboard and per-agent web UI binds
  via tokio::net::TcpSocket. an operator saw all 12 bind retries fail
  on manager :8000 — REUSEADDR handles the TIME_WAIT case cleanly
  without a new dep; retry still covers the genuine
  process-still-alive overlap.

todo: drops the model-override entry (shipped); adds two new
items — model persistence (optional, future), and custom
per-agent MCP tools (groundwork for moving bitburner-agent into
hyperhive).
This commit is contained in:
müde 2026-05-15 20:59:45 +02:00
parent 7d93dd9db4
commit 6db38cf70c
9 changed files with 196 additions and 39 deletions

View file

@ -72,13 +72,13 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
// `/messages/stream` for broker traffic.
// ---------------------------------------------------------------------------
/// Retry-on-AddrInUse bind. Same shape as the per-agent variant —
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant —
/// hive-c0re restarts also race the previous process's socket release.
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
let mut delay_ms = 250u64;
let mut attempts = 0u32;
loop {
match tokio::net::TcpListener::bind(addr).await {
match try_bind(addr) {
Ok(l) => return Ok(l),
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
tracing::warn!(
@ -96,6 +96,16 @@ async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
}
}
/// Single, non-retrying attempt to open a listening TCP socket on `addr`.
///
/// The socket is created for the address family of `addr`, has
/// `SO_REUSEADDR` switched on *before* binding, and is then put into
/// listening mode with a backlog of 1024.
///
/// # Errors
///
/// Returns the underlying `std::io::Error` from whichever step fails
/// (socket creation, setting the option, `bind`, or `listen`), so the
/// caller can inspect `kind()` and decide whether to retry.
fn try_bind(addr: SocketAddr) -> std::io::Result<tokio::net::TcpListener> {
    // Pick the socket family that matches the requested address.
    let socket = if addr.is_ipv4() {
        tokio::net::TcpSocket::new_v4()?
    } else {
        tokio::net::TcpSocket::new_v6()?
    };
    // The option must be set before `bind` to have any effect.
    socket.set_reuseaddr(true)?;
    socket.bind(addr)?;
    let listener = socket.listen(1024)?;
    Ok(listener)
}
/// Handler that responds with the bundled `index.html` as an HTML body.
///
/// The file content is embedded into the binary at compile time via
/// `include_str!`, so no filesystem access happens per request.
async fn serve_index() -> impl IntoResponse {
    const INDEX_HTML: &str = include_str!("../assets/index.html");
    Html(INDEX_HTML)
}

View file

@ -46,13 +46,18 @@ const DEFAULT_MEMORY_MAX: &str = "2G";
const DEFAULT_CPU_QUOTA: &str = "50%";
/// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`.
/// For sub-agents the port is sticky once chosen: looked up from
/// `agent_state_root(name)/port` if present, otherwise derived from
/// the FNV-1a hash of the name and *probed forward* through the
/// allocated range to skip any port another sub-agent has already
/// claimed (birthday-paradox collisions are real even at 23
/// agents). The chosen port is written back so subsequent calls
/// resolve to the same value without re-probing.
/// For sub-agents the port is sticky once chosen:
///
/// - **Port file present** (`state_root/port`): use it. End of story.
/// - **Port file absent, applied flake present**: this is a legacy
/// agent whose container is already bound to the bare
/// `port_hash(name)`. Don't probe; just migrate by writing that
/// value to the port file. The container stays where it is and
/// subsequent renders agree with it.
/// - **Port file absent, no applied flake**: this is a fresh spawn.
/// Probe forward from `port_hash(name)` to skip any port another
/// sub-agent has already claimed (via port file or legacy hash).
/// Write the chosen port back.
#[must_use]
pub fn agent_web_port(name: &str) -> u16 {
if name == MANAGER_NAME {
@ -66,27 +71,36 @@ pub fn agent_web_port(name: &str) -> u16 {
{
return port;
}
let taken = scan_taken_ports(name);
let start = port_hash(name);
let mut port = start;
for _ in 0..WEB_PORT_RANGE {
if !taken.contains(&port) {
break;
let applied_exists = crate::coordinator::Coordinator::agent_applied_dir(name).exists();
let chosen = if applied_exists {
// Legacy agent — container already running on the hashed
// port. Don't move it; just persist the value so future
// calls bypass this path.
port_hash(name)
} else {
let taken = scan_taken_ports(name);
let start = port_hash(name);
let mut port = start;
for _ in 0..WEB_PORT_RANGE {
if !taken.contains(&port) {
break;
}
port = next_port(port);
if port == start {
// Range fully exhausted (very unlikely — 900 slots) —
// give up and use the hashed value; collisions are
// surfaced as bind errors by the harness retry loop.
tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
break;
}
}
port = next_port(port);
if port == start {
// Range fully exhausted (very unlikely — 900 slots) —
// give up and just use the hashed value; collisions are
// surfaced as bind errors by the harness retry loop.
tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
return start;
}
}
port
};
let _ = std::fs::create_dir_all(&state_root);
if let Err(e) = std::fs::write(&port_file, format!("{port}\n")) {
if let Err(e) = std::fs::write(&port_file, format!("{chosen}\n")) {
tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed");
}
port
chosen
}
fn port_hash(name: &str) -> u16 {