model: runtime override via /model slash; fixes for port + bind
- runtime model override: Bus::{model,set_model} + POST /api/model
(form-encoded {model: name}). turn.rs reads bus.model() per turn
so a flip lands on the next claude invocation. /api/state grows
a model field; agent page shows a 'model · <name>' chip in the
state row. '/model <name>' slash command POSTs to the endpoint
and refreshes state.
- port regression fix: agent_web_port no longer probes forward for
*existing* agents (the previous fix shifted ports for any agent
without a port file, including legacy ones whose container was
already bound to the bare hashed port — dashboard rendered the
new port, container was still on the old one, conn errors). new
rule: port file exists → use it; absent + applied flake present
→ legacy, persist port_hash without probing; absent + no applied
flake → fresh spawn, probe forward.
- SO_REUSEADDR on both the dashboard and per-agent web UI binds
via tokio::net::TcpSocket. operator hit 12 retries failing on
manager :8000 — REUSEADDR handles the TIME_WAIT case cleanly
without a new dep; retry still covers the genuine
process-still-alive overlap.
todo: drops the model-override entry (shipped); adds two new
items — model persistence (optional, future), and custom
per-agent MCP tools (groundwork for moving bitburner-agent into
hyperhive).
This commit is contained in:
parent
7d93dd9db4
commit
6db38cf70c
9 changed files with 196 additions and 39 deletions
27
TODO.md
27
TODO.md
|
|
@ -16,14 +16,29 @@ Pick anything from here when relevant. Cross-cutting design notes live in
|
||||||
claude-code's `--allowedTools` extended grammar. Likely lives in
|
claude-code's `--allowedTools` extended grammar. Likely lives in
|
||||||
`agent.nix` so each agent can scope its own shell surface.
|
`agent.nix` so each agent can scope its own shell surface.
|
||||||
|
|
||||||
|
## Per-agent extension
|
||||||
|
|
||||||
|
- **Custom per-agent MCP tools.** Today every sub-agent gets the
|
||||||
|
same fixed MCP surface (`send`, `recv`). To move bitburner-agent
|
||||||
|
(and anything else with rich domain tooling) into hyperhive, an
|
||||||
|
agent needs a way to ship its own tools alongside hyperhive's.
|
||||||
|
Sketch: `agent.nix` declares a list of extra MCP servers
|
||||||
|
(command + args + env), each registered into the agent's
|
||||||
|
`--mcp-config` blob at flake-render time. The harness MCP server
|
||||||
|
remains the hyperhive surface; new servers slot in as additional
|
||||||
|
entries under `mcpServers.<name>` so claude sees them as
|
||||||
|
`mcp__<name>__<tool>`. Per-agent tool whitelist (`allowedTools`)
|
||||||
|
derived from the same config so the operator stays in control of
|
||||||
|
what's exposed.
|
||||||
|
|
||||||
## Per-agent settings
|
## Per-agent settings
|
||||||
|
|
||||||
- **Model override.** Hard-coded to `haiku` in the turn loop right now.
|
- **Model override persistence.** `/model <name>` already switches
|
||||||
Surface as a per-agent override: operator via dashboard, manager via
|
the model at runtime via `Bus::set_model`; the chip on the agent
|
||||||
`request_apply_commit` setting an attr on the agent's flake (most natural
|
page reflects the current value. Override is in-memory only and
|
||||||
place since the flake already carries per-agent env/identity). Pair with
|
resets on harness restart — by design for now, but consider
|
||||||
a **model status** indicator on the agent page (active / queued / last
|
optional persistence (`/state/model` file?) so an operator-set
|
||||||
switched) once the override is in place.
|
model survives a rebuild.
|
||||||
|
|
||||||
## UI / UX
|
## UI / UX
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -171,6 +171,15 @@ pre.diff {
|
||||||
font-size: 0.8em;
|
font-size: 0.8em;
|
||||||
letter-spacing: 0.05em;
|
letter-spacing: 0.05em;
|
||||||
}
|
}
|
||||||
|
.model-chip {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 0.1em 0.6em;
|
||||||
|
border: 1px solid var(--purple-dim);
|
||||||
|
border-radius: 999px;
|
||||||
|
color: var(--cyan);
|
||||||
|
font-size: 0.78em;
|
||||||
|
letter-spacing: 0.04em;
|
||||||
|
}
|
||||||
.btn-dashlink {
|
.btn-dashlink {
|
||||||
color: var(--cyan);
|
color: var(--cyan);
|
||||||
border: 1px solid var(--cyan);
|
border: 1px solid var(--cyan);
|
||||||
|
|
|
||||||
|
|
@ -174,8 +174,31 @@
|
||||||
{ name: '/clear', desc: 'wipe the terminal panel (local-only)' },
|
{ name: '/clear', desc: 'wipe the terminal panel (local-only)' },
|
||||||
{ name: '/cancel', desc: 'SIGINT the in-flight claude turn' },
|
{ name: '/cancel', desc: 'SIGINT the in-flight claude turn' },
|
||||||
{ name: '/compact', desc: 'compact the persistent claude session' },
|
{ name: '/compact', desc: 'compact the persistent claude session' },
|
||||||
|
{ name: '/model', desc: '/model <name> — switch claude model for future turns' },
|
||||||
];
|
];
|
||||||
|
|
||||||
|
async function postModel(name) {
|
||||||
|
try {
|
||||||
|
const resp = await fetch('/api/model', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
|
||||||
|
body: new URLSearchParams({ model: name }),
|
||||||
|
redirect: 'manual',
|
||||||
|
});
|
||||||
|
const ok = resp.ok || resp.type === 'opaqueredirect'
|
||||||
|
|| (resp.status >= 200 && resp.status < 400);
|
||||||
|
if (!ok && termAPI) {
|
||||||
|
const text = await resp.text().catch(() => '');
|
||||||
|
termAPI.row('turn-end-fail', '✗ /model failed: ' + resp.status
|
||||||
|
+ (text ? ' — ' + text : ''));
|
||||||
|
} else {
|
||||||
|
refreshState();
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
if (termAPI) termAPI.row('turn-end-fail', '✗ /model failed: ' + err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function postSimple(url, label) {
|
async function postSimple(url, label) {
|
||||||
try {
|
try {
|
||||||
const resp = await fetch(url, { method: 'POST', redirect: 'manual' });
|
const resp = await fetch(url, { method: 'POST', redirect: 'manual' });
|
||||||
|
|
@ -213,6 +236,16 @@
|
||||||
case '/compact':
|
case '/compact':
|
||||||
postCompact();
|
postCompact();
|
||||||
return true;
|
return true;
|
||||||
|
case '/model': {
|
||||||
|
const parts = trimmed.split(/\s+/);
|
||||||
|
if (parts.length < 2 || !parts[1]) {
|
||||||
|
termAPI.row('turn-end-fail',
|
||||||
|
'✗ /model needs a name (e.g. /model haiku, /model sonnet, /model opus)');
|
||||||
|
} else {
|
||||||
|
postModel(parts[1]);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
termAPI.row('turn-end-fail', '✗ unknown slash command: ' + cmd + ' — try /help');
|
termAPI.row('turn-end-fail', '✗ unknown slash command: ' + cmd + ' — try /help');
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -365,6 +398,13 @@
|
||||||
list.append(li);
|
list.append(li);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
function renderModelChip(model) {
|
||||||
|
const el_ = $('model-chip');
|
||||||
|
if (!el_) return;
|
||||||
|
if (!model) { el_.hidden = true; return; }
|
||||||
|
el_.hidden = false;
|
||||||
|
el_.textContent = 'model · ' + model;
|
||||||
|
}
|
||||||
function renderLastTurn(ms) {
|
function renderLastTurn(ms) {
|
||||||
const el_ = $('last-turn');
|
const el_ = $('last-turn');
|
||||||
if (!el_) return;
|
if (!el_) return;
|
||||||
|
|
@ -424,6 +464,7 @@
|
||||||
} else if (s.turn_state) {
|
} else if (s.turn_state) {
|
||||||
setStateAbs(s.turn_state, s.turn_state_since);
|
setStateAbs(s.turn_state, s.turn_state_since);
|
||||||
}
|
}
|
||||||
|
renderModelChip(s.model);
|
||||||
// Skip the re-render if nothing structurally changed. The most
|
// Skip the re-render if nothing structurally changed. The most
|
||||||
// common case is `online` polling itself — without this guard, the
|
// common case is `online` polling itself — without this guard, the
|
||||||
// operator's <input value> gets clobbered every cycle.
|
// operator's <input value> gets clobbered every cycle.
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@
|
||||||
|
|
||||||
<div id="state-row">
|
<div id="state-row">
|
||||||
<span id="state-badge" class="state-badge state-loading">… booting</span>
|
<span id="state-badge" class="state-badge state-loading">… booting</span>
|
||||||
|
<span id="model-chip" class="model-chip" hidden></span>
|
||||||
<span id="last-turn" class="last-turn" hidden></span>
|
<span id="last-turn" class="last-turn" hidden></span>
|
||||||
<button type="button" id="cancel-btn" class="btn-cancel-turn" hidden>■ cancel turn</button>
|
<button type="button" id="cancel-btn" class="btn-cancel-turn" hidden>■ cancel turn</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -140,6 +140,13 @@ pub enum TurnState {
|
||||||
Compacting,
|
Compacting,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Default claude model when nothing's been set at runtime. The
|
||||||
|
/// operator can switch via `/model <name>` in the web terminal; the
|
||||||
|
/// chosen model lives in `Bus::model` for the rest of the harness
|
||||||
|
/// process's life (resets on restart, by design — operator overrides
|
||||||
|
/// shouldn't survive accidentally).
|
||||||
|
pub const DEFAULT_MODEL: &str = "haiku";
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Bus {
|
pub struct Bus {
|
||||||
tx: Arc<broadcast::Sender<LiveEvent>>,
|
tx: Arc<broadcast::Sender<LiveEvent>>,
|
||||||
|
|
@ -149,6 +156,9 @@ pub struct Bus {
|
||||||
store: Option<Arc<EventStore>>,
|
store: Option<Arc<EventStore>>,
|
||||||
/// Current turn-loop state + since-when (unix seconds).
|
/// Current turn-loop state + since-when (unix seconds).
|
||||||
state: Arc<Mutex<(TurnState, i64)>>,
|
state: Arc<Mutex<(TurnState, i64)>>,
|
||||||
|
/// Model name passed to `claude --model`. Default `haiku`; the
|
||||||
|
/// operator can override at runtime via `POST /api/model`.
|
||||||
|
model: Arc<Mutex<String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Bus {
|
impl Bus {
|
||||||
|
|
@ -171,9 +181,23 @@ impl Bus {
|
||||||
tx: Arc::new(tx),
|
tx: Arc::new(tx),
|
||||||
store,
|
store,
|
||||||
state: Arc::new(Mutex::new((TurnState::Idle, now_unix()))),
|
state: Arc::new(Mutex::new((TurnState::Idle, now_unix()))),
|
||||||
|
model: Arc::new(Mutex::new(DEFAULT_MODEL.to_owned())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Currently-selected claude model name. Read on every turn so a
|
||||||
|
/// `/model <name>` flip takes effect on the next turn.
|
||||||
|
#[must_use]
|
||||||
|
pub fn model(&self) -> String {
|
||||||
|
self.model.lock().unwrap().clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Switch the model for future turns. The current turn (if any)
|
||||||
|
/// keeps the model it was already running.
|
||||||
|
pub fn set_model(&self, name: impl Into<String>) {
|
||||||
|
*self.model.lock().unwrap() = name.into();
|
||||||
|
}
|
||||||
|
|
||||||
/// Update the harness's authoritative turn-loop state. Records
|
/// Update the harness's authoritative turn-loop state. Records
|
||||||
/// the transition time so `state_snapshot` can return a since-age.
|
/// the transition time so `state_snapshot` can return a since-age.
|
||||||
pub fn set_state(&self, next: TurnState) {
|
pub fn set_state(&self, next: TurnState) {
|
||||||
|
|
|
||||||
|
|
@ -227,13 +227,14 @@ async fn run_claude(
|
||||||
flavor: mcp::Flavor,
|
flavor: mcp::Flavor,
|
||||||
mode: ClaudeMode,
|
mode: ClaudeMode,
|
||||||
) -> Result<bool> {
|
) -> Result<bool> {
|
||||||
|
let model = bus.model();
|
||||||
let mut cmd = Command::new("claude");
|
let mut cmd = Command::new("claude");
|
||||||
cmd.arg("--print")
|
cmd.arg("--print")
|
||||||
.arg("--verbose")
|
.arg("--verbose")
|
||||||
.arg("--output-format")
|
.arg("--output-format")
|
||||||
.arg("stream-json")
|
.arg("stream-json")
|
||||||
.arg("--model")
|
.arg("--model")
|
||||||
.arg("haiku")
|
.arg(&model)
|
||||||
.arg("--continue")
|
.arg("--continue")
|
||||||
.arg("--settings")
|
.arg("--settings")
|
||||||
.arg(settings);
|
.arg(settings);
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,7 @@ pub async fn serve(
|
||||||
.route("/login/cancel", post(post_login_cancel))
|
.route("/login/cancel", post(post_login_cancel))
|
||||||
.route("/api/cancel", post(post_cancel_turn))
|
.route("/api/cancel", post(post_cancel_turn))
|
||||||
.route("/api/compact", post(post_compact))
|
.route("/api/compact", post(post_compact))
|
||||||
|
.route("/api/model", post(post_set_model))
|
||||||
.with_state(state);
|
.with_state(state);
|
||||||
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
||||||
let listener = bind_with_retry(addr, "web UI").await?;
|
let listener = bind_with_retry(addr, "web UI").await?;
|
||||||
|
|
@ -93,16 +94,17 @@ pub async fn serve(
|
||||||
// Static assets + state snapshot
|
// Static assets + state snapshot
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/// Bind a TCP listener, retrying on `AddrInUse` for up to ~20s.
|
/// Bind a TCP listener with `SO_REUSEADDR` set, retrying on
|
||||||
/// nspawn restarts can race the previous harness's socket release;
|
/// `AddrInUse` for up to ~20s. nspawn restarts can race the previous
|
||||||
/// without retry the new harness fails to bind and systemd just
|
/// harness's socket release; `SO_REUSEADDR` lets us reclaim a port
|
||||||
/// keeps restarting it. `SO_REUSEADDR` would be the proper fix but
|
/// still in `TIME_WAIT` from a clean previous exit, and the retry
|
||||||
/// would require socket2; retry is good enough here.
|
/// covers the case where the previous process is genuinely still
|
||||||
|
/// alive (systemd restart-delay overlap).
|
||||||
async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::TcpListener> {
|
async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::TcpListener> {
|
||||||
let mut delay_ms = 250u64;
|
let mut delay_ms = 250u64;
|
||||||
let mut attempts = 0u32;
|
let mut attempts = 0u32;
|
||||||
loop {
|
loop {
|
||||||
match tokio::net::TcpListener::bind(addr).await {
|
match try_bind(addr) {
|
||||||
Ok(l) => return Ok(l),
|
Ok(l) => return Ok(l),
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
|
|
@ -120,6 +122,16 @@ async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::Tc
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn try_bind(addr: SocketAddr) -> std::io::Result<tokio::net::TcpListener> {
|
||||||
|
let sock = match addr {
|
||||||
|
SocketAddr::V4(_) => tokio::net::TcpSocket::new_v4()?,
|
||||||
|
SocketAddr::V6(_) => tokio::net::TcpSocket::new_v6()?,
|
||||||
|
};
|
||||||
|
sock.set_reuseaddr(true)?;
|
||||||
|
sock.bind(addr)?;
|
||||||
|
sock.listen(1024)
|
||||||
|
}
|
||||||
|
|
||||||
async fn serve_index() -> impl IntoResponse {
|
async fn serve_index() -> impl IntoResponse {
|
||||||
(
|
(
|
||||||
[("content-type", "text/html; charset=utf-8")],
|
[("content-type", "text/html; charset=utf-8")],
|
||||||
|
|
@ -158,6 +170,10 @@ struct StateSnapshot {
|
||||||
/// client-side off this rather than tracking it from SSE events.
|
/// client-side off this rather than tracking it from SSE events.
|
||||||
turn_state: crate::events::TurnState,
|
turn_state: crate::events::TurnState,
|
||||||
turn_state_since: i64,
|
turn_state_since: i64,
|
||||||
|
/// Currently-active claude model name. Reflected on the page so
|
||||||
|
/// the operator can see what they just switched to (and what's
|
||||||
|
/// in flight). Mutable at runtime via `POST /api/model`.
|
||||||
|
model: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
|
|
@ -193,6 +209,7 @@ async fn api_state(State(state): State<AppState>) -> axum::Json<StateSnapshot> {
|
||||||
.unwrap_or(7000);
|
.unwrap_or(7000);
|
||||||
let inbox = recent_inbox(&state.socket, state.flavor).await;
|
let inbox = recent_inbox(&state.socket, state.flavor).await;
|
||||||
let (turn_state, turn_state_since) = state.bus.state_snapshot();
|
let (turn_state, turn_state_since) = state.bus.state_snapshot();
|
||||||
|
let model = state.bus.model();
|
||||||
axum::Json(StateSnapshot {
|
axum::Json(StateSnapshot {
|
||||||
label: state.label.clone(),
|
label: state.label.clone(),
|
||||||
dashboard_port,
|
dashboard_port,
|
||||||
|
|
@ -201,6 +218,7 @@ async fn api_state(State(state): State<AppState>) -> axum::Json<StateSnapshot> {
|
||||||
inbox,
|
inbox,
|
||||||
turn_state,
|
turn_state,
|
||||||
turn_state_since,
|
turn_state_since,
|
||||||
|
model,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -351,6 +369,30 @@ async fn post_login_cancel(State(state): State<AppState>) -> Response {
|
||||||
/// the "/compact done" note) lands in the live event panel like any
|
/// the "/compact done" note) lands in the live event panel like any
|
||||||
/// other turn. If a regular turn is in flight, claude's own session
|
/// other turn. If a regular turn is in flight, claude's own session
|
||||||
/// lock will reject this one and we surface the error as a Note.
|
/// lock will reject this one and we surface the error as a Note.
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct ModelForm {
|
||||||
|
model: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Switch the model for future turns. The current turn (if any)
|
||||||
|
/// keeps its model; `/model <name>` applies starting with the next
|
||||||
|
/// `recv` cycle. Empty / whitespace-only inputs are rejected. No
|
||||||
|
/// claude-side validation — we just hand the string through to
|
||||||
|
/// `claude --model <name>`; an unknown model surfaces as a turn
|
||||||
|
/// failure in the live panel and the operator can revert.
|
||||||
|
async fn post_set_model(State(state): State<AppState>, Form(form): Form<ModelForm>) -> Response {
|
||||||
|
let name = form.model.trim();
|
||||||
|
if name.is_empty() {
|
||||||
|
return error_response("model: name required");
|
||||||
|
}
|
||||||
|
state.bus.set_model(name);
|
||||||
|
state.bus.emit(crate::events::LiveEvent::Note(format!(
|
||||||
|
"operator: /model — claude model set to '{name}' for future turns"
|
||||||
|
)));
|
||||||
|
tracing::info!(%name, "operator set model");
|
||||||
|
Redirect::to("/").into_response()
|
||||||
|
}
|
||||||
|
|
||||||
async fn post_compact(State(state): State<AppState>) -> Response {
|
async fn post_compact(State(state): State<AppState>) -> Response {
|
||||||
let bus = state.bus.clone();
|
let bus = state.bus.clone();
|
||||||
let socket = state.socket.clone();
|
let socket = state.socket.clone();
|
||||||
|
|
|
||||||
|
|
@ -72,13 +72,13 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
||||||
// `/messages/stream` for broker traffic.
|
// `/messages/stream` for broker traffic.
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/// Retry-on-AddrInUse bind. Same shape as the per-agent variant —
|
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant —
|
||||||
/// hive-c0re restarts also race the previous process's socket release.
|
/// hive-c0re restarts also race the previous process's socket release.
|
||||||
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
||||||
let mut delay_ms = 250u64;
|
let mut delay_ms = 250u64;
|
||||||
let mut attempts = 0u32;
|
let mut attempts = 0u32;
|
||||||
loop {
|
loop {
|
||||||
match tokio::net::TcpListener::bind(addr).await {
|
match try_bind(addr) {
|
||||||
Ok(l) => return Ok(l),
|
Ok(l) => return Ok(l),
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
|
|
@ -96,6 +96,16 @@ async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn try_bind(addr: SocketAddr) -> std::io::Result<tokio::net::TcpListener> {
|
||||||
|
let sock = match addr {
|
||||||
|
SocketAddr::V4(_) => tokio::net::TcpSocket::new_v4()?,
|
||||||
|
SocketAddr::V6(_) => tokio::net::TcpSocket::new_v6()?,
|
||||||
|
};
|
||||||
|
sock.set_reuseaddr(true)?;
|
||||||
|
sock.bind(addr)?;
|
||||||
|
sock.listen(1024)
|
||||||
|
}
|
||||||
|
|
||||||
async fn serve_index() -> impl IntoResponse {
|
async fn serve_index() -> impl IntoResponse {
|
||||||
Html(include_str!("../assets/index.html"))
|
Html(include_str!("../assets/index.html"))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -46,13 +46,18 @@ const DEFAULT_MEMORY_MAX: &str = "2G";
|
||||||
const DEFAULT_CPU_QUOTA: &str = "50%";
|
const DEFAULT_CPU_QUOTA: &str = "50%";
|
||||||
|
|
||||||
/// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`.
|
/// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`.
|
||||||
/// For sub-agents the port is sticky once chosen: looked up from
|
/// For sub-agents the port is sticky once chosen:
|
||||||
/// `agent_state_root(name)/port` if present, otherwise derived from
|
///
|
||||||
/// the FNV-1a hash of the name and *probed forward* through the
|
/// - **Port file present** (`state_root/port`): use it. End of story.
|
||||||
/// allocated range to skip any port another sub-agent has already
|
/// - **Port file absent, applied flake present**: this is a legacy
|
||||||
/// claimed (birthday-paradox collisions are real even at 2–3
|
/// agent whose container is already bound to the bare
|
||||||
/// agents). The chosen port is written back so subsequent calls
|
/// `port_hash(name)`. Don't probe; just migrate by writing that
|
||||||
/// resolve to the same value without re-probing.
|
/// value to the port file. The container stays where it is and
|
||||||
|
/// subsequent renders agree with it.
|
||||||
|
/// - **Port file absent, no applied flake**: this is a fresh spawn.
|
||||||
|
/// Probe forward from `port_hash(name)` to skip any port another
|
||||||
|
/// sub-agent has already claimed (via port file or legacy hash).
|
||||||
|
/// Write the chosen port back.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn agent_web_port(name: &str) -> u16 {
|
pub fn agent_web_port(name: &str) -> u16 {
|
||||||
if name == MANAGER_NAME {
|
if name == MANAGER_NAME {
|
||||||
|
|
@ -66,6 +71,13 @@ pub fn agent_web_port(name: &str) -> u16 {
|
||||||
{
|
{
|
||||||
return port;
|
return port;
|
||||||
}
|
}
|
||||||
|
let applied_exists = crate::coordinator::Coordinator::agent_applied_dir(name).exists();
|
||||||
|
let chosen = if applied_exists {
|
||||||
|
// Legacy agent — container already running on the hashed
|
||||||
|
// port. Don't move it; just persist the value so future
|
||||||
|
// calls bypass this path.
|
||||||
|
port_hash(name)
|
||||||
|
} else {
|
||||||
let taken = scan_taken_ports(name);
|
let taken = scan_taken_ports(name);
|
||||||
let start = port_hash(name);
|
let start = port_hash(name);
|
||||||
let mut port = start;
|
let mut port = start;
|
||||||
|
|
@ -76,17 +88,19 @@ pub fn agent_web_port(name: &str) -> u16 {
|
||||||
port = next_port(port);
|
port = next_port(port);
|
||||||
if port == start {
|
if port == start {
|
||||||
// Range fully exhausted (very unlikely — 900 slots) —
|
// Range fully exhausted (very unlikely — 900 slots) —
|
||||||
// give up and just use the hashed value; collisions are
|
// give up and use the hashed value; collisions are
|
||||||
// surfaced as bind errors by the harness retry loop.
|
// surfaced as bind errors by the harness retry loop.
|
||||||
tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
|
tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
|
||||||
return start;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let _ = std::fs::create_dir_all(&state_root);
|
|
||||||
if let Err(e) = std::fs::write(&port_file, format!("{port}\n")) {
|
|
||||||
tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed");
|
|
||||||
}
|
|
||||||
port
|
port
|
||||||
|
};
|
||||||
|
let _ = std::fs::create_dir_all(&state_root);
|
||||||
|
if let Err(e) = std::fs::write(&port_file, format!("{chosen}\n")) {
|
||||||
|
tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed");
|
||||||
|
}
|
||||||
|
chosen
|
||||||
}
|
}
|
||||||
|
|
||||||
fn port_hash(name: &str) -> u16 {
|
fn port_hash(name: &str) -> u16 {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue