From 6a2ffd521bc4ece54846396da84e37ce289915d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 22:08:19 +0200 Subject: [PATCH] surface agent-vs-agent port collisions (manager:8000 can't collide) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit manager is fixed at 8000, sub-agents are 8100-8999, so collisions are strictly between two sub-agents hashing to the same value. the colliding container's harness restart-loops on AddrInUse — which the user just hit on :8945. previously the only sign was a buried journalctl warn line. now surfaced two ways: - lifecycle::spawn / rebuild preflight: walks the live container list, computes each agent's hashed port, refuses with 'port N already taken by <other> — rename one of them' if any running sub-agent shares the new agent's port. so the operator sees an actionable error in the dashboard's transient pill / approve-result instead of waiting for the harness to die. - /api/state grows a port_conflicts: [{port, agents: [...]}] array; dashboard renders a pulsing red banner above the containers list listing each cluster. matches the questions panel pulse so it's hard to miss. --- hive-c0re/assets/app.js | 12 +++++++++++ hive-c0re/assets/dashboard.css | 16 ++++++++++++++ hive-c0re/src/dashboard.rs | 34 ++++++++++++++++++++++++++++++ hive-c0re/src/lifecycle.rs | 38 ++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+) diff --git a/hive-c0re/assets/app.js b/hive-c0re/assets/app.js index c367693..d000976 100644 --- a/hive-c0re/assets/app.js +++ b/hive-c0re/assets/app.js @@ -215,6 +215,18 @@ const root = $('containers-section'); root.innerHTML = ''; + // Port-hash collisions: rename one of the listed agents and + // rebuild. The banner sits above the agent list so it's the + // first thing the operator sees when something's wedged. 
+ if (s.port_conflicts && s.port_conflicts.length) { + const banner = el('div', { class: 'port-conflict' }, + el('strong', {}, '⚠ port collision'), ' — '); + const groups = s.port_conflicts.map((c) => + `:${c.port} (${c.agents.join(' + ')})`).join('; '); + banner.append(groups + '. rename one of each and ↻ R3BU1LD.'); + root.append(banner); + } + if (s.any_stale) { root.append(form( '/update-all', 'btn-rebuild', '↻ UPD4TE 4LL', diff --git a/hive-c0re/assets/dashboard.css b/hive-c0re/assets/dashboard.css index 0245ac6..b487d50 100644 --- a/hive-c0re/assets/dashboard.css +++ b/hive-c0re/assets/dashboard.css @@ -193,6 +193,22 @@ a:hover { /* Notification controls — sit between the banner and the containers section. Hidden by JS when notifications are unsupported, denied, or already in the right state. */ +/* Port-collision banner: appears above the containers list when + two sub-agents hash to the same web UI port. Critical — without + resolution, one of the harnesses will restart-loop on + AddrInUse. */ +.port-conflict { + background: rgba(243, 139, 168, 0.08); + border: 1px solid var(--red); + color: var(--red); + padding: 0.5em 0.8em; + margin-bottom: 0.6em; + border-radius: 4px; + text-shadow: 0 0 6px rgba(243, 139, 168, 0.4); + animation: questions-pulse 2.4s ease-in-out infinite; +} +.port-conflict strong { color: var(--red); } + .notif-row { display: flex; gap: 0.5em; diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index ceaafa1..746dcbb 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -146,6 +146,17 @@ struct StateSnapshot { /// survive after a destroy-without-purge. The operator can re-spawn /// with the same name to resume, or PURG3 to wipe them. tombstones: Vec, + /// Sub-agents whose FNV-1a hashed web UI port collides with at + /// least one other agent. Operator resolves by renaming. The + /// dashboard renders a banner at the top listing each cluster. 
+ port_conflicts: Vec<PortConflict>, +} + +#[derive(Serialize)] +struct PortConflict { + port: u16, + /// All agent names sharing this port (sorted, ≥2 entries). + agents: Vec<String>, } #[derive(Serialize)] @@ -216,6 +227,7 @@ async fn api_state(headers: HeaderMap, State(state): State) -> axum::J let transients = build_transient_views(&raw_containers, &transient_snapshot); let approvals = build_approval_views(pending_approvals).await; let tombstones = build_tombstone_views(&state.coord, &containers, &transient_snapshot); + let port_conflicts = build_port_conflicts(&containers); let operator_inbox = state .coord @@ -234,9 +246,31 @@ async fn api_state(headers: HeaderMap, State(state): State) -> axum::J operator_inbox, questions, tombstones, + port_conflicts, }) } +/// Group live containers by their assigned web UI port; clusters with +/// more than one member are port-hash collisions the operator needs +/// to resolve by renaming. Manager (fixed at 8000) and sub-agents +/// (8100..8999) can't collide with each other — collisions are +/// strictly between sub-agents. +fn build_port_conflicts(containers: &[ContainerView]) -> Vec<PortConflict> { + let mut by_port: std::collections::BTreeMap<u16, Vec<String>> = + std::collections::BTreeMap::new(); + for c in containers { + by_port.entry(c.port).or_default().push(c.name.clone()); + } + by_port + .into_iter() + .filter(|(_, agents)| agents.len() > 1) + .map(|(port, mut agents)| { + agents.sort(); + PortConflict { port, agents } + }) + .collect() +} + /// Build `ContainerView`s for every live nixos-container. Returns the /// list and whether any container is stale (drives the "↻ UPD4TE 4LL" /// banner). diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index 130ddfd..e67fa93 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -107,6 +107,32 @@ fn validate(name: &str) -> Result<()> { Ok(()) } +/// First name (≠ `self_name`) currently running whose hashed port +/// matches this agent's. 
The harness inside the colliding container +/// would otherwise loop on `AddrInUse` forever; we surface the +/// conflict here so spawn / rebuild fails loudly with an actionable +/// message instead. +async fn port_collision(self_name: &str) -> Option<String> { + let port = agent_web_port(self_name); + let raw = list().await.unwrap_or_default(); + for c in raw { + let other = if c == MANAGER_NAME { + MANAGER_NAME.to_owned() + } else if let Some(n) = c.strip_prefix(AGENT_PREFIX) { + n.to_owned() + } else { + continue; + }; + if other == self_name { + continue; + } + if agent_web_port(&other) == port && is_running(&other).await { + return Some(other); + } + } + None +} + #[allow(clippy::too_many_arguments)] pub async fn spawn( name: &str, @@ -119,6 +145,12 @@ pub async fn spawn( dashboard_port: u16, ) -> Result<()> { validate(name)?; + if let Some(other) = port_collision(name).await { + bail!( + "port {} is already taken by '{other}' — rename one of them and retry", + agent_web_port(name) + ); + } setup_proposed(proposed_dir, name).await?; setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?; ensure_claude_dir(claude_dir)?; @@ -192,6 +224,12 @@ pub async fn rebuild( dashboard_port: u16, ) -> Result<()> { validate(name)?; + if let Some(other) = port_collision(name).await { + bail!( + "port {} is already taken by '{other}' — rename one of them and retry", + agent_web_port(name) + ); + } setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?; ensure_claude_dir(claude_dir)?; ensure_state_dir(notes_dir)?;