surface agent-vs-agent port collisions (manager:8000 can't collide)

manager is fixed at 8000, sub-agents are 8100-8999, so collisions
are strictly between two sub-agents hashing to the same value.
the colliding container's harness restart-loops on AddrInUse —
which the user just hit on :8945. previously the only sign was a
buried journalctl warn line.

now surfaced two ways:

- lifecycle::spawn / rebuild preflight: walks the live container
  list, computes each agent's hashed port, refuses with
  'port N already taken by <other> — rename one of them' if any
  running sub-agent shares the new agent's port. so the operator
  sees an actionable error in the dashboard's transient pill /
  approve-result instead of waiting for the harness to die.

- /api/state grows a port_conflicts: [{port, agents: [...]}]
  array; dashboard renders a pulsing red banner above the
  containers list listing each cluster. matches the questions
  panel pulse so it's hard to miss.
This commit is contained in:
müde 2026-05-15 22:08:19 +02:00
parent 2029840671
commit 6a2ffd521b
4 changed files with 100 additions and 0 deletions

View file

@ -215,6 +215,18 @@
const root = $('containers-section'); const root = $('containers-section');
root.innerHTML = ''; root.innerHTML = '';
// Port-hash collision banner. When /api/state reports clusters of
// agents sharing one web UI port, show them before anything else in
// the containers section; the operator fixes it by renaming one agent
// per cluster and rebuilding.
if (s.port_conflicts && s.port_conflicts.length) {
  // One ":<port> (<a> + <b>)" fragment per colliding cluster.
  const clusters = [];
  for (const conflict of s.port_conflicts) {
    clusters.push(`:${conflict.port} (${conflict.agents.join(' + ')})`);
  }
  const heading = el('strong', {}, '⚠ port collision');
  const notice = el('div', { class: 'port-conflict' }, heading, ' — ');
  // Appending a string adds it as a text node, so agent names are
  // never interpreted as markup.
  notice.append(`${clusters.join('; ')}. rename one of each and ↻ R3BU1LD.`);
  root.append(notice);
}
if (s.any_stale) { if (s.any_stale) {
root.append(form( root.append(form(
'/update-all', 'btn-rebuild', '↻ UPD4TE 4LL', '/update-all', 'btn-rebuild', '↻ UPD4TE 4LL',

View file

@ -193,6 +193,22 @@ a:hover {
/* Notification controls sit between the banner and the /* Notification controls sit between the banner and the
containers section. Hidden by JS when notifications are containers section. Hidden by JS when notifications are
unsupported, denied, or already in the right state. */ unsupported, denied, or already in the right state. */
/* Port-collision banner: appears above the containers list when
two sub-agents hash to the same web UI port. This is critical:
without resolution, one of the harnesses will restart-loop on
AddrInUse. */
.port-conflict {
/* Low-opacity (8% alpha) tint behind a solid --red border and text. */
background: rgba(243, 139, 168, 0.08);
border: 1px solid var(--red);
color: var(--red);
padding: 0.5em 0.8em;
margin-bottom: 0.6em;
border-radius: 4px;
/* Soft glow around the text, same RGB as the background tint. */
text-shadow: 0 0 6px rgba(243, 139, 168, 0.4);
/* Pulses via the questions-pulse keyframes (not visible in this hunk;
   presumably the ones the questions panel uses — confirm). */
animation: questions-pulse 2.4s ease-in-out infinite;
}
/* Force the bold "port collision" label to the banner's red. */
.port-conflict strong { color: var(--red); }
.notif-row { .notif-row {
display: flex; display: flex;
gap: 0.5em; gap: 0.5em;

View file

@ -146,6 +146,17 @@ struct StateSnapshot {
/// survive after a destroy-without-purge. The operator can re-spawn /// survive after a destroy-without-purge. The operator can re-spawn
/// with the same name to resume, or PURG3 to wipe them. /// with the same name to resume, or PURG3 to wipe them.
tombstones: Vec<TombstoneView>, tombstones: Vec<TombstoneView>,
/// Sub-agents whose FNV-1a hashed web UI port collides with at
/// least one other agent. Operator resolves by renaming. The
/// dashboard renders a banner at the top listing each cluster.
port_conflicts: Vec<PortConflict>,
}
/// One cluster of agents that share a single web UI port. Serialized
/// as an element of the snapshot's `port_conflicts` array.
#[derive(Serialize)]
struct PortConflict {
/// The web UI port shared by every agent listed in `agents`.
port: u16,
/// All agent names sharing this port (sorted, ≥2 entries).
agents: Vec<String>,
} }
#[derive(Serialize)] #[derive(Serialize)]
@ -216,6 +227,7 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
let transients = build_transient_views(&raw_containers, &transient_snapshot); let transients = build_transient_views(&raw_containers, &transient_snapshot);
let approvals = build_approval_views(pending_approvals).await; let approvals = build_approval_views(pending_approvals).await;
let tombstones = build_tombstone_views(&state.coord, &containers, &transient_snapshot); let tombstones = build_tombstone_views(&state.coord, &containers, &transient_snapshot);
let port_conflicts = build_port_conflicts(&containers);
let operator_inbox = state let operator_inbox = state
.coord .coord
@ -234,9 +246,31 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
operator_inbox, operator_inbox,
questions, questions,
tombstones, tombstones,
port_conflicts,
}) })
} }
/// Bucket the given containers by web UI port and report every port
/// that more than one container claims.
///
/// Each returned [`PortConflict`] carries the shared port and the
/// names of all containers on it, sorted alphabetically. The result is
/// ordered by ascending port. Ports with a single occupant are
/// dropped, so an empty vector means there is nothing to resolve.
///
/// The manager sits on a fixed port (8000) outside the sub-agent range
/// (8100..8999), so in practice every reported cluster consists of
/// sub-agents only; the operator resolves it by renaming.
fn build_port_conflicts(containers: &[ContainerView]) -> Vec<PortConflict> {
    use std::collections::BTreeMap;

    // BTreeMap keeps the clusters ordered by port without a later sort.
    let mut names_by_port: BTreeMap<u16, Vec<String>> = BTreeMap::new();
    for view in containers {
        names_by_port
            .entry(view.port)
            .or_default()
            .push(view.name.clone());
    }

    let mut conflicts = Vec::new();
    for (port, mut agents) in names_by_port {
        // A lone occupant is not a collision.
        if agents.len() <= 1 {
            continue;
        }
        agents.sort();
        conflicts.push(PortConflict { port, agents });
    }
    conflicts
}
/// Build `ContainerView`s for every live nixos-container. Returns the /// Build `ContainerView`s for every live nixos-container. Returns the
/// list and whether any container is stale (drives the "↻ UPD4TE 4LL" /// list and whether any container is stale (drives the "↻ UPD4TE 4LL"
/// banner). /// banner).

View file

@ -107,6 +107,32 @@ fn validate(name: &str) -> Result<()> {
Ok(()) Ok(())
} }
/// Preflight check for spawn / rebuild: find a running container,
/// other than `self_name`, whose hashed web UI port equals the port
/// `self_name` hashes to.
///
/// Returns the name of the first such container in `list()` order
/// (with the agent prefix stripped), or `None` when the port is free.
/// Without this check the harness inside the colliding container would
/// loop on `AddrInUse` forever; callers turn a `Some` into a loud,
/// actionable error instead.
///
/// Best-effort: if listing containers fails, the list is treated as
/// empty and no collision is reported.
async fn port_collision(self_name: &str) -> Option<String> {
    let wanted_port = agent_web_port(self_name);
    let containers = list().await.unwrap_or_default();
    for container in containers {
        // Map the raw container name to a logical agent name; anything
        // that is neither the manager nor agent-prefixed is ignored.
        let candidate = if container == MANAGER_NAME {
            MANAGER_NAME.to_owned()
        } else {
            match container.strip_prefix(AGENT_PREFIX) {
                Some(stripped) => stripped.to_owned(),
                None => continue,
            }
        };
        // Cheap comparisons first; the running-state probe is only
        // awaited for a different agent that hashes to the same port.
        if candidate != self_name
            && agent_web_port(&candidate) == wanted_port
            && is_running(&candidate).await
        {
            return Some(candidate);
        }
    }
    None
}
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub async fn spawn( pub async fn spawn(
name: &str, name: &str,
@ -119,6 +145,12 @@ pub async fn spawn(
dashboard_port: u16, dashboard_port: u16,
) -> Result<()> { ) -> Result<()> {
validate(name)?; validate(name)?;
if let Some(other) = port_collision(name).await {
bail!(
"port {} is already taken by '{other}' — rename one of them and retry",
agent_web_port(name)
);
}
setup_proposed(proposed_dir, name).await?; setup_proposed(proposed_dir, name).await?;
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?; setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
ensure_claude_dir(claude_dir)?; ensure_claude_dir(claude_dir)?;
@ -192,6 +224,12 @@ pub async fn rebuild(
dashboard_port: u16, dashboard_port: u16,
) -> Result<()> { ) -> Result<()> {
validate(name)?; validate(name)?;
if let Some(other) = port_collision(name).await {
bail!(
"port {} is already taken by '{other}' — rename one of them and retry",
agent_web_port(name)
);
}
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?; setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
ensure_claude_dir(claude_dir)?; ensure_claude_dir(claude_dir)?;
ensure_state_dir(notes_dir)?; ensure_state_dir(notes_dir)?;