surface agent-vs-agent port collisions (manager:8000 can't collide)

manager is fixed at 8000 and sub-agents get a hashed port in
8100-8999, so collisions can only happen between sub-agents whose
names hash to the same port. the colliding container's harness
restart-loops on AddrInUse — which the user just hit on :8945.
previously the only sign was a buried journalctl warn line.

now surfaced two ways:

- lifecycle::spawn / rebuild preflight: walks the live container
  list, computes each agent's hashed port, refuses with
  'port N already taken by <other> — rename one of them' if any
  running sub-agent shares the new agent's port. so the operator
  sees an actionable error in the dashboard's transient pill /
  approve-result instead of waiting for the harness to die.

- /api/state grows a port_conflicts: [{port, agents: [...]}]
  array; dashboard renders a pulsing red banner above the
  containers list listing each cluster. matches the questions
  panel pulse so it's hard to miss.
This commit is contained in:
müde 2026-05-15 22:08:19 +02:00
parent 2029840671
commit 6a2ffd521b
4 changed files with 100 additions and 0 deletions

View file

@ -146,6 +146,17 @@ struct StateSnapshot {
/// survive after a destroy-without-purge. The operator can re-spawn
/// with the same name to resume, or PURG3 to wipe them.
tombstones: Vec<TombstoneView>,
/// Sub-agents whose FNV-1a hashed web UI port collides with at
/// least one other agent. Operator resolves by renaming. The
/// dashboard renders a banner at the top listing each cluster.
port_conflicts: Vec<PortConflict>,
}
/// One cluster of agents that were assigned the same web UI port.
/// Serialized as an element of the state snapshot's `port_conflicts`
/// list.
#[derive(Serialize)]
struct PortConflict {
/// The port shared by every agent listed in `agents`.
port: u16,
/// All agent names sharing this port (sorted, ≥2 entries).
agents: Vec<String>,
}
#[derive(Serialize)]
@ -216,6 +227,7 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
let transients = build_transient_views(&raw_containers, &transient_snapshot);
let approvals = build_approval_views(pending_approvals).await;
let tombstones = build_tombstone_views(&state.coord, &containers, &transient_snapshot);
let port_conflicts = build_port_conflicts(&containers);
let operator_inbox = state
.coord
@ -234,9 +246,31 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
operator_inbox,
questions,
tombstones,
port_conflicts,
})
}
/// Collect every web UI port that more than one live container claims.
///
/// Containers are bucketed by `port`; a bucket holding a single name is
/// not a conflict and is dropped. Each remaining bucket becomes one
/// [`PortConflict`] with its agent names sorted. Results come back in
/// ascending port order because the buckets live in a `BTreeMap`.
///
/// The manager sits on a fixed port (8000) outside the sub-agent range
/// (8100..8999), so any cluster reported here is a hash collision
/// between sub-agents that the operator resolves by renaming one.
fn build_port_conflicts(containers: &[ContainerView]) -> Vec<PortConflict> {
    use std::collections::BTreeMap;

    // Bucket container names under the port each one was assigned.
    let mut names_by_port: BTreeMap<u16, Vec<String>> = BTreeMap::new();
    for view in containers {
        names_by_port
            .entry(view.port)
            .or_default()
            .push(view.name.clone());
    }

    // Walk the buckets in port order, keeping only the shared ones.
    let mut conflicts = Vec::new();
    for (port, mut agents) in names_by_port {
        if agents.len() <= 1 {
            continue;
        }
        agents.sort();
        conflicts.push(PortConflict { port, agents });
    }
    conflicts
}
/// Build `ContainerView`s for every live nixos-container. Returns the
/// list and whether any container is stale (drives the "↻ UPD4TE 4LL"
/// banner).

View file

@ -107,6 +107,32 @@ fn validate(name: &str) -> Result<()> {
Ok(())
}
/// Find a running container that would contend with `self_name` for its
/// web UI port.
///
/// Walks the container list in order. Each entry is mapped to a logical
/// name: the manager keeps its name, prefixed agents have the prefix
/// removed, and anything else is ignored. Returns the first logical name
/// other than `self_name` whose hashed port equals this agent's and which
/// is currently running. Returns `None` when nothing matches, including
/// when listing containers fails (that error is treated as an empty list).
///
/// Left undetected, the harness inside the colliding container would keep
/// looping on `AddrInUse`; reporting the conflict here lets spawn /
/// rebuild fail loudly with an actionable message instead.
async fn port_collision(self_name: &str) -> Option<String> {
    let wanted = agent_web_port(self_name);
    let containers = list().await.unwrap_or_default();
    for container in containers {
        let candidate = if container == MANAGER_NAME {
            MANAGER_NAME.to_owned()
        } else {
            match container.strip_prefix(AGENT_PREFIX) {
                Some(stripped) => stripped.to_owned(),
                // Neither the manager nor an agent container: not ours.
                None => continue,
            }
        };
        let is_self = candidate == self_name;
        // Port comparison comes before the running check so `is_running`
        // is only queried for containers that actually share the port.
        if !is_self && agent_web_port(&candidate) == wanted && is_running(&candidate).await {
            return Some(candidate);
        }
    }
    None
}
#[allow(clippy::too_many_arguments)]
pub async fn spawn(
name: &str,
@ -119,6 +145,12 @@ pub async fn spawn(
dashboard_port: u16,
) -> Result<()> {
validate(name)?;
if let Some(other) = port_collision(name).await {
bail!(
"port {} is already taken by '{other}' — rename one of them and retry",
agent_web_port(name)
);
}
setup_proposed(proposed_dir, name).await?;
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
ensure_claude_dir(claude_dir)?;
@ -192,6 +224,12 @@ pub async fn rebuild(
dashboard_port: u16,
) -> Result<()> {
validate(name)?;
if let Some(other) = port_collision(name).await {
bail!(
"port {} is already taken by '{other}' — rename one of them and retry",
agent_web_port(name)
);
}
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
ensure_claude_dir(claude_dir)?;
ensure_state_dir(notes_dir)?;