surface agent-vs-agent port collisions (manager:8000 can't collide)
manager is fixed at 8000, sub-agents are 8100-8999, so collisions
are strictly between two sub-agents hashing to the same value.
the colliding container's harness restart-loops on AddrInUse —
which the user just hit on :8945. previously the only sign was a
buried journalctl warn line.
now surfaced two ways:
- lifecycle::spawn / rebuild preflight: walks the live container
list, computes each agent's hashed port, and refuses with
"port N is already taken by '<other>' — rename one of them and retry" if any
running sub-agent shares the new agent's port. so the operator
sees an actionable error in the dashboard's transient pill /
approve-result instead of waiting for the harness to die.
- /api/state grows a port_conflicts: [{port, agents: [...]}]
array; dashboard renders a pulsing red banner above the
containers list listing each cluster. matches the questions
panel pulse so it's hard to miss.
This commit is contained in:
parent
2029840671
commit
6a2ffd521b
4 changed files with 100 additions and 0 deletions
|
|
@ -215,6 +215,18 @@
|
||||||
const root = $('containers-section');
|
const root = $('containers-section');
|
||||||
root.innerHTML = '';
|
root.innerHTML = '';
|
||||||
|
|
||||||
|
// Port-hash collisions: rename one of the listed agents and
|
||||||
|
// rebuild. The banner sits above the agent list so it's the
|
||||||
|
// first thing the operator sees when something's wedged.
|
||||||
|
if (s.port_conflicts && s.port_conflicts.length) {
|
||||||
|
const banner = el('div', { class: 'port-conflict' },
|
||||||
|
el('strong', {}, '⚠ port collision'), ' — ');
|
||||||
|
const groups = s.port_conflicts.map((c) =>
|
||||||
|
`:${c.port} (${c.agents.join(' + ')})`).join('; ');
|
||||||
|
banner.append(groups + '. rename one of each and ↻ R3BU1LD.');
|
||||||
|
root.append(banner);
|
||||||
|
}
|
||||||
|
|
||||||
if (s.any_stale) {
|
if (s.any_stale) {
|
||||||
root.append(form(
|
root.append(form(
|
||||||
'/update-all', 'btn-rebuild', '↻ UPD4TE 4LL',
|
'/update-all', 'btn-rebuild', '↻ UPD4TE 4LL',
|
||||||
|
|
|
||||||
|
|
@ -193,6 +193,22 @@ a:hover {
|
||||||
/* Notification controls — sit between the banner and the
|
/* Notification controls — sit between the banner and the
|
||||||
containers section. Hidden by JS when notifications are
|
containers section. Hidden by JS when notifications are
|
||||||
unsupported, denied, or already in the right state. */
|
unsupported, denied, or already in the right state. */
|
||||||
|
/* Port-collision banner: appears above the containers list when
|
||||||
|
two sub-agents hash to the same web UI port. Critical — without
|
||||||
|
resolution, one of the harnesses will restart-loop on
|
||||||
|
AddrInUse. */
|
||||||
|
.port-conflict {
|
||||||
|
background: rgba(243, 139, 168, 0.08);
|
||||||
|
border: 1px solid var(--red);
|
||||||
|
color: var(--red);
|
||||||
|
padding: 0.5em 0.8em;
|
||||||
|
margin-bottom: 0.6em;
|
||||||
|
border-radius: 4px;
|
||||||
|
text-shadow: 0 0 6px rgba(243, 139, 168, 0.4);
|
||||||
|
animation: questions-pulse 2.4s ease-in-out infinite;
|
||||||
|
}
|
||||||
|
.port-conflict strong { color: var(--red); }
|
||||||
|
|
||||||
.notif-row {
|
.notif-row {
|
||||||
display: flex;
|
display: flex;
|
||||||
gap: 0.5em;
|
gap: 0.5em;
|
||||||
|
|
|
||||||
|
|
@ -146,6 +146,17 @@ struct StateSnapshot {
|
||||||
/// survive after a destroy-without-purge. The operator can re-spawn
|
/// survive after a destroy-without-purge. The operator can re-spawn
|
||||||
/// with the same name to resume, or PURG3 to wipe them.
|
/// with the same name to resume, or PURG3 to wipe them.
|
||||||
tombstones: Vec<TombstoneView>,
|
tombstones: Vec<TombstoneView>,
|
||||||
|
/// Sub-agents whose FNV-1a hashed web UI port collides with at
|
||||||
|
/// least one other agent. Operator resolves by renaming. The
|
||||||
|
/// dashboard renders a banner at the top listing each cluster.
|
||||||
|
port_conflicts: Vec<PortConflict>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct PortConflict {
|
||||||
|
port: u16,
|
||||||
|
/// All agent names sharing this port (sorted, ≥2 entries).
|
||||||
|
agents: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
|
|
@ -216,6 +227,7 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
|
||||||
let transients = build_transient_views(&raw_containers, &transient_snapshot);
|
let transients = build_transient_views(&raw_containers, &transient_snapshot);
|
||||||
let approvals = build_approval_views(pending_approvals).await;
|
let approvals = build_approval_views(pending_approvals).await;
|
||||||
let tombstones = build_tombstone_views(&state.coord, &containers, &transient_snapshot);
|
let tombstones = build_tombstone_views(&state.coord, &containers, &transient_snapshot);
|
||||||
|
let port_conflicts = build_port_conflicts(&containers);
|
||||||
|
|
||||||
let operator_inbox = state
|
let operator_inbox = state
|
||||||
.coord
|
.coord
|
||||||
|
|
@ -234,9 +246,31 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
|
||||||
operator_inbox,
|
operator_inbox,
|
||||||
questions,
|
questions,
|
||||||
tombstones,
|
tombstones,
|
||||||
|
port_conflicts,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Group live containers by their assigned web UI port; clusters with
|
||||||
|
/// more than one member are port-hash collisions the operator needs
|
||||||
|
/// to resolve by renaming. Manager (fixed at 8000) and sub-agents
|
||||||
|
/// (8100..8999) can't collide with each other — collisions are
|
||||||
|
/// strictly between sub-agents.
|
||||||
|
fn build_port_conflicts(containers: &[ContainerView]) -> Vec<PortConflict> {
|
||||||
|
let mut by_port: std::collections::BTreeMap<u16, Vec<String>> =
|
||||||
|
std::collections::BTreeMap::new();
|
||||||
|
for c in containers {
|
||||||
|
by_port.entry(c.port).or_default().push(c.name.clone());
|
||||||
|
}
|
||||||
|
by_port
|
||||||
|
.into_iter()
|
||||||
|
.filter(|(_, agents)| agents.len() > 1)
|
||||||
|
.map(|(port, mut agents)| {
|
||||||
|
agents.sort();
|
||||||
|
PortConflict { port, agents }
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Build `ContainerView`s for every live nixos-container. Returns the
|
/// Build `ContainerView`s for every live nixos-container. Returns the
|
||||||
/// list and whether any container is stale (drives the "↻ UPD4TE 4LL"
|
/// list and whether any container is stale (drives the "↻ UPD4TE 4LL"
|
||||||
/// banner).
|
/// banner).
|
||||||
|
|
|
||||||
|
|
@ -107,6 +107,32 @@ fn validate(name: &str) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// First name (≠ `self_name`) currently running whose hashed port
|
||||||
|
/// matches this agent's. The harness inside the colliding container
|
||||||
|
/// would otherwise loop on `AddrInUse` forever; we surface the
|
||||||
|
/// conflict here so spawn / rebuild fails loudly with an actionable
|
||||||
|
/// message instead.
|
||||||
|
async fn port_collision(self_name: &str) -> Option<String> {
|
||||||
|
let port = agent_web_port(self_name);
|
||||||
|
let raw = list().await.unwrap_or_default();
|
||||||
|
for c in raw {
|
||||||
|
let other = if c == MANAGER_NAME {
|
||||||
|
MANAGER_NAME.to_owned()
|
||||||
|
} else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
|
||||||
|
n.to_owned()
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
if other == self_name {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if agent_web_port(&other) == port && is_running(&other).await {
|
||||||
|
return Some(other);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub async fn spawn(
|
pub async fn spawn(
|
||||||
name: &str,
|
name: &str,
|
||||||
|
|
@ -119,6 +145,12 @@ pub async fn spawn(
|
||||||
dashboard_port: u16,
|
dashboard_port: u16,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
validate(name)?;
|
validate(name)?;
|
||||||
|
if let Some(other) = port_collision(name).await {
|
||||||
|
bail!(
|
||||||
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
||||||
|
agent_web_port(name)
|
||||||
|
);
|
||||||
|
}
|
||||||
setup_proposed(proposed_dir, name).await?;
|
setup_proposed(proposed_dir, name).await?;
|
||||||
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
|
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
|
||||||
ensure_claude_dir(claude_dir)?;
|
ensure_claude_dir(claude_dir)?;
|
||||||
|
|
@ -192,6 +224,12 @@ pub async fn rebuild(
|
||||||
dashboard_port: u16,
|
dashboard_port: u16,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
validate(name)?;
|
validate(name)?;
|
||||||
|
if let Some(other) = port_collision(name).await {
|
||||||
|
bail!(
|
||||||
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
||||||
|
agent_web_port(name)
|
||||||
|
);
|
||||||
|
}
|
||||||
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
|
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
|
||||||
ensure_claude_dir(claude_dir)?;
|
ensure_claude_dir(claude_dir)?;
|
||||||
ensure_state_dir(notes_dir)?;
|
ensure_state_dir(notes_dir)?;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue