surface agent-vs-agent port collisions (manager:8000 can't collide)
manager is fixed at 8000, sub-agents are 8100-8999, so collisions
are strictly between two sub-agents hashing to the same value.
the colliding container's harness restart-loops on AddrInUse —
which the user just hit on :8945. previously the only sign was a
buried journalctl warn line.
now surfaced two ways:
- lifecycle::spawn / rebuild preflight: walks the live container
list, computes each agent's hashed port, refuses with
"port N is already taken by '<other>' — rename one of them and retry"
if any running sub-agent shares the new agent's port. so the operator
sees an actionable error in the dashboard's transient pill /
approve-result instead of waiting for the harness to die.
- /api/state grows a port_conflicts: [{port, agents: [...]}]
array; dashboard renders a pulsing red banner above the
containers list listing each cluster. matches the questions
panel pulse so it's hard to miss.
This commit is contained in:
parent
2029840671
commit
6a2ffd521b
4 changed files with 100 additions and 0 deletions
|
|
@ -107,6 +107,32 @@ fn validate(name: &str) -> Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// First name (≠ `self_name`) currently running whose hashed port
|
||||
/// matches this agent's. The harness inside the colliding container
|
||||
/// would otherwise loop on `AddrInUse` forever; we surface the
|
||||
/// conflict here so spawn / rebuild fails loudly with an actionable
|
||||
/// message instead.
|
||||
async fn port_collision(self_name: &str) -> Option<String> {
|
||||
let port = agent_web_port(self_name);
|
||||
let raw = list().await.unwrap_or_default();
|
||||
for c in raw {
|
||||
let other = if c == MANAGER_NAME {
|
||||
MANAGER_NAME.to_owned()
|
||||
} else if let Some(n) = c.strip_prefix(AGENT_PREFIX) {
|
||||
n.to_owned()
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
if other == self_name {
|
||||
continue;
|
||||
}
|
||||
if agent_web_port(&other) == port && is_running(&other).await {
|
||||
return Some(other);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn spawn(
|
||||
name: &str,
|
||||
|
|
@ -119,6 +145,12 @@ pub async fn spawn(
|
|||
dashboard_port: u16,
|
||||
) -> Result<()> {
|
||||
validate(name)?;
|
||||
if let Some(other) = port_collision(name).await {
|
||||
bail!(
|
||||
"port {} is already taken by '{other}' — rename one of them and retry",
|
||||
agent_web_port(name)
|
||||
);
|
||||
}
|
||||
setup_proposed(proposed_dir, name).await?;
|
||||
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
|
||||
ensure_claude_dir(claude_dir)?;
|
||||
|
|
@ -192,6 +224,12 @@ pub async fn rebuild(
|
|||
dashboard_port: u16,
|
||||
) -> Result<()> {
|
||||
validate(name)?;
|
||||
if let Some(other) = port_collision(name).await {
|
||||
bail!(
|
||||
"port {} is already taken by '{other}' — rename one of them and retry",
|
||||
agent_web_port(name)
|
||||
);
|
||||
}
|
||||
setup_applied(applied_dir, name, hyperhive_flake, dashboard_port).await?;
|
||||
ensure_claude_dir(claude_dir)?;
|
||||
ensure_state_dir(notes_dir)?;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue