From c35f566d1511c2824591969ebe8db6922e0daf14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 21:13:17 +0200 Subject: [PATCH] agent_web_port: actually resolve legacy collisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit previous attempt was wrong: the legacy branch returned port_hash unconditionally, so two legacies hashing to the same port both wrote that port and the collision persisted (test still trying to bind coder's port). new rule: always probe forward from port_hash, with scan_taken_ports parameterised by include_implicit_hashes: - legacy migration (applied dir exists, no port file): pass false. scan only counts other agents' port files. first-queried legacy claims its hash; subsequent colliders see the first's port file and probe forward. we don't know which legacy originally won the bind race, so first-write-wins; the loser was already crash-looping anyway and gets a fresh port to rebuild to. - fresh spawn (no applied dir): pass true. counts port files AND implicit hashes for not-yet-migrated legacies, so a new spawn doesn't race with an unmigrated peer. migration note for affected users: agents whose port file says something other than their hashed port may have been corrupted by the previous fix. Hit ↻ R3BU1LD on the offender to regenerate the flake (uses the current port file) and the container will bind the right port on restart. --- hive-c0re/src/lifecycle.rs | 95 ++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index 2ddf30c..4b6b991 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -46,18 +46,26 @@ const DEFAULT_MEMORY_MAX: &str = "2G"; const DEFAULT_CPU_QUOTA: &str = "50%"; /// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`. -/// For sub-agents the port is sticky once chosen: /// -/// - **Port file present** (`state_root/port`): use it. End of story. -/// - **Port file absent, applied flake present**: this is a legacy -/// agent whose container is already bound to the bare -/// `port_hash(name)`. Don't probe; just migrate by writing that -/// value to the port file. The container stays where it is and -/// subsequent renders agree with it. -/// - **Port file absent, no applied flake**: this is a fresh spawn. -/// Probe forward from `port_hash(name)` to skip any port another -/// sub-agent has already claimed (via port file or legacy hash). -/// Write the chosen port back. +/// The port is **sticky** once chosen: looked up from +/// `state_root/port` if present. On first call we have to decide +/// what to write there, and the answer depends on whether the agent +/// is legacy (has an applied flake — container already exists at +/// some port) or fresh (no applied dir yet — we're about to pick). +/// +/// In both cases we probe forward from `port_hash(name)` to skip +/// ports already claimed by another agent's *port file*. The +/// difference is what we count as "claimed": +/// +/// - **Legacy migration**: only port files count. We do NOT treat +/// other legacy agents' implicit hashes as taken — if two legacy +/// agents collide on hash, the first queried claims the hash port +/// and the others probe forward. (We don't know which originally +/// won the bind race; first-write-wins is good enough — the +/// loser was already crash-looping anyway.) +/// - **Fresh spawn**: port files AND implicit hashes for any legacy +/// agent that hasn't been migrated yet. Without that, a new +/// agent could pick the same port as a yet-to-be-migrated legacy. #[must_use] pub fn agent_web_port(name: &str) -> u16 { if name == MANAGER_NAME { @@ -71,31 +79,20 @@ pub fn agent_web_port(name: &str) -> u16 { { return port; } - let applied_exists = crate::coordinator::Coordinator::agent_applied_dir(name).exists(); - let chosen = if applied_exists { - // Legacy agent — container already running on the hashed - // port. Don't move it; just persist the value so future - // calls bypass this path. - port_hash(name) - } else { - let taken = scan_taken_ports(name); - let start = port_hash(name); - let mut port = start; - for _ in 0..WEB_PORT_RANGE { - if !taken.contains(&port) { - break; - } - port = next_port(port); - if port == start { - // Range fully exhausted (very unlikely — 900 slots) — - // give up and use the hashed value; collisions are - // surfaced as bind errors by the harness retry loop. - tracing::warn!(%name, "agent_web_port: range exhausted, returning hash"); - break; - } + let is_legacy = crate::coordinator::Coordinator::agent_applied_dir(name).exists(); + let taken = scan_taken_ports(name, /* include_implicit_hashes = */ !is_legacy); + let start = port_hash(name); + let mut chosen = start; + for _ in 0..WEB_PORT_RANGE { + if !taken.contains(&chosen) { + break; } - port - }; + chosen = next_port(chosen); + if chosen == start { + tracing::warn!(%name, "agent_web_port: range exhausted, returning hash"); + break; + } + } let _ = std::fs::create_dir_all(&state_root); if let Err(e) = std::fs::write(&port_file, format!("{chosen}\n")) { tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed"); @@ -122,14 +119,20 @@ fn next_port(port: u16) -> u16 { } } -/// Scan every other agent's effective web UI port: prefer the -/// persisted `port` file when present, fall back to the hashed -/// value for legacy agents that pre-date the port-file scheme. The -/// latter is important on existing deployments — without it, a new -/// agent's collision check wouldn't see incumbents that haven't -/// written their port file yet, and we'd re-emit the same -/// collision the operator just hit. -fn scan_taken_ports(name: &str) -> std::collections::HashSet { +/// Set of ports claimed by other agents. +/// +/// - Always includes ports persisted to other agents' port files. +/// - `include_implicit_hashes`: if true (fresh-spawn case), also +/// include `port_hash(other_name)` for every other agent that has +/// NOT yet been migrated to a port file. This protects new spawns +/// from racing with not-yet-migrated legacies. +/// +/// When false (legacy migration), we do NOT count implicit hashes +/// — otherwise two legacies that hash to the same port would each +/// see each other's hash as taken and both probe forward away +/// from the port one of them is actually running on, leaving a +/// dashboard URL that doesn't match the running container. +fn scan_taken_ports(name: &str, include_implicit_hashes: bool) -> std::collections::HashSet { let mut out = std::collections::HashSet::new(); let Ok(rd) = std::fs::read_dir("/var/lib/hyperhive/agents") else { return out; @@ -146,9 +149,9 @@ fn scan_taken_ports(name: &str) -> std::collections::HashSet { && let Ok(port) = s.trim().parse::() { out.insert(port); - } else { - // Legacy: no port file yet → its effective port is the - // bare hash. Treat as taken so we don't collide with it. + } else if include_implicit_hashes { + // Legacy not yet migrated. From the fresh-spawn POV its + // effective port is the bare hash. out.insert(port_hash(&file_name)); } }