From 79a46f359ae847d226a30d3043cccdfa538058ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 20:41:18 +0200 Subject: [PATCH] agent_web_port: collision-aware sticky allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit operator hit 'coder' and 'test' colliding on the same hashed port — fnv-1a mod 900 has ~0.1% collision probability per pair and clearly that's not enough. agent_web_port goes stateful: - per-agent port persisted to /var/lib/hyperhive/agents//port - on first call, look up the file; if absent, hash, then probe forward through the allocated range skipping any port other agents already claim, then write the chosen value back - subsequent calls return the persisted port (sticky) other agents' ports come from their port file if present, else the fallback is the hashed value — that handles existing deployments without forcing a rebuild-all just to migrate. rebuilding the colliding agent re-runs agent_web_port, sees its peer's implicit hash port as taken, picks the next free slot, persists. range exhaustion (very unlikely — 900 slots) logs a warning and returns the hash; the bind-with-retry on the harness will surface the failure honestly rather than silently looping. --- hive-c0re/src/lifecycle.rs | 85 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index 985b82d..f675735 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -45,14 +45,51 @@ const WEB_PORT_RANGE: u16 = 900; const DEFAULT_MEMORY_MAX: &str = "2G"; const DEFAULT_CPU_QUOTA: &str = "50%"; -/// Returns the per-agent web UI port. Same hash on both sides — manager, -/// dashboard, and agent harness all agree. Manager is fixed at -/// `MANAGER_PORT`. +/// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`. +/// For sub-agents the port is sticky once chosen: looked up from +/// `agent_state_root(name)/port` if present, otherwise derived from +/// the FNV-1a hash of the name and *probed forward* through the +/// allocated range to skip any port another sub-agent has already +/// claimed (birthday-paradox collisions are real even at 2–3 +/// agents). The chosen port is written back so subsequent calls +/// resolve to the same value without re-probing. #[must_use] pub fn agent_web_port(name: &str) -> u16 { if name == MANAGER_NAME { return MANAGER_PORT; } + let state_root = crate::coordinator::Coordinator::agent_state_root(name); + let port_file = state_root.join("port"); + if let Ok(s) = std::fs::read_to_string(&port_file) + && let Ok(port) = s.trim().parse::() + && (WEB_PORT_BASE..WEB_PORT_BASE + WEB_PORT_RANGE).contains(&port) + { + return port; + } + let taken = scan_taken_ports(name); + let start = port_hash(name); + let mut port = start; + for _ in 0..WEB_PORT_RANGE { + if !taken.contains(&port) { + break; + } + port = next_port(port); + if port == start { + // Range fully exhausted (very unlikely — 900 slots) — + // give up and just use the hashed value; collisions are + // surfaced as bind errors by the harness retry loop. + tracing::warn!(%name, "agent_web_port: range exhausted, returning hash"); + return start; + } + } + let _ = std::fs::create_dir_all(&state_root); + if let Err(e) = std::fs::write(&port_file, format!("{port}\n")) { + tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed"); + } + port +} + +fn port_hash(name: &str) -> u16 { let mut hash: u32 = 2_166_136_261; for b in name.bytes() { hash ^= u32::from(b); @@ -62,6 +99,48 @@ pub fn agent_web_port(name: &str) -> u16 { WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0) } +fn next_port(port: u16) -> u16 { + let p = port + 1; + if p >= WEB_PORT_BASE + WEB_PORT_RANGE { + WEB_PORT_BASE + } else { + p + } +} + +/// Scan every other agent's effective web UI port: prefer the +/// persisted `port` file when present, fall back to the hashed +/// value for legacy agents that pre-date the port-file scheme. The +/// latter is important on existing deployments — without it, a new +/// agent's collision check wouldn't see incumbents that haven't +/// written their port file yet, and we'd re-emit the same +/// collision the operator just hit. +fn scan_taken_ports(name: &str) -> std::collections::HashSet { + let mut out = std::collections::HashSet::new(); + let Ok(rd) = std::fs::read_dir("/var/lib/hyperhive/agents") else { + return out; + }; + for entry in rd.flatten() { + let Ok(file_name) = entry.file_name().into_string() else { + continue; + }; + if file_name == name || file_name == MANAGER_NAME { + continue; + } + let pf = entry.path().join("port"); + if let Ok(s) = std::fs::read_to_string(&pf) + && let Ok(port) = s.trim().parse::() + { + out.insert(port); + } else { + // Legacy: no port file yet → its effective port is the + // bare hash. Treat as taken so we don't collide with it. + out.insert(port_hash(&file_name)); + } + } + out +} + #[must_use] pub fn container_name(name: &str) -> String { if name == MANAGER_NAME {