From acaa0eb89508df4a2e703dcf878cadbc0fd5aea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 21:17:31 +0200 Subject: [PATCH] agent_web_port: back to pure hash, drop port-file dance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit operator's call: probing-forward + state-file machinery is more brittle than the bug it tried to fix. revert to the original deterministic FNV-1a hash mod 900. collisions are real but rare; operator resolves by renaming (different name → different hash) and rebuilding. no per-agent port file, no scan, no migration path, nothing to drift out of sync with the running container. existing port files on disk are silently ignored — operator rebuilds affected agents to regenerate flakes from the deterministic hash. --- hive-c0re/src/lifecycle.rs | 106 +++---------------------------------- 1 file changed, 7 insertions(+), 99 deletions(-) diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index 4b6b991..130ddfd 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -45,62 +45,18 @@ const WEB_PORT_RANGE: u16 = 900; const DEFAULT_MEMORY_MAX: &str = "2G"; const DEFAULT_CPU_QUOTA: &str = "50%"; -/// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`. -/// -/// The port is **sticky** once chosen: looked up from -/// `state_root/port` if present. On first call we have to decide -/// what to write there, and the answer depends on whether the agent -/// is legacy (has an applied flake — container already exists at -/// some port) or fresh (no applied dir yet — we're about to pick). -/// -/// In both cases we probe forward from `port_hash(name)` to skip -/// ports already claimed by another agent's *port file*. The -/// difference is what we count as "claimed": -/// -/// - **Legacy migration**: only port files count. 
We do NOT treat -/// other legacy agents' implicit hashes as taken — if two legacy -/// agents collide on hash, the first queried claims the hash port -/// and the others probe forward. (We don't know which originally -/// won the bind race; first-write-wins is good enough — the -/// loser was already crash-looping anyway.) -/// - **Fresh spawn**: port files AND implicit hashes for any legacy -/// agent that hasn't been migrated yet. Without that, a new -/// agent could pick the same port as a yet-to-be-migrated legacy. +/// Per-agent web UI port. Manager is fixed at `MANAGER_PORT`; every +/// sub-agent is `WEB_PORT_BASE + FNV-1a(name) % WEB_PORT_RANGE`, +/// pure and reproducible from just the name. Collisions are +/// possible (birthday paradox at ~30 agents); the operator resolves +/// them by renaming an agent (different hash → different port). +/// Stable across hosts, restarts, and dashboard renders — no +/// state-file dance. #[must_use] pub fn agent_web_port(name: &str) -> u16 { if name == MANAGER_NAME { return MANAGER_PORT; } - let state_root = crate::coordinator::Coordinator::agent_state_root(name); - let port_file = state_root.join("port"); - if let Ok(s) = std::fs::read_to_string(&port_file) - && let Ok(port) = s.trim().parse::<u16>() - && (WEB_PORT_BASE..WEB_PORT_BASE + WEB_PORT_RANGE).contains(&port) - { - return port; - } - let is_legacy = crate::coordinator::Coordinator::agent_applied_dir(name).exists(); - let taken = scan_taken_ports(name, /* include_implicit_hashes = */ !is_legacy); - let start = port_hash(name); - let mut chosen = start; - for _ in 0..WEB_PORT_RANGE { - if !taken.contains(&chosen) { - break; - } - chosen = next_port(chosen); - if chosen == start { - tracing::warn!(%name, "agent_web_port: range exhausted, returning hash"); - break; - } - } - let _ = std::fs::create_dir_all(&state_root); - if let Err(e) = std::fs::write(&port_file, format!("{chosen}\n")) { - tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port 
failed"); - } - chosen -} - -fn port_hash(name: &str) -> u16 { let mut hash: u32 = 2_166_136_261; for b in name.bytes() { hash ^= u32::from(b); @@ -110,54 +66,6 @@ fn port_hash(name: &str) -> u16 { WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0) } -fn next_port(port: u16) -> u16 { - let p = port + 1; - if p >= WEB_PORT_BASE + WEB_PORT_RANGE { - WEB_PORT_BASE - } else { - p - } -} - -/// Set of ports claimed by other agents. -/// -/// - Always includes ports persisted to other agents' port files. -/// - `include_implicit_hashes`: if true (fresh-spawn case), also -/// include `port_hash(other_name)` for every other agent that has -/// NOT yet been migrated to a port file. This protects new spawns -/// from racing with not-yet-migrated legacies. -/// -/// When false (legacy migration), we do NOT count implicit hashes -/// — otherwise two legacies that hash to the same port would each -/// see each other's hash as taken and both probe forward away -/// from the port one of them is actually running on, leaving a -/// dashboard URL that doesn't match the running container. -fn scan_taken_ports(name: &str, include_implicit_hashes: bool) -> std::collections::HashSet<u16> { - let mut out = std::collections::HashSet::new(); - let Ok(rd) = std::fs::read_dir("/var/lib/hyperhive/agents") else { - return out; - }; - for entry in rd.flatten() { - let Ok(file_name) = entry.file_name().into_string() else { - continue; - }; - if file_name == name || file_name == MANAGER_NAME { - continue; - } - let pf = entry.path().join("port"); - if let Ok(s) = std::fs::read_to_string(&pf) - && let Ok(port) = s.trim().parse::<u16>() - { - out.insert(port); - } else if include_implicit_hashes { - // Legacy not yet migrated. From the fresh-spawn POV its - // effective port is the bare hash. - out.insert(port_hash(&file_name)); - } - } - out -} - #[must_use] pub fn container_name(name: &str) -> String { if name == MANAGER_NAME {