From d73175a23e85b9a983fa06c723b98205e1d11c86 Mon Sep 17 00:00:00 2001 From: iris Date: Sat, 23 May 2026 02:19:14 +0200 Subject: [PATCH] harness: keep retrying web-UI bind on AddrInUse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry was capped at 12 attempts (~20s of exponential backoff capped at 2s). Two back-to-back nspawn restarts in #324 left the previous socket holding the port longer than that budget; once the cap fired, the web-UI task returned an error and silently died for the rest of the process lifetime — the agent kept running fine otherwise (MCP, turn loop), but the operator's dashboard click hit nothing. Genuine port collisions are preflighted host-side (lifecycle::{spawn,rebuild}) and surfaced as a port-conflict banner, so at this layer a persistent AddrInUse always reflects a recoverable stale socket. Drop the cap, keep retrying forever with the same 2s-capped backoff. WARN for the first dozen attempts so a normal restart-race is visible; INFO after that to avoid spamming the journal during a long stale-socket hold. Logs a one-line INFO on success when we did have to retry, so post-mortems can find the attempt count. Closes #324. --- hive-ag3nt/src/web_ui.rs | 51 +++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/hive-ag3nt/src/web_ui.rs b/hive-ag3nt/src/web_ui.rs index fde54d3..3868c82 100644 --- a/hive-ag3nt/src/web_ui.rs +++ b/hive-ag3nt/src/web_ui.rs @@ -135,22 +135,51 @@ pub async fn serve( // --------------------------------------------------------------------------- /// Bind a TCP listener with `SO_REUSEADDR` set, retrying on -/// `AddrInUse` for up to ~20s. nspawn restarts can race the previous -/// harness's socket release; `SO_REUSEADDR` lets us reclaim a port -/// still in `TIME_WAIT` from a clean previous exit, and the retry -/// covers the case where the previous process is genuinely still -/// alive (systemd restart-delay overlap). +/// `AddrInUse` indefinitely with exponential backoff capped at 2s. +/// nspawn restarts can race the previous harness's socket release; +/// `SO_REUSEADDR` lets us reclaim a port still in `TIME_WAIT` from a +/// clean previous exit, and the retry covers the case where the +/// previous process is genuinely still alive (systemd restart-delay +/// overlap). +/// +/// The retry has no attempt cap: capping was the proximate cause of +/// issue #324 — two back-to-back restarts left the previous socket +/// holding the port for longer than the ~20s the old 12-attempt +/// budget allowed, and the harness silently lost its web UI for the +/// rest of the process lifetime. Genuine port collisions are +/// preflighted host-side (`lifecycle::{spawn,rebuild}`) and surfaced +/// on the dashboard as a banner, so at this layer a persistent +/// `AddrInUse` always reflects a recoverable stale socket — retrying +/// forever is the safe choice. The first attempts log at WARN; once +/// we cross attempt 12 the level drops to INFO so a long stale +/// socket doesn't flood the journal. async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result { let mut delay_ms = 250u64; let mut attempts = 0u32; loop { match try_bind(addr) { - Ok(l) => return Ok(l), - Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { - tracing::warn!( - %addr, attempt = attempts + 1, - "{label}: AddrInUse, retrying in {delay_ms}ms" - ); + Ok(l) => { + if attempts > 0 { + tracing::info!( + %addr, attempts, + "{label}: bind succeeded after retry" + ); + } + return Ok(l); + } + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => { + let attempt = attempts + 1; + if attempt <= 12 { + tracing::warn!( + %addr, attempt, + "{label}: AddrInUse, retrying in {delay_ms}ms" + ); + } else { + tracing::info!( + %addr, attempt, + "{label}: AddrInUse still holding, retrying in {delay_ms}ms" + ); + } tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; attempts += 1; delay_ms = (delay_ms * 2).min(2000);