diff --git a/hive-ag3nt/src/web_ui.rs b/hive-ag3nt/src/web_ui.rs index fde54d3..3868c82 100644 --- a/hive-ag3nt/src/web_ui.rs +++ b/hive-ag3nt/src/web_ui.rs @@ -135,22 +135,51 @@ pub async fn serve( // --------------------------------------------------------------------------- /// Bind a TCP listener with `SO_REUSEADDR` set, retrying on -/// `AddrInUse` for up to ~20s. nspawn restarts can race the previous -/// harness's socket release; `SO_REUSEADDR` lets us reclaim a port -/// still in `TIME_WAIT` from a clean previous exit, and the retry -/// covers the case where the previous process is genuinely still -/// alive (systemd restart-delay overlap). +/// `AddrInUse` indefinitely with exponential backoff capped at 2s. +/// nspawn restarts can race the previous harness's socket release; +/// `SO_REUSEADDR` lets us reclaim a port still in `TIME_WAIT` from a +/// clean previous exit, and the retry covers the case where the +/// previous process is genuinely still alive (systemd restart-delay +/// overlap). +/// +/// The retry has no attempt cap: capping was the proximate cause of +/// issue #324 — two back-to-back restarts left the previous socket +/// holding the port for longer than the ~20s the old 12-attempt +/// budget allowed, and the harness silently lost its web UI for the +/// rest of the process lifetime. Genuine port collisions are +/// preflighted host-side (`lifecycle::{spawn,rebuild}`) and surfaced +/// on the dashboard as a banner, so at this layer a persistent +/// `AddrInUse` always reflects a recoverable stale socket — retrying +/// forever is the safe choice. The first attempts log at WARN; once +/// we cross attempt 12 the level drops to INFO so a long stale +/// socket doesn't flood the journal. async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result { let mut delay_ms = 250u64; let mut attempts = 0u32; loop { match try_bind(addr) { - Ok(l) => return Ok(l), - Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { - tracing::warn!( - %addr, attempt = attempts + 1, - "{label}: AddrInUse, retrying in {delay_ms}ms" - ); + Ok(l) => { + if attempts > 0 { + tracing::info!( + %addr, attempts, + "{label}: bind succeeded after retry" + ); + } + return Ok(l); + } + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => { + let attempt = attempts + 1; + if attempt <= 12 { + tracing::warn!( + %addr, attempt, + "{label}: AddrInUse, retrying in {delay_ms}ms" + ); + } else { + tracing::info!( + %addr, attempt, + "{label}: AddrInUse still holding, retrying in {delay_ms}ms" + ); + } tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; attempts += 1; delay_ms = (delay_ms * 2).min(2000);