diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index 07a4e9f..1cc418f 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -84,19 +84,43 @@ pub async fn serve(port: u16, coord: Arc) -> Result<()> { // `/dashboard/stream` for the unified live event channel. // --------------------------------------------------------------------------- -/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant — -/// hive-c0re restarts also race the previous process's socket release. +/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant in +/// `hive-ag3nt::web_ui::bind_with_retry`: hive-c0re restarts also +/// race the previous process's socket release, and the retry has no +/// attempt cap — capping was the proximate cause of issue #324 +/// (silent give-up on a long stale socket). Genuine port collisions +/// don't reach this layer (dashboard is bound to a fixed configured +/// port, no per-agent hashing), so any persistent `AddrInUse` always +/// reflects a recoverable stale socket. WARN for the first dozen +/// attempts; INFO after that to avoid spamming the journal during a +/// long hold; INFO on eventual success when we did have to retry. async fn bind_with_retry(addr: SocketAddr) -> Result { let mut delay_ms = 250u64; let mut attempts = 0u32; loop { match try_bind(addr) { - Ok(l) => return Ok(l), - Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { - tracing::warn!( - %addr, attempt = attempts + 1, - "dashboard: AddrInUse, retrying in {delay_ms}ms" - ); + Ok(l) => { + if attempts > 0 { + tracing::info!( + %addr, attempts, + "dashboard: bind succeeded after retry" + ); + } + return Ok(l); + } + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => { + let attempt = attempts + 1; + if attempt <= 12 { + tracing::warn!( + %addr, attempt, + "dashboard: AddrInUse, retrying in {delay_ms}ms" + ); + } else { + tracing::info!( + %addr, attempt, + "dashboard: AddrInUse still holding, retrying in {delay_ms}ms" + ); + } tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; attempts += 1; delay_ms = (delay_ms * 2).min(2000);