From 2c087f53edaa0150d6513c72dc9d5157be51f654 Mon Sep 17 00:00:00 2001 From: iris Date: Sat, 23 May 2026 02:24:49 +0200 Subject: [PATCH] dashboard: same bind retry policy as the agent half MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dashboard.rs had the same 12-attempt cap shape as the per-agent bind_with_retry. Apply the same fix — retry forever with the 2s-capped backoff, WARN early then INFO once we're clearly stuck on a stale socket, INFO on success when we did have to retry. Mirrors the agent change in this PR. --- hive-c0re/src/dashboard.rs | 40 ++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index 07a4e9f..1cc418f 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -84,19 +84,43 @@ pub async fn serve(port: u16, coord: Arc) -> Result<()> { // `/dashboard/stream` for the unified live event channel. // --------------------------------------------------------------------------- -/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant — -/// hive-c0re restarts also race the previous process's socket release. +/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant in +/// `hive-ag3nt::web_ui::bind_with_retry`: hive-c0re restarts also +/// race the previous process's socket release, and the retry has no +/// attempt cap — capping was the proximate cause of issue #324 +/// (silent give-up on a long stale socket). Genuine port collisions +/// don't reach this layer (dashboard is bound to a fixed configured +/// port, no per-agent hashing), so any persistent `AddrInUse` always +/// reflects a recoverable stale socket. WARN for the first dozen +/// attempts; INFO after that to avoid spamming the journal during a +/// long hold; INFO on eventual success when we did have to retry. async fn bind_with_retry(addr: SocketAddr) -> Result { let mut delay_ms = 250u64; let mut attempts = 0u32; loop { match try_bind(addr) { - Ok(l) => return Ok(l), - Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { - tracing::warn!( - %addr, attempt = attempts + 1, - "dashboard: AddrInUse, retrying in {delay_ms}ms" - ); + Ok(l) => { + if attempts > 0 { + tracing::info!( + %addr, attempts, + "dashboard: bind succeeded after retry" + ); + } + return Ok(l); + } + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => { + let attempt = attempts + 1; + if attempt <= 12 { + tracing::warn!( + %addr, attempt, + "dashboard: AddrInUse, retrying in {delay_ms}ms" + ); + } else { + tracing::info!( + %addr, attempt, + "dashboard: AddrInUse still holding, retrying in {delay_ms}ms" + ); + } tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; attempts += 1; delay_ms = (delay_ms * 2).min(2000);