From 2146e47770d349fc58a0efc1a3160b8f31a1dc99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 20:33:51 +0200 Subject: [PATCH] web ui: retry binding on AddrInUse during restart races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit operator hit 'Address already in use (os error 98)' on a harness restart — the new harness raced the old socket's release. add a bind_with_retry helper that backs off (250ms doubling, capped at 2s, 12 retries ≈ 20s total) on AddrInUse before giving up. applied to both the per-agent web UI and the hive-c0re dashboard. proper fix would be SO_REUSEADDR via socket2 but retry covers the TIME_WAIT case fine and keeps the dep count down. Other bind errors still fail immediately (port permission, fd exhaustion). --- hive-ag3nt/src/web_ui.rs | 31 ++++++++++++++++++++++++++++--- hive-c0re/src/dashboard.rs | 28 +++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/hive-ag3nt/src/web_ui.rs b/hive-ag3nt/src/web_ui.rs index 9228857..688f7a0 100644 --- a/hive-ag3nt/src/web_ui.rs +++ b/hive-ag3nt/src/web_ui.rs @@ -83,9 +83,7 @@ pub async fn serve( .route("/api/compact", post(post_compact)) .with_state(state); let addr = SocketAddr::from(([0, 0, 0, 0], port)); - let listener = tokio::net::TcpListener::bind(addr) - .await - .with_context(|| format!("bind web UI on port {port}"))?; + let listener = bind_with_retry(addr, "web UI").await?; tracing::info!(%port, "web UI listening"); axum::serve(listener, app).await?; Ok(()) @@ -95,6 +93,33 @@ pub async fn serve( // Static assets + state snapshot // --------------------------------------------------------------------------- +/// Bind a TCP listener, retrying on `AddrInUse` for up to ~20s. +/// nspawn restarts can race the previous harness's socket release; +/// without retry the new harness fails to bind and systemd just +/// keeps restarting it.
`SO_REUSEADDR` would be the proper fix but +/// would require socket2; retry is good enough here. +async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result { + let mut delay_ms = 250u64; + let mut attempts = 0u32; + loop { + match tokio::net::TcpListener::bind(addr).await { + Ok(l) => return Ok(l), + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { + tracing::warn!( + %addr, attempt = attempts + 1, + "{label}: AddrInUse, retrying in {delay_ms}ms" + ); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; + attempts += 1; + delay_ms = (delay_ms * 2).min(2000); + } + Err(e) => { + return Err(e).with_context(|| format!("bind {label} on {addr}")); + } + } + } +} + async fn serve_index() -> impl IntoResponse { ( [("content-type", "text/html; charset=utf-8")], diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index eed8f9b..1165e99 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -57,9 +57,7 @@ pub async fn serve(port: u16, coord: Arc) -> Result<()> { .route("/messages/stream", get(messages_stream)) .with_state(AppState { coord }); let addr = SocketAddr::from(([0, 0, 0, 0], port)); - let listener = tokio::net::TcpListener::bind(addr) - .await - .with_context(|| format!("bind dashboard on port {port}"))?; + let listener = bind_with_retry(addr).await?; tracing::info!(%port, "dashboard listening"); axum::serve(listener, app).await?; Ok(()) @@ -73,6 +71,30 @@ pub async fn serve(port: u16, coord: Arc) -> Result<()> { // `/messages/stream` for broker traffic. // --------------------------------------------------------------------------- +/// Retry-on-AddrInUse bind. Same shape as the per-agent variant — +/// hive-c0re restarts also race the previous process's socket release. 
+async fn bind_with_retry(addr: SocketAddr) -> Result { + let mut delay_ms = 250u64; + let mut attempts = 0u32; + loop { + match tokio::net::TcpListener::bind(addr).await { + Ok(l) => return Ok(l), + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { + tracing::warn!( + %addr, attempt = attempts + 1, + "dashboard: AddrInUse, retrying in {delay_ms}ms" + ); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; + attempts += 1; + delay_ms = (delay_ms * 2).min(2000); + } + Err(e) => { + return Err(e).with_context(|| format!("bind dashboard on {addr}")); + } + } + } +} + async fn serve_index() -> impl IntoResponse { Html(include_str!("../assets/index.html")) }