diff --git a/hive-ag3nt/src/web_ui.rs b/hive-ag3nt/src/web_ui.rs index 9228857..688f7a0 100644 --- a/hive-ag3nt/src/web_ui.rs +++ b/hive-ag3nt/src/web_ui.rs @@ -83,9 +83,7 @@ pub async fn serve( .route("/api/compact", post(post_compact)) .with_state(state); let addr = SocketAddr::from(([0, 0, 0, 0], port)); - let listener = tokio::net::TcpListener::bind(addr) - .await - .with_context(|| format!("bind web UI on port {port}"))?; + let listener = bind_with_retry(addr, "web UI").await?; tracing::info!(%port, "web UI listening"); axum::serve(listener, app).await?; Ok(()) @@ -95,6 +93,33 @@ pub async fn serve( // Static assets + state snapshot // --------------------------------------------------------------------------- +/// Bind a TCP listener, retrying on `AddrInUse` for up to ~20s. +/// nspawn restarts can race the previous harness's socket release; +/// without retry the new harness fails to bind and systemd just +/// keeps restarting it. `SO_REUSEADDR` would be the proper fix but +/// would require socket2; retry is good enough here. NOTE(review): tokio documents that `TcpListener::bind` already sets `SO_REUSEADDR`, and `tokio::net::TcpSocket::set_reuseaddr` is available without an extra crate — confirm and reword this rationale. 
+async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result { /* Binds `addr`, retrying only on AddrInUse with a doubling delay; `label` just names the listener in the log and error text. NOTE(review): the return type shows as bare `Result` here; presumably a type parameter (the listener type) was lost in extraction, confirm against the repository. */ + let mut delay_ms = 250u64; + let mut attempts = 0u32; + loop { + match tokio::net::TcpListener::bind(addr).await { + Ok(l) => return Ok(l), + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { /* up to 12 retries; the sleeps are 250, 500, 1000, then 2000 ms nine times (19.75 s in total) */ + tracing::warn!( + %addr, attempt = attempts + 1, + "{label}: AddrInUse, retrying in {delay_ms}ms" + ); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; + attempts += 1; + delay_ms = (delay_ms * 2).min(2000); /* double the delay, capped at 2000 ms */ + } + Err(e) => { /* any other error kind, or AddrInUse once the 12 retries are used up: return it with context */ + return Err(e).with_context(|| format!("bind {label} on {addr}")); + } + } + } +} + async fn serve_index() -> impl IntoResponse { ( [("content-type", "text/html; charset=utf-8")], diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index eed8f9b..1165e99 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -57,9 +57,7 @@ pub async fn serve(port: u16, coord: Arc) -> Result<()> { .route("/messages/stream", get(messages_stream)) .with_state(AppState { coord }); let addr = SocketAddr::from(([0, 0, 0, 0], port)); - let listener = tokio::net::TcpListener::bind(addr) - .await - .with_context(|| format!("bind dashboard on port {port}"))?; + let listener = bind_with_retry(addr).await?; tracing::info!(%port, "dashboard listening"); axum::serve(listener, app).await?; Ok(()) @@ -73,6 +71,30 @@ pub async fn serve(port: u16, coord: Arc) -> Result<()> { // `/messages/stream` for broker traffic. // --------------------------------------------------------------------------- +/// Retry-on-AddrInUse bind. Same shape as the per-agent variant — +/// hive-c0re restarts also race the previous process's socket release. 
+async fn bind_with_retry(addr: SocketAddr) -> Result { /* Binds `addr` for the dashboard, retrying only on AddrInUse with a doubling delay. NOTE(review): the return type shows as bare `Result` here; presumably a type parameter (the listener type) was lost in extraction, confirm against the repository. */ + let mut delay_ms = 250u64; + let mut attempts = 0u32; + loop { + match tokio::net::TcpListener::bind(addr).await { + Ok(l) => return Ok(l), + Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { /* up to 12 retries; the sleeps are 250, 500, 1000, then 2000 ms nine times (19.75 s in total) */ + tracing::warn!( + %addr, attempt = attempts + 1, + "dashboard: AddrInUse, retrying in {delay_ms}ms" + ); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; + attempts += 1; + delay_ms = (delay_ms * 2).min(2000); /* double the delay, capped at 2000 ms */ + } + Err(e) => { /* any other error kind, or AddrInUse once the 12 retries are used up: return it with context */ + return Err(e).with_context(|| format!("bind dashboard on {addr}")); + } + } + } +} + async fn serve_index() -> impl IntoResponse { /* responds with the index.html that include_str! embeds into the binary at compile time */ Html(include_str!("../assets/index.html")) }