web ui: retry binding on AddrInUse during restart races

operator hit 'Address already in use (os error 98)' on a harness
restart — the new harness raced the old socket's release. add a
bind_with_retry helper that backs off (250ms doubling, capped at
2s, 12 retries ≈ 20s total) on AddrInUse before giving up. applied
to both the per-agent web UI and the hive-c0re dashboard.

proper fix would be SO_REUSEADDR via socket2 but retry covers the
TIME_WAIT case fine and keeps the dep count down. Other bind errors
still fail immediately (port permission, fd exhaustion).
This commit is contained in:
müde 2026-05-15 20:33:51 +02:00
parent 538e0446d7
commit 2146e47770
2 changed files with 53 additions and 6 deletions

View file

@ -83,9 +83,7 @@ pub async fn serve(
.route("/api/compact", post(post_compact)) .route("/api/compact", post(post_compact))
.with_state(state); .with_state(state);
let addr = SocketAddr::from(([0, 0, 0, 0], port)); let addr = SocketAddr::from(([0, 0, 0, 0], port));
let listener = tokio::net::TcpListener::bind(addr) let listener = bind_with_retry(addr, "web UI").await?;
.await
.with_context(|| format!("bind web UI on port {port}"))?;
tracing::info!(%port, "web UI listening"); tracing::info!(%port, "web UI listening");
axum::serve(listener, app).await?; axum::serve(listener, app).await?;
Ok(()) Ok(())
@ -95,6 +93,33 @@ pub async fn serve(
// Static assets + state snapshot // Static assets + state snapshot
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// Binds a TCP listener on `addr`, waiting out transient `AddrInUse` failures.
///
/// A restarted harness (e.g. under nspawn) can come up before the previous
/// instance has released its socket; failing right away would just leave
/// systemd restarting the unit in a loop.
///
/// Up to 12 retries are made, sleeping 250ms, 500ms, 1s and then 2s between
/// attempts (roughly 20s of waiting in total). `label` names the listener in
/// the retry warnings and in the error context.
///
/// NOTE(review): tokio documents that `TcpListener::bind` already sets
/// `SO_REUSEADDR`, so the `AddrInUse` seen here is presumably the old process
/// still holding its listening socket rather than lingering TIME_WAIT
/// connections — confirm before swapping this retry for a socket option.
///
/// # Errors
///
/// Any error other than `AddrInUse` is returned immediately, and `AddrInUse`
/// itself is returned once the retries are used up; both carry the context
/// `bind {label} on {addr}`.
async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::TcpListener> {
    const MAX_RETRIES: u32 = 12;
    const INITIAL_DELAY_MS: u64 = 250;
    const MAX_DELAY_MS: u64 = 2000;

    let mut delay_ms = INITIAL_DELAY_MS;
    for attempt in 1..=MAX_RETRIES {
        match tokio::net::TcpListener::bind(addr).await {
            Ok(listener) => return Ok(listener),
            // Only a busy address is worth waiting for.
            Err(err) if err.kind() == std::io::ErrorKind::AddrInUse => {
                tracing::warn!(
                    %addr, attempt,
                    "{label}: AddrInUse, retrying in {delay_ms}ms"
                );
                tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
                delay_ms = (delay_ms * 2).min(MAX_DELAY_MS);
            }
            // Anything else (permissions, fd exhaustion, ...) fails right away.
            Err(err) => {
                return Err(err).with_context(|| format!("bind {label} on {addr}"));
            }
        }
    }

    // Retry budget spent: one last attempt, whose error (of any kind) is final.
    tokio::net::TcpListener::bind(addr)
        .await
        .with_context(|| format!("bind {label} on {addr}"))
}
async fn serve_index() -> impl IntoResponse { async fn serve_index() -> impl IntoResponse {
( (
[("content-type", "text/html; charset=utf-8")], [("content-type", "text/html; charset=utf-8")],

View file

@ -57,9 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/messages/stream", get(messages_stream)) .route("/messages/stream", get(messages_stream))
.with_state(AppState { coord }); .with_state(AppState { coord });
let addr = SocketAddr::from(([0, 0, 0, 0], port)); let addr = SocketAddr::from(([0, 0, 0, 0], port));
let listener = tokio::net::TcpListener::bind(addr) let listener = bind_with_retry(addr).await?;
.await
.with_context(|| format!("bind dashboard on port {port}"))?;
tracing::info!(%port, "dashboard listening"); tracing::info!(%port, "dashboard listening");
axum::serve(listener, app).await?; axum::serve(listener, app).await?;
Ok(()) Ok(())
@ -73,6 +71,30 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
// `/messages/stream` for broker traffic. // `/messages/stream` for broker traffic.
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// Binds the dashboard's TCP listener on `addr`, retrying while the address
/// is still in use.
///
/// A restarted hive-c0re can race the previous process's socket release, so
/// `AddrInUse` is retried up to 12 times with a backoff of 250ms, 500ms, 1s
/// and then 2s per wait (roughly 20s in total). This mirrors the per-agent
/// web UI's retry helper.
///
/// # Errors
///
/// Any error other than `AddrInUse` is returned immediately, and `AddrInUse`
/// itself is returned once the retries are used up; both carry the context
/// `bind dashboard on {addr}`.
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
    // Backoff schedule: start at 250ms, double each time, cap at 2s, 12 waits.
    let schedule =
        std::iter::successors(Some(250u64), |ms| Some((ms * 2).min(2000))).take(12);

    for (attempt, delay_ms) in (1u32..).zip(schedule) {
        let err = match tokio::net::TcpListener::bind(addr).await {
            Ok(listener) => return Ok(listener),
            Err(err) => err,
        };

        // Only a busy address is worth waiting for; everything else is fatal.
        if err.kind() != std::io::ErrorKind::AddrInUse {
            return Err(err).with_context(|| format!("bind dashboard on {addr}"));
        }

        tracing::warn!(
            %addr, attempt,
            "dashboard: AddrInUse, retrying in {delay_ms}ms"
        );
        tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
    }

    // Schedule exhausted: one last attempt, whose error (of any kind) is final.
    tokio::net::TcpListener::bind(addr)
        .await
        .with_context(|| format!("bind dashboard on {addr}"))
}
async fn serve_index() -> impl IntoResponse { async fn serve_index() -> impl IntoResponse {
Html(include_str!("../assets/index.html")) Html(include_str!("../assets/index.html"))
} }