web ui: retry binding on AddrInUse during restart races
operator hit 'Address already in use (os error 98)' on a harness restart — the new harness raced the old socket's release. add a bind_with_retry helper that backs off (250ms doubling, capped at 2s, 12 retries ≈ 20s total) on AddrInUse before giving up. applied to both the per-agent web UI and the hive-c0re dashboard. proper fix would be SO_REUSEADDR via socket2 but retry covers the TIME_WAIT case fine and keeps the dep count down. Other bind errors still fail immediately (port permission, fd exhaustion).
This commit is contained in:
parent
538e0446d7
commit
2146e47770
2 changed files with 53 additions and 6 deletions
|
|
@ -83,9 +83,7 @@ pub async fn serve(
|
|||
.route("/api/compact", post(post_compact))
|
||||
.with_state(state);
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
||||
let listener = tokio::net::TcpListener::bind(addr)
|
||||
.await
|
||||
.with_context(|| format!("bind web UI on port {port}"))?;
|
||||
let listener = bind_with_retry(addr, "web UI").await?;
|
||||
tracing::info!(%port, "web UI listening");
|
||||
axum::serve(listener, app).await?;
|
||||
Ok(())
|
||||
|
|
@ -95,6 +93,33 @@ pub async fn serve(
|
|||
// Static assets + state snapshot
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Bind a TCP listener, retrying on `AddrInUse` for up to ~20s.
|
||||
/// nspawn restarts can race the previous harness's socket release;
|
||||
/// without retry the new harness fails to bind and systemd just
|
||||
/// keeps restarting it. `SO_REUSEADDR` would be the proper fix but
|
||||
/// would require socket2; retry is good enough here.
|
||||
async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::TcpListener> {
|
||||
let mut delay_ms = 250u64;
|
||||
let mut attempts = 0u32;
|
||||
loop {
|
||||
match tokio::net::TcpListener::bind(addr).await {
|
||||
Ok(l) => return Ok(l),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
||||
tracing::warn!(
|
||||
%addr, attempt = attempts + 1,
|
||||
"{label}: AddrInUse, retrying in {delay_ms}ms"
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
|
||||
attempts += 1;
|
||||
delay_ms = (delay_ms * 2).min(2000);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e).with_context(|| format!("bind {label} on {addr}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve_index() -> impl IntoResponse {
|
||||
(
|
||||
[("content-type", "text/html; charset=utf-8")],
|
||||
|
|
|
|||
|
|
@ -57,9 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
|||
.route("/messages/stream", get(messages_stream))
|
||||
.with_state(AppState { coord });
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
||||
let listener = tokio::net::TcpListener::bind(addr)
|
||||
.await
|
||||
.with_context(|| format!("bind dashboard on port {port}"))?;
|
||||
let listener = bind_with_retry(addr).await?;
|
||||
tracing::info!(%port, "dashboard listening");
|
||||
axum::serve(listener, app).await?;
|
||||
Ok(())
|
||||
|
|
@ -73,6 +71,30 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
|||
// `/messages/stream` for broker traffic.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Retry-on-AddrInUse bind. Same shape as the per-agent variant —
|
||||
/// hive-c0re restarts also race the previous process's socket release.
|
||||
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
||||
let mut delay_ms = 250u64;
|
||||
let mut attempts = 0u32;
|
||||
loop {
|
||||
match tokio::net::TcpListener::bind(addr).await {
|
||||
Ok(l) => return Ok(l),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
||||
tracing::warn!(
|
||||
%addr, attempt = attempts + 1,
|
||||
"dashboard: AddrInUse, retrying in {delay_ms}ms"
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
|
||||
attempts += 1;
|
||||
delay_ms = (delay_ms * 2).min(2000);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e).with_context(|| format!("bind dashboard on {addr}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve_index() -> impl IntoResponse {
|
||||
Html(include_str!("../assets/index.html"))
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue