dashboard: same bind retry policy as the agent half

dashboard.rs had the same 12-attempt retry cap as the per-agent
bind_with_retry. Apply the same fix: retry forever with the 2s-capped
backoff, log at WARN for the first 12 attempts and at INFO after that
(once we're clearly stuck on a stale socket), and log at INFO on
success if we had to retry. Mirrors the agent change in this PR.
This commit is contained in:
iris 2026-05-23 02:24:49 +02:00
parent d73175a23e
commit 2c087f53ed

View file

@ -84,19 +84,43 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
// `/dashboard/stream` for the unified live event channel. // `/dashboard/stream` for the unified live event channel.
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant — /// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant in
/// hive-c0re restarts also race the previous process's socket release. /// `hive-ag3nt::web_ui::bind_with_retry`: hive-c0re restarts also
/// race the previous process's socket release, and the retry has no
/// attempt cap — capping was the proximate cause of issue #324
/// (silent give-up on a long stale socket). Genuine port collisions
/// don't reach this layer (dashboard is bound to a fixed configured
/// port, no per-agent hashing), so any persistent `AddrInUse` always
/// reflects a recoverable stale socket. WARN for the first dozen
/// attempts; INFO after that to avoid spamming the journal during a
/// long hold; INFO on eventual success when we did have to retry.
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> { async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
let mut delay_ms = 250u64; let mut delay_ms = 250u64;
let mut attempts = 0u32; let mut attempts = 0u32;
loop { loop {
match try_bind(addr) { match try_bind(addr) {
Ok(l) => return Ok(l), Ok(l) => {
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => { if attempts > 0 {
tracing::info!(
%addr, attempts,
"dashboard: bind succeeded after retry"
);
}
return Ok(l);
}
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => {
let attempt = attempts + 1;
if attempt <= 12 {
tracing::warn!( tracing::warn!(
%addr, attempt = attempts + 1, %addr, attempt,
"dashboard: AddrInUse, retrying in {delay_ms}ms" "dashboard: AddrInUse, retrying in {delay_ms}ms"
); );
} else {
tracing::info!(
%addr, attempt,
"dashboard: AddrInUse still holding, retrying in {delay_ms}ms"
);
}
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
attempts += 1; attempts += 1;
delay_ms = (delay_ms * 2).min(2000); delay_ms = (delay_ms * 2).min(2000);