dashboard: same bind retry policy as the agent half
dashboard.rs capped its bind retry at 12 attempts, the same shape as the per-agent bind_with_retry. Apply the same fix: retry indefinitely with exponential backoff capped at 2s, log at WARN for the first 12 attempts and at INFO after that (once we're clearly stuck on a stale socket), and log at INFO on success if at least one retry was needed. Mirrors the agent change in this PR.
This commit is contained in:
parent
d73175a23e
commit
2c087f53ed
1 changed files with 32 additions and 8 deletions
|
|
@ -84,19 +84,43 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
|||
// `/dashboard/stream` for the unified live event channel.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant —
|
||||
/// hive-c0re restarts also race the previous process's socket release.
|
||||
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant in
|
||||
/// `hive-ag3nt::web_ui::bind_with_retry`: hive-c0re restarts also
|
||||
/// race the previous process's socket release, and the retry has no
|
||||
/// attempt cap — capping was the proximate cause of issue #324
|
||||
/// (silent give-up on a long stale socket). Genuine port collisions
|
||||
/// don't reach this layer (dashboard is bound to a fixed configured
|
||||
/// port, no per-agent hashing), so any persistent `AddrInUse` always
|
||||
/// reflects a recoverable stale socket. WARN for the first dozen
|
||||
/// attempts; INFO after that to avoid spamming the journal during a
|
||||
/// long hold; INFO on eventual success when we did have to retry.
|
||||
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
||||
let mut delay_ms = 250u64;
|
||||
let mut attempts = 0u32;
|
||||
loop {
|
||||
match try_bind(addr) {
|
||||
Ok(l) => return Ok(l),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
||||
tracing::warn!(
|
||||
%addr, attempt = attempts + 1,
|
||||
"dashboard: AddrInUse, retrying in {delay_ms}ms"
|
||||
);
|
||||
Ok(l) => {
|
||||
if attempts > 0 {
|
||||
tracing::info!(
|
||||
%addr, attempts,
|
||||
"dashboard: bind succeeded after retry"
|
||||
);
|
||||
}
|
||||
return Ok(l);
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => {
|
||||
let attempt = attempts + 1;
|
||||
if attempt <= 12 {
|
||||
tracing::warn!(
|
||||
%addr, attempt,
|
||||
"dashboard: AddrInUse, retrying in {delay_ms}ms"
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
%addr, attempt,
|
||||
"dashboard: AddrInUse still holding, retrying in {delay_ms}ms"
|
||||
);
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
|
||||
attempts += 1;
|
||||
delay_ms = (delay_ms * 2).min(2000);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue