dashboard: same bind retry policy as the agent half
dashboard.rs capped its bind retries at 12 attempts, the same way the per-agent bind_with_retry did. Apply the same fix: retry forever with exponential backoff capped at 2s, log at WARN for the first 12 attempts and at INFO after that (once we're clearly stuck on a stale socket), and log at INFO on success if we did have to retry. Mirrors the agent change in this PR.
This commit is contained in:
parent
d73175a23e
commit
2c087f53ed
1 changed files with 32 additions and 8 deletions
|
|
@ -84,19 +84,43 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
||||||
// `/dashboard/stream` for the unified live event channel.
|
// `/dashboard/stream` for the unified live event channel.
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant —
|
/// `SO_REUSEADDR` bind with retry. Mirrors the per-agent variant in
|
||||||
/// hive-c0re restarts also race the previous process's socket release.
|
/// `hive-ag3nt::web_ui::bind_with_retry`: hive-c0re restarts also
|
||||||
|
/// race the previous process's socket release, and the retry has no
|
||||||
|
/// attempt cap — capping was the proximate cause of issue #324
|
||||||
|
/// (silent give-up on a long stale socket). Genuine port collisions
|
||||||
|
/// don't reach this layer (dashboard is bound to a fixed configured
|
||||||
|
/// port, no per-agent hashing), so any persistent `AddrInUse` always
|
||||||
|
/// reflects a recoverable stale socket. WARN for the first dozen
|
||||||
|
/// attempts; INFO after that to avoid spamming the journal during a
|
||||||
|
/// long hold; INFO on eventual success when we did have to retry.
|
||||||
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
async fn bind_with_retry(addr: SocketAddr) -> Result<tokio::net::TcpListener> {
|
||||||
let mut delay_ms = 250u64;
|
let mut delay_ms = 250u64;
|
||||||
let mut attempts = 0u32;
|
let mut attempts = 0u32;
|
||||||
loop {
|
loop {
|
||||||
match try_bind(addr) {
|
match try_bind(addr) {
|
||||||
Ok(l) => return Ok(l),
|
Ok(l) => {
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
|
if attempts > 0 {
|
||||||
|
tracing::info!(
|
||||||
|
%addr, attempts,
|
||||||
|
"dashboard: bind succeeded after retry"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return Ok(l);
|
||||||
|
}
|
||||||
|
Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => {
|
||||||
|
let attempt = attempts + 1;
|
||||||
|
if attempt <= 12 {
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
%addr, attempt = attempts + 1,
|
%addr, attempt,
|
||||||
"dashboard: AddrInUse, retrying in {delay_ms}ms"
|
"dashboard: AddrInUse, retrying in {delay_ms}ms"
|
||||||
);
|
);
|
||||||
|
} else {
|
||||||
|
tracing::info!(
|
||||||
|
%addr, attempt,
|
||||||
|
"dashboard: AddrInUse still holding, retrying in {delay_ms}ms"
|
||||||
|
);
|
||||||
|
}
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
|
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
|
||||||
attempts += 1;
|
attempts += 1;
|
||||||
delay_ms = (delay_ms * 2).min(2000);
|
delay_ms = (delay_ms * 2).min(2000);
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue