From d73175a23e85b9a983fa06c723b98205e1d11c86 Mon Sep 17 00:00:00 2001
From: iris <iris@hyperhive>
Date: Sat, 23 May 2026 02:19:14 +0200
Subject: [PATCH] harness: keep retrying web-UI bind on AddrInUse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The retry was limited to 12 attempts (~20s in total, with exponential
backoff capped at 2s per sleep). Two back-to-back nspawn restarts in
#324 left the previous socket holding the port longer than that
budget; once the limit was hit, the web-UI task returned an error and
silently died for the rest of the process lifetime — the agent kept
running fine otherwise (MCP, turn loop), but the operator's dashboard
click hit nothing.

Genuine port collisions are preflighted host-side
(lifecycle::{spawn,rebuild}) and surfaced as a port-conflict banner,
so at this layer a persistent AddrInUse always reflects a
recoverable stale socket. Drop the cap, keep retrying forever with
the same 2s-capped backoff. WARN for the first dozen attempts so a
normal restart-race is visible; INFO after that to avoid spamming
the journal during a long stale-socket hold. Also log a one-line
INFO on success if we had to retry, so post-mortems can find the
attempt count.

Closes #324.
---
 hive-ag3nt/src/web_ui.rs | 51 +++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/hive-ag3nt/src/web_ui.rs b/hive-ag3nt/src/web_ui.rs
index fde54d3..3868c82 100644
--- a/hive-ag3nt/src/web_ui.rs
+++ b/hive-ag3nt/src/web_ui.rs
@@ -135,22 +135,51 @@ pub async fn serve(
 // ---------------------------------------------------------------------------
 
 /// Bind a TCP listener with `SO_REUSEADDR` set, retrying on
-/// `AddrInUse` for up to ~20s. nspawn restarts can race the previous
-/// harness's socket release; `SO_REUSEADDR` lets us reclaim a port
-/// still in `TIME_WAIT` from a clean previous exit, and the retry
-/// covers the case where the previous process is genuinely still
-/// alive (systemd restart-delay overlap).
+/// `AddrInUse` indefinitely with exponential backoff capped at 2s.
+/// nspawn restarts can race the previous harness's socket release;
+/// `SO_REUSEADDR` lets us reclaim a port still in `TIME_WAIT` from a
+/// clean previous exit, and the retry covers the case where the
+/// previous process is genuinely still alive (systemd restart-delay
+/// overlap).
+///
+/// The retry has no attempt cap: capping was the proximate cause of
+/// issue #324 — two back-to-back restarts left the previous socket
+/// holding the port for longer than the ~20s the old 12-attempt
+/// budget allowed, and the harness silently lost its web UI for the
+/// rest of the process lifetime. Genuine port collisions are
+/// preflighted host-side (`lifecycle::{spawn,rebuild}`) and surfaced
+/// on the dashboard as a banner, so at this layer a persistent
+/// `AddrInUse` always reflects a recoverable stale socket — retrying
+/// forever is the safe choice. The first attempts log at WARN; once
+/// we cross attempt 12 the level drops to INFO so a long stale
+/// socket doesn't flood the journal.
 async fn bind_with_retry(addr: SocketAddr, label: &str) -> Result<tokio::net::TcpListener> {
     let mut delay_ms = 250u64;
     let mut attempts = 0u32;
     loop {
         match try_bind(addr) {
-            Ok(l) => return Ok(l),
-            Err(e) if e.kind() == std::io::ErrorKind::AddrInUse && attempts < 12 => {
-                tracing::warn!(
-                    %addr, attempt = attempts + 1,
-                    "{label}: AddrInUse, retrying in {delay_ms}ms"
-                );
+            Ok(l) => {
+                if attempts > 0 {
+                    tracing::info!(
+                        %addr, attempts,
+                        "{label}: bind succeeded after retry"
+                    );
+                }
+                return Ok(l);
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::AddrInUse => {
+                let attempt = attempts + 1;
+                if attempt <= 12 {
+                    tracing::warn!(
+                        %addr, attempt,
+                        "{label}: AddrInUse, retrying in {delay_ms}ms"
+                    );
+                } else {
+                    tracing::info!(
+                        %addr, attempt,
+                        "{label}: AddrInUse still holding, retrying in {delay_ms}ms"
+                    );
+                }
                 tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
                 attempts += 1;
                 delay_ms = (delay_ms * 2).min(2000);