broker: lease-style delivery — ack_turn + requeue_inflight close the no-drop loop

This commit is contained in:
damocles 2026-05-18 22:01:48 +02:00
parent 69a3ca7469
commit 690cb5ab5b
8 changed files with 684 additions and 35 deletions

View file

@ -160,6 +160,15 @@ async fn serve(
) -> Result<()> {
tracing::info!(socket = %socket.display(), "hive-ag3nt serve");
let _ = state; // reserved for future state transitions (turn-loop -> needs-login)
// Boot-time recovery: ask the broker to resurface anything we
// popped in a previous harness session but never acked
// (crashed mid-turn / OOM / container restart). The broker
// resets `delivered_at = NULL` on those rows and remembers
// their ids so the next `Recv` tags them `redelivered: true`;
// we then prepend a "may already be handled" hint to the wake
// prompt. Single shot before entering the serve loop; idempotent
// when there's nothing inflight.
requeue_inflight(socket).await;
loop {
let recv: Result<AgentResponse> =
// Explicit long-poll: the new agent_server semantics treat
@ -174,8 +183,13 @@ async fn serve(
)
.await;
match recv {
Ok(AgentResponse::Message { from, body }) => {
tracing::info!(%from, %body, "inbox");
Ok(AgentResponse::Message {
from,
body,
id: _,
redelivered,
}) => {
tracing::info!(%from, %body, %redelivered, "inbox");
let unread = inbox_unread(socket).await;
bus.emit(LiveEvent::TurnStart {
from: from.clone(),
@ -186,13 +200,24 @@ async fn serve(
let started_at = now_unix();
let started_instant = std::time::Instant::now();
let model_at_start = bus.model();
let prompt = format_wake_prompt(&from, &body, unread);
let prompt = format_wake_prompt(&from, &body, unread, redelivered);
let outcome = {
let _guard = turn_lock.lock().await;
turn::drive_turn(&prompt, files, &bus).await
};
turn::emit_turn_end(&bus, &outcome);
bus.set_state(TurnState::Idle);
// Ack only on a clean turn-end. `Failed` leaves every
// message popped during the turn in the unacked list;
// next harness boot's `RequeueInflight` will reset
// `delivered_at = NULL` and tag them `redelivered`.
// `PromptTooLong` is absorbed inside `drive_turn` via
// compaction so it shouldn't reach here, but if it
// does we also skip the ack (safer to redeliver than
// to lose the message).
if matches!(outcome, turn::TurnOutcome::Ok) {
ack_turn(socket).await;
}
// Failures are unhandled by definition — PromptTooLong is
// absorbed inside drive_turn via compaction, so anything
// that reaches Failed here is a real crash. Notify the
@ -261,7 +286,16 @@ async fn serve(
/// (`prompts/agent.md` → `claude --system-prompt-file`); this is just the
/// wake signal claude reacts to. `unread` is the count of *other*
/// messages in the inbox right after this one was popped.
fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
/// `redelivered` flags messages that were popped in a prior harness
/// session, never acked, and resurfaced after a restart — a banner
/// at the top of the wake prompt warns that any side-effects of
/// previous handling may already have happened.
fn format_wake_prompt(from: &str, body: &str, unread: u64, redelivered: bool) -> String {
let banner = if redelivered {
hive_ag3nt::mcp::REDELIVERY_HINT
} else {
""
};
let pending = if unread == 0 {
String::new()
} else {
@ -269,7 +303,42 @@ fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
"\n\n({unread} more message(s) pending in your inbox — drain via `mcp__hyperhive__recv` if relevant.)"
)
};
format!("Incoming message from `{from}`:\n---\n{body}\n---{pending}")
format!("{banner}Incoming message from `{from}`:\n---\n{body}\n---{pending}")
}
/// Best-effort: tell the broker every message we popped during the
/// turn is now fully handled (turn-end-OK). Swallows transport
/// errors — the worst case is a redundant requeue on next boot.
async fn ack_turn(socket: &Path) {
match client::request::<_, AgentResponse>(socket, &AgentRequest::AckTurn).await {
Ok(AgentResponse::Ok) => {}
Ok(AgentResponse::Err { message }) => {
tracing::warn!(%message, "ack_turn rejected by broker");
}
Ok(other) => {
tracing::warn!(?other, "ack_turn unexpected response");
}
Err(e) => tracing::warn!(error = ?e, "ack_turn transport error"),
}
}
/// Boot-time recovery: ask the broker to resurface anything we
/// popped in a previous harness session but never acked. The broker
/// resets `delivered_at = NULL` on those rows and remembers their
/// ids so the next `Recv` carries `redelivered: true`. Swallows
/// transport errors — they degrade to "no recovery this boot",
/// which is no worse than the pre-feature behaviour (silent drop).
async fn requeue_inflight(socket: &Path) {
match client::request::<_, AgentResponse>(socket, &AgentRequest::RequeueInflight).await {
Ok(AgentResponse::Ok) => {}
Ok(AgentResponse::Err { message }) => {
tracing::warn!(%message, "requeue_inflight rejected by broker");
}
Ok(other) => {
tracing::warn!(?other, "requeue_inflight unexpected response");
}
Err(e) => tracing::warn!(error = ?e, "requeue_inflight transport error"),
}
}
/// Best-effort: tell the manager that this agent's last turn crashed