broker: lease-style delivery — ack_turn + requeue_inflight close the no-drop loop
This commit is contained in:
parent
69a3ca7469
commit
690cb5ab5b
8 changed files with 684 additions and 35 deletions
|
|
@ -160,6 +160,15 @@ async fn serve(
|
|||
) -> Result<()> {
|
||||
tracing::info!(socket = %socket.display(), "hive-ag3nt serve");
|
||||
let _ = state; // reserved for future state transitions (turn-loop -> needs-login)
|
||||
// Boot-time recovery: ask the broker to resurface anything we
|
||||
// popped in a previous harness session but never acked
|
||||
// (crashed mid-turn / OOM / container restart). The broker
|
||||
// resets `delivered_at = NULL` on those rows and remembers
|
||||
// their ids so the next `Recv` tags them `redelivered: true`;
|
||||
// we then prepend a "may already be handled" hint to the wake
|
||||
// prompt. Single shot before entering the serve loop; idempotent
|
||||
// when there's nothing inflight.
|
||||
requeue_inflight(socket).await;
|
||||
loop {
|
||||
let recv: Result<AgentResponse> =
|
||||
// Explicit long-poll: the new agent_server semantics treat
|
||||
|
|
@ -174,8 +183,13 @@ async fn serve(
|
|||
)
|
||||
.await;
|
||||
match recv {
|
||||
Ok(AgentResponse::Message { from, body }) => {
|
||||
tracing::info!(%from, %body, "inbox");
|
||||
Ok(AgentResponse::Message {
|
||||
from,
|
||||
body,
|
||||
id: _,
|
||||
redelivered,
|
||||
}) => {
|
||||
tracing::info!(%from, %body, %redelivered, "inbox");
|
||||
let unread = inbox_unread(socket).await;
|
||||
bus.emit(LiveEvent::TurnStart {
|
||||
from: from.clone(),
|
||||
|
|
@ -186,13 +200,24 @@ async fn serve(
|
|||
let started_at = now_unix();
|
||||
let started_instant = std::time::Instant::now();
|
||||
let model_at_start = bus.model();
|
||||
let prompt = format_wake_prompt(&from, &body, unread);
|
||||
let prompt = format_wake_prompt(&from, &body, unread, redelivered);
|
||||
let outcome = {
|
||||
let _guard = turn_lock.lock().await;
|
||||
turn::drive_turn(&prompt, files, &bus).await
|
||||
};
|
||||
turn::emit_turn_end(&bus, &outcome);
|
||||
bus.set_state(TurnState::Idle);
|
||||
// Ack only on a clean turn-end. `Failed` leaves every
|
||||
// message popped during the turn in the unacked list;
|
||||
// next harness boot's `RequeueInflight` will reset
|
||||
// `delivered_at = NULL` and tag them `redelivered`.
|
||||
// `PromptTooLong` is absorbed inside `drive_turn` via
|
||||
// compaction so it shouldn't reach here, but if it
|
||||
// does we also skip the ack (safer to redeliver than
|
||||
// to lose the message).
|
||||
if matches!(outcome, turn::TurnOutcome::Ok) {
|
||||
ack_turn(socket).await;
|
||||
}
|
||||
// Failures are unhandled by definition — PromptTooLong is
|
||||
// absorbed inside drive_turn via compaction, so anything
|
||||
// that reaches Failed here is a real crash. Notify the
|
||||
|
|
@ -261,7 +286,16 @@ async fn serve(
|
|||
/// (`prompts/agent.md` → `claude --system-prompt-file`); this is just the
|
||||
/// wake signal claude reacts to. `unread` is the count of *other*
|
||||
/// messages in the inbox right after this one was popped.
|
||||
fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
|
||||
/// `redelivered` flags messages that were popped in a prior harness
|
||||
/// session, never acked, and resurfaced after a restart — a banner
|
||||
/// at the top of the wake prompt warns that any side-effects of
|
||||
/// previous handling may already have happened.
|
||||
fn format_wake_prompt(from: &str, body: &str, unread: u64, redelivered: bool) -> String {
|
||||
let banner = if redelivered {
|
||||
hive_ag3nt::mcp::REDELIVERY_HINT
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let pending = if unread == 0 {
|
||||
String::new()
|
||||
} else {
|
||||
|
|
@ -269,7 +303,42 @@ fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
|
|||
"\n\n({unread} more message(s) pending in your inbox — drain via `mcp__hyperhive__recv` if relevant.)"
|
||||
)
|
||||
};
|
||||
format!("Incoming message from `{from}`:\n---\n{body}\n---{pending}")
|
||||
format!("{banner}Incoming message from `{from}`:\n---\n{body}\n---{pending}")
|
||||
}
|
||||
|
||||
/// Best-effort: tell the broker every message we popped during the
|
||||
/// turn is now fully handled (turn-end-OK). Swallows transport
|
||||
/// errors — the worst case is a redundant requeue on next boot.
|
||||
async fn ack_turn(socket: &Path) {
|
||||
match client::request::<_, AgentResponse>(socket, &AgentRequest::AckTurn).await {
|
||||
Ok(AgentResponse::Ok) => {}
|
||||
Ok(AgentResponse::Err { message }) => {
|
||||
tracing::warn!(%message, "ack_turn rejected by broker");
|
||||
}
|
||||
Ok(other) => {
|
||||
tracing::warn!(?other, "ack_turn unexpected response");
|
||||
}
|
||||
Err(e) => tracing::warn!(error = ?e, "ack_turn transport error"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Boot-time recovery: ask the broker to resurface anything we
|
||||
/// popped in a previous harness session but never acked. The broker
|
||||
/// resets `delivered_at = NULL` on those rows and remembers their
|
||||
/// ids so the next `Recv` carries `redelivered: true`. Swallows
|
||||
/// transport errors — they degrade to "no recovery this boot",
|
||||
/// which is no worse than the pre-feature behaviour (silent drop).
|
||||
async fn requeue_inflight(socket: &Path) {
|
||||
match client::request::<_, AgentResponse>(socket, &AgentRequest::RequeueInflight).await {
|
||||
Ok(AgentResponse::Ok) => {}
|
||||
Ok(AgentResponse::Err { message }) => {
|
||||
tracing::warn!(%message, "requeue_inflight rejected by broker");
|
||||
}
|
||||
Ok(other) => {
|
||||
tracing::warn!(?other, "requeue_inflight unexpected response");
|
||||
}
|
||||
Err(e) => tracing::warn!(error = ?e, "requeue_inflight transport error"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Best-effort: tell the manager that this agent's last turn crashed
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue