broker: lease-style delivery — ack_turn + requeue_inflight close the no-drop loop
This commit is contained in:
parent
69a3ca7469
commit
690cb5ab5b
8 changed files with 684 additions and 35 deletions
|
|
@ -160,6 +160,15 @@ async fn serve(
|
|||
) -> Result<()> {
|
||||
tracing::info!(socket = %socket.display(), "hive-ag3nt serve");
|
||||
let _ = state; // reserved for future state transitions (turn-loop -> needs-login)
|
||||
// Boot-time recovery: ask the broker to resurface anything we
|
||||
// popped in a previous harness session but never acked
|
||||
// (crashed mid-turn / OOM / container restart). The broker
|
||||
// resets `delivered_at = NULL` on those rows and remembers
|
||||
// their ids so the next `Recv` tags them `redelivered: true`;
|
||||
// we then prepend a "may already be handled" hint to the wake
|
||||
// prompt. Single shot before entering the serve loop; idempotent
|
||||
// when there's nothing inflight.
|
||||
requeue_inflight(socket).await;
|
||||
loop {
|
||||
let recv: Result<AgentResponse> =
|
||||
// Explicit long-poll: the new agent_server semantics treat
|
||||
|
|
@ -174,8 +183,13 @@ async fn serve(
|
|||
)
|
||||
.await;
|
||||
match recv {
|
||||
Ok(AgentResponse::Message { from, body }) => {
|
||||
tracing::info!(%from, %body, "inbox");
|
||||
Ok(AgentResponse::Message {
|
||||
from,
|
||||
body,
|
||||
id: _,
|
||||
redelivered,
|
||||
}) => {
|
||||
tracing::info!(%from, %body, %redelivered, "inbox");
|
||||
let unread = inbox_unread(socket).await;
|
||||
bus.emit(LiveEvent::TurnStart {
|
||||
from: from.clone(),
|
||||
|
|
@ -186,13 +200,24 @@ async fn serve(
|
|||
let started_at = now_unix();
|
||||
let started_instant = std::time::Instant::now();
|
||||
let model_at_start = bus.model();
|
||||
let prompt = format_wake_prompt(&from, &body, unread);
|
||||
let prompt = format_wake_prompt(&from, &body, unread, redelivered);
|
||||
let outcome = {
|
||||
let _guard = turn_lock.lock().await;
|
||||
turn::drive_turn(&prompt, files, &bus).await
|
||||
};
|
||||
turn::emit_turn_end(&bus, &outcome);
|
||||
bus.set_state(TurnState::Idle);
|
||||
// Ack only on a clean turn-end. `Failed` leaves every
|
||||
// message popped during the turn in the unacked list;
|
||||
// next harness boot's `RequeueInflight` will reset
|
||||
// `delivered_at = NULL` and tag them `redelivered`.
|
||||
// `PromptTooLong` is absorbed inside `drive_turn` via
|
||||
// compaction so it shouldn't reach here, but if it
|
||||
// does we also skip the ack (safer to redeliver than
|
||||
// to lose the message).
|
||||
if matches!(outcome, turn::TurnOutcome::Ok) {
|
||||
ack_turn(socket).await;
|
||||
}
|
||||
// Failures are unhandled by definition — PromptTooLong is
|
||||
// absorbed inside drive_turn via compaction, so anything
|
||||
// that reaches Failed here is a real crash. Notify the
|
||||
|
|
@ -261,7 +286,16 @@ async fn serve(
|
|||
/// (`prompts/agent.md` → `claude --system-prompt-file`); this is just the
|
||||
/// wake signal claude reacts to. `unread` is the count of *other*
|
||||
/// messages in the inbox right after this one was popped.
|
||||
fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
|
||||
/// `redelivered` flags messages that were popped in a prior harness
|
||||
/// session, never acked, and resurfaced after a restart — a banner
|
||||
/// at the top of the wake prompt warns that any side-effects of
|
||||
/// previous handling may already have happened.
|
||||
fn format_wake_prompt(from: &str, body: &str, unread: u64, redelivered: bool) -> String {
|
||||
let banner = if redelivered {
|
||||
hive_ag3nt::mcp::REDELIVERY_HINT
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let pending = if unread == 0 {
|
||||
String::new()
|
||||
} else {
|
||||
|
|
@ -269,7 +303,42 @@ fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
|
|||
"\n\n({unread} more message(s) pending in your inbox — drain via `mcp__hyperhive__recv` if relevant.)"
|
||||
)
|
||||
};
|
||||
format!("Incoming message from `{from}`:\n---\n{body}\n---{pending}")
|
||||
format!("{banner}Incoming message from `{from}`:\n---\n{body}\n---{pending}")
|
||||
}
|
||||
|
||||
/// Best-effort: tell the broker every message we popped during the
|
||||
/// turn is now fully handled (turn-end-OK). Swallows transport
|
||||
/// errors — the worst case is a redundant requeue on next boot.
|
||||
async fn ack_turn(socket: &Path) {
|
||||
match client::request::<_, AgentResponse>(socket, &AgentRequest::AckTurn).await {
|
||||
Ok(AgentResponse::Ok) => {}
|
||||
Ok(AgentResponse::Err { message }) => {
|
||||
tracing::warn!(%message, "ack_turn rejected by broker");
|
||||
}
|
||||
Ok(other) => {
|
||||
tracing::warn!(?other, "ack_turn unexpected response");
|
||||
}
|
||||
Err(e) => tracing::warn!(error = ?e, "ack_turn transport error"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Boot-time recovery: ask the broker to resurface anything we
|
||||
/// popped in a previous harness session but never acked. The broker
|
||||
/// resets `delivered_at = NULL` on those rows and remembers their
|
||||
/// ids so the next `Recv` carries `redelivered: true`. Swallows
|
||||
/// transport errors — they degrade to "no recovery this boot",
|
||||
/// which is no worse than the pre-feature behaviour (silent drop).
|
||||
async fn requeue_inflight(socket: &Path) {
|
||||
match client::request::<_, AgentResponse>(socket, &AgentRequest::RequeueInflight).await {
|
||||
Ok(AgentResponse::Ok) => {}
|
||||
Ok(AgentResponse::Err { message }) => {
|
||||
tracing::warn!(%message, "requeue_inflight rejected by broker");
|
||||
}
|
||||
Ok(other) => {
|
||||
tracing::warn!(?other, "requeue_inflight unexpected response");
|
||||
}
|
||||
Err(e) => tracing::warn!(error = ?e, "requeue_inflight transport error"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Best-effort: tell the manager that this agent's last turn crashed
|
||||
|
|
|
|||
|
|
@ -121,6 +121,10 @@ async fn serve(
|
|||
turn_lock: TurnLock,
|
||||
) -> Result<()> {
|
||||
tracing::info!(socket = %socket.display(), "hive-m1nd serve");
|
||||
// Same boot-time recovery as hive-ag3nt — see that loop for the
|
||||
// rationale. Manager-flavour socket so we requeue only manager
|
||||
// inflight rows.
|
||||
requeue_inflight(socket).await;
|
||||
loop {
|
||||
let recv: Result<ManagerResponse> =
|
||||
// Explicit long-poll: see hive-ag3nt's serve loop for the
|
||||
|
|
@ -134,7 +138,12 @@ async fn serve(
|
|||
)
|
||||
.await;
|
||||
match recv {
|
||||
Ok(ManagerResponse::Message { from, body }) => {
|
||||
Ok(ManagerResponse::Message {
|
||||
from,
|
||||
body,
|
||||
id: _,
|
||||
redelivered,
|
||||
}) => {
|
||||
if from == SYSTEM_SENDER {
|
||||
// Helper events (ApprovalResolved / Spawned / Rebuilt /
|
||||
// Killed / Destroyed) — these are FYI for the manager;
|
||||
|
|
@ -154,14 +163,14 @@ async fn serve(
|
|||
// prompt body so claude sees it. Sender stays "system"
|
||||
// so the wake prompt can label it as such.
|
||||
}
|
||||
tracing::info!(%from, %body, "manager inbox");
|
||||
tracing::info!(%from, %body, %redelivered, "manager inbox");
|
||||
let unread = inbox_unread(socket).await;
|
||||
bus.emit(LiveEvent::TurnStart {
|
||||
from: from.clone(),
|
||||
body: body.clone(),
|
||||
unread,
|
||||
});
|
||||
let prompt = format_wake_prompt(&from, &body, unread);
|
||||
let prompt = format_wake_prompt(&from, &body, unread, redelivered);
|
||||
bus.set_state(TurnState::Thinking);
|
||||
let started_at = now_unix();
|
||||
let started_instant = std::time::Instant::now();
|
||||
|
|
@ -172,6 +181,12 @@ async fn serve(
|
|||
};
|
||||
turn::emit_turn_end(&bus, &outcome);
|
||||
bus.set_state(TurnState::Idle);
|
||||
// Ack only on a clean turn-end; Failed leaves the
|
||||
// popped ids in-flight for the next boot's requeue.
|
||||
// Mirrors hive-ag3nt; see that loop for full rationale.
|
||||
if matches!(outcome, turn::TurnOutcome::Ok) {
|
||||
ack_turn(socket).await;
|
||||
}
|
||||
if let Some(s) = &stats {
|
||||
let ended_at = now_unix();
|
||||
let duration_ms =
|
||||
|
|
@ -228,8 +243,15 @@ async fn serve(
|
|||
/// Per-turn user prompt. The role/tools/etc. is in the system prompt
|
||||
/// (`prompts/manager.md` → `claude --system-prompt-file`); this is just
|
||||
/// the wake signal. `unread` is the inbox depth after this message was
|
||||
/// popped.
|
||||
fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
|
||||
/// popped. `redelivered` adds a "may already be handled" banner above
|
||||
/// the wake body when the broker resurfaced this row (see hive-ag3nt's
|
||||
/// `format_wake_prompt` for the full story).
|
||||
fn format_wake_prompt(from: &str, body: &str, unread: u64, redelivered: bool) -> String {
|
||||
let banner = if redelivered {
|
||||
hive_ag3nt::mcp::REDELIVERY_HINT
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let pending = if unread == 0 {
|
||||
String::new()
|
||||
} else {
|
||||
|
|
@ -237,7 +259,39 @@ fn format_wake_prompt(from: &str, body: &str, unread: u64) -> String {
|
|||
"\n\n({unread} more message(s) pending in your inbox — drain via `mcp__hyperhive__recv` if relevant.)"
|
||||
)
|
||||
};
|
||||
format!("Incoming message from `{from}`:\n---\n{body}\n---{pending}")
|
||||
format!("{banner}Incoming message from `{from}`:\n---\n{body}\n---{pending}")
|
||||
}
|
||||
|
||||
/// Best-effort: tell the broker every message popped during the turn
|
||||
/// is now handled. Mirror of `hive-ag3nt::ack_turn` on the manager
|
||||
/// surface.
|
||||
async fn ack_turn(socket: &Path) {
|
||||
match client::request::<_, ManagerResponse>(socket, &ManagerRequest::AckTurn).await {
|
||||
Ok(ManagerResponse::Ok) => {}
|
||||
Ok(ManagerResponse::Err { message }) => {
|
||||
tracing::warn!(%message, "ack_turn rejected by broker");
|
||||
}
|
||||
Ok(other) => {
|
||||
tracing::warn!(?other, "ack_turn unexpected response");
|
||||
}
|
||||
Err(e) => tracing::warn!(error = ?e, "ack_turn transport error"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Boot-time recovery: ask the broker to resurface any inflight (popped
|
||||
/// but not acked) messages so the next `Recv` re-delivers them with
|
||||
/// the redelivery banner. Mirror of `hive-ag3nt::requeue_inflight`.
|
||||
async fn requeue_inflight(socket: &Path) {
|
||||
match client::request::<_, ManagerResponse>(socket, &ManagerRequest::RequeueInflight).await {
|
||||
Ok(ManagerResponse::Ok) => {}
|
||||
Ok(ManagerResponse::Err { message }) => {
|
||||
tracing::warn!(%message, "requeue_inflight rejected by broker");
|
||||
}
|
||||
Ok(other) => {
|
||||
tracing::warn!(?other, "requeue_inflight unexpected response");
|
||||
}
|
||||
Err(e) => tracing::warn!(error = ?e, "requeue_inflight transport error"),
|
||||
}
|
||||
}
|
||||
|
||||
async fn inbox_unread(socket: &Path) -> u64 {
|
||||
|
|
|
|||
|
|
@ -34,7 +34,18 @@ use crate::client;
|
|||
pub enum SocketReply {
|
||||
Ok,
|
||||
Err(String),
|
||||
Message { from: String, body: String },
|
||||
/// `id` is the broker's row id — not surfaced to claude but
|
||||
/// useful for harness-side bookkeeping (not used in this module
|
||||
/// today; the bin loops drive ack via `AckTurn` instead of
|
||||
/// per-id). `redelivered` triggers the "may already be handled"
|
||||
/// hint in `format_recv` so claude sees it when draining the
|
||||
/// inbox in-turn.
|
||||
Message {
|
||||
from: String,
|
||||
body: String,
|
||||
id: i64,
|
||||
redelivered: bool,
|
||||
},
|
||||
Empty,
|
||||
Status(u64),
|
||||
QuestionQueued(i64),
|
||||
|
|
@ -54,7 +65,17 @@ impl From<hive_sh4re::AgentResponse> for SocketReply {
|
|||
match r {
|
||||
hive_sh4re::AgentResponse::Ok => Self::Ok,
|
||||
hive_sh4re::AgentResponse::Err { message } => Self::Err(message),
|
||||
hive_sh4re::AgentResponse::Message { from, body } => Self::Message { from, body },
|
||||
hive_sh4re::AgentResponse::Message {
|
||||
from,
|
||||
body,
|
||||
id,
|
||||
redelivered,
|
||||
} => Self::Message {
|
||||
from,
|
||||
body,
|
||||
id,
|
||||
redelivered,
|
||||
},
|
||||
hive_sh4re::AgentResponse::Empty => Self::Empty,
|
||||
hive_sh4re::AgentResponse::Status { unread } => Self::Status(unread),
|
||||
hive_sh4re::AgentResponse::Recent { rows } => Self::Recent(rows),
|
||||
|
|
@ -81,7 +102,17 @@ impl From<hive_sh4re::ManagerResponse> for SocketReply {
|
|||
match r {
|
||||
hive_sh4re::ManagerResponse::Ok => Self::Ok,
|
||||
hive_sh4re::ManagerResponse::Err { message } => Self::Err(message),
|
||||
hive_sh4re::ManagerResponse::Message { from, body } => Self::Message { from, body },
|
||||
hive_sh4re::ManagerResponse::Message {
|
||||
from,
|
||||
body,
|
||||
id,
|
||||
redelivered,
|
||||
} => Self::Message {
|
||||
from,
|
||||
body,
|
||||
id,
|
||||
redelivered,
|
||||
},
|
||||
hive_sh4re::ManagerResponse::Empty => Self::Empty,
|
||||
hive_sh4re::ManagerResponse::Status { unread } => Self::Status(unread),
|
||||
hive_sh4re::ManagerResponse::QuestionQueued { id } => Self::QuestionQueued(id),
|
||||
|
|
@ -117,10 +148,22 @@ pub fn format_ack(resp: Result<SocketReply, anyhow::Error>, tool: &str, ok_msg:
|
|||
}
|
||||
|
||||
/// Format helper for `recv` tools: `Message` → from + body block;
|
||||
/// `Empty` → marker; anything else surfaces as an error.
|
||||
/// `Empty` → marker; anything else surfaces as an error. When the
|
||||
/// broker tags the row as `redelivered` (popped before, never acked,
|
||||
/// resurfaced after a harness restart) a short banner is prepended
|
||||
/// so claude knows the side-effects of any previous handling may
|
||||
/// already have happened.
|
||||
pub fn format_recv(resp: Result<SocketReply, anyhow::Error>) -> String {
|
||||
match resp {
|
||||
Ok(SocketReply::Message { from, body }) => format!("from: {from}\n\n{body}"),
|
||||
Ok(SocketReply::Message {
|
||||
from,
|
||||
body,
|
||||
redelivered,
|
||||
..
|
||||
}) => {
|
||||
let banner = if redelivered { REDELIVERY_HINT } else { "" };
|
||||
format!("{banner}from: {from}\n\n{body}")
|
||||
}
|
||||
Ok(SocketReply::Empty) => "(empty)".into(),
|
||||
Ok(SocketReply::Err(m)) => format!("recv failed: {m}"),
|
||||
Ok(other) => format!("recv unexpected response: {other:?}"),
|
||||
|
|
@ -128,6 +171,14 @@ pub fn format_recv(resp: Result<SocketReply, anyhow::Error>) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// Banner prepended to messages that a prior harness session popped
|
||||
/// but never acked (turn crash / OOM / restart) and that
|
||||
/// `RequeueInflight` resurfaced on this session's boot. Ends with a
|
||||
/// newline so it can be prefixed directly; the same string is used by
|
||||
/// the wake prompt (bin loops) and the in-turn `recv` tool result.
|
||||
pub const REDELIVERY_HINT: &str =
|
||||
"[redelivered after harness restart — may already be handled]\n";
|
||||
|
||||
/// Format helper for `get_loose_ends`: renders a short bulleted list
|
||||
/// of pending approvals + questions + reminders. Empty list collapses
|
||||
/// to a clear marker so claude doesn't go hunting for a payload that
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue