retry hive socket up to 5x over 60s, surface retry count to claude

The socket client now retries connect/IO failures with backoffs of
2, 4, 8, 16 and 30 seconds (60s total budget). Retrying is transparent
for non-tool callers via request(); tool handlers go through
request_retried(), which also returns the retry count.
annotate_retries() then appends a one-line note to the tool result so
claude knows the slow round-trip was a c0re flicker, not a content
failure — this avoids burning tokens on an LLM-level retry.
This commit is contained in:
müde 2026-05-16 15:28:18 +02:00
parent 4a8a668348
commit 7d33da3727
2 changed files with 180 additions and 67 deletions

View file

@ -107,6 +107,22 @@ where
result
}
/// Tag a tool result with a retry notice when the socket round-trip did
/// not land on the first attempt.
///
/// With `retries == 0` the input string is returned unchanged. Otherwise
/// a blank line and a parenthesised note stating the retry count are
/// appended, so the model reading the result can tell a slow-but-successful
/// call apart from a content failure and does not spend a turn repeating it.
pub fn annotate_retries(mut s: String, retries: u32) -> String {
    // Fast path: nothing to report when the first attempt went through.
    if retries == 0 {
        return s;
    }

    // Singular vs. plural wording for the count.
    let suffix = match retries {
        1 => "retry",
        _ => "retries",
    };

    let note = format!(
        "\n\n(note: hive socket connect needed {retries} {suffix} — c0re likely \
         restarted. Your request did succeed on the final attempt; no action needed.)"
    );
    s += &note;
    s
}
#[derive(Debug, serde::Deserialize, schemars::JsonSchema)]
pub struct SendArgs {
/// Logical agent name to deliver the message to (e.g. `"manager"`,
@ -138,6 +154,19 @@ impl AgentServer {
/// Create a server that stores `socket` as the path its requests are
/// sent to.
pub fn new(socket: PathBuf) -> Self {
Self { socket }
}
/// Send `req` over the hive socket via the retrying client and convert
/// the response into a `SocketReply`.
///
/// The second tuple element is the retry count the client reported for
/// a successful round-trip; tool handlers pass it to `annotate_retries`.
/// A transport error carries no count here, so it is paired with `0`.
async fn dispatch(
    &self,
    req: hive_sh4re::AgentRequest,
) -> (Result<SocketReply, anyhow::Error>, u32) {
    let outcome =
        client::request_retried::<_, hive_sh4re::AgentResponse>(&self.socket, &req).await;
    outcome.map_or_else(
        |err| (Err(err), 0),
        |(reply, retries)| (Ok(SocketReply::from(reply)), retries),
    )
}
}
#[tool_router]
@ -153,16 +182,13 @@ impl AgentServer {
return run_tool_envelope("send", log, async move { refusal }).await;
}
run_tool_envelope("send", log, async move {
let resp = client::request::<_, hive_sh4re::AgentResponse>(
&self.socket,
&hive_sh4re::AgentRequest::Send {
let (resp, retries) = self
.dispatch(hive_sh4re::AgentRequest::Send {
to: args.to,
body: args.body,
},
)
.await
.map(SocketReply::from);
format_ack(resp, "send", format!("sent to {to}"))
})
.await;
annotate_retries(format_ack(resp, "send", format!("sent to {to}")), retries)
})
.await
}
@ -182,18 +208,15 @@ impl AgentServer {
async fn ask_operator(&self, Parameters(args): Parameters<AskOperatorArgs>) -> String {
let log = format!("{args:?}");
run_tool_envelope("ask_operator", log, async move {
let resp = client::request::<_, hive_sh4re::AgentResponse>(
&self.socket,
&hive_sh4re::AgentRequest::AskOperator {
let (resp, retries) = self
.dispatch(hive_sh4re::AgentRequest::AskOperator {
question: args.question,
options: args.options,
multi: args.multi,
ttl_seconds: args.ttl_seconds,
},
)
.await
.map(SocketReply::from);
match resp {
})
.await;
let s = match resp {
Ok(SocketReply::QuestionQueued(id)) => format!(
"question queued (id={id}); operator's answer will arrive as a system \
`operator_answered` event in your inbox"
@ -201,7 +224,8 @@ impl AgentServer {
Ok(SocketReply::Err(m)) => format!("ask_operator failed: {m}"),
Ok(other) => format!("ask_operator unexpected response: {other:?}"),
Err(e) => format!("ask_operator transport error: {e:#}"),
}
};
annotate_retries(s, retries)
})
.await
}
@ -219,15 +243,12 @@ impl AgentServer {
async fn recv(&self, Parameters(args): Parameters<RecvArgs>) -> String {
let log = format!("{args:?}");
run_tool_envelope("recv", log, async move {
let resp = client::request::<_, hive_sh4re::AgentResponse>(
&self.socket,
&hive_sh4re::AgentRequest::Recv {
let (resp, retries) = self
.dispatch(hive_sh4re::AgentRequest::Recv {
wait_seconds: args.wait_seconds,
},
)
.await
.map(SocketReply::from);
format_recv(resp)
})
.await;
annotate_retries(format_recv(resp), retries)
})
.await
}
@ -341,15 +362,18 @@ impl ManagerServer {
Self { socket }
}
/// Helper: issue any `ManagerRequest`, convert the reply through
/// `SocketReply`. Manager tools that just need an `Ok` ack share this.
/// Helper: issue any `ManagerRequest` through the retry-aware
/// client, convert the reply through `SocketReply`, and return the
/// retry count alongside so the tool handler can `annotate_retries`
/// on the final string.
async fn dispatch(
&self,
req: hive_sh4re::ManagerRequest,
) -> Result<SocketReply, anyhow::Error> {
client::request::<_, hive_sh4re::ManagerResponse>(&self.socket, &req)
.await
.map(SocketReply::from)
) -> (Result<SocketReply, anyhow::Error>, u32) {
match client::request_retried::<_, hive_sh4re::ManagerResponse>(&self.socket, &req).await {
Ok((r, n)) => (Ok(SocketReply::from(r)), n),
Err(e) => (Err(e), 0),
}
}
}
@ -363,13 +387,13 @@ impl ManagerServer {
let log = format!("{args:?}");
let to = args.to.clone();
run_tool_envelope("send", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::Send {
to: args.to,
body: args.body,
})
.await;
format_ack(resp, "send", format!("sent to {to}"))
annotate_retries(format_ack(resp, "send", format!("sent to {to}")), retries)
})
.await
}
@ -384,12 +408,12 @@ impl ManagerServer {
async fn recv(&self, Parameters(args): Parameters<RecvArgs>) -> String {
let log = format!("{args:?}");
run_tool_envelope("recv", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::Recv {
wait_seconds: args.wait_seconds,
})
.await;
format_recv(resp)
annotate_retries(format_recv(resp), retries)
})
.await
}
@ -402,16 +426,19 @@ impl ManagerServer {
let log = format!("{args:?}");
let name = args.name.clone();
run_tool_envelope("request_spawn", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::RequestSpawn {
name: args.name,
description: args.description,
})
.await;
format_ack(
resp,
"request_spawn",
format!("spawn approval queued for {name}"),
annotate_retries(
format_ack(
resp,
"request_spawn",
format!("spawn approval queued for {name}"),
),
retries,
)
})
.await
@ -425,10 +452,10 @@ impl ManagerServer {
let log = format!("{args:?}");
let name = args.name.clone();
run_tool_envelope("kill", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::Kill { name: args.name })
.await;
format_ack(resp, "kill", format!("killed {name}"))
annotate_retries(format_ack(resp, "kill", format!("killed {name}")), retries)
})
.await
}
@ -441,10 +468,10 @@ impl ManagerServer {
let log = format!("{args:?}");
let name = args.name.clone();
run_tool_envelope("start", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::Start { name: args.name })
.await;
format_ack(resp, "start", format!("started {name}"))
annotate_retries(format_ack(resp, "start", format!("started {name}")), retries)
})
.await
}
@ -454,10 +481,13 @@ impl ManagerServer {
let log = format!("{args:?}");
let name = args.name.clone();
run_tool_envelope("restart", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::Restart { name: args.name })
.await;
format_ack(resp, "restart", format!("restarted {name}"))
annotate_retries(
format_ack(resp, "restart", format!("restarted {name}")),
retries,
)
})
.await
}
@ -471,10 +501,13 @@ impl ManagerServer {
let log = format!("{args:?}");
let name = args.name.clone();
run_tool_envelope("update", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::Update { name: args.name })
.await;
format_ack(resp, "update", format!("updated {name}"))
annotate_retries(
format_ack(resp, "update", format!("updated {name}")),
retries,
)
})
.await
}
@ -494,7 +527,7 @@ impl ManagerServer {
async fn ask_operator(&self, Parameters(args): Parameters<AskOperatorArgs>) -> String {
let log = format!("{args:?}");
run_tool_envelope("ask_operator", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::AskOperator {
question: args.question,
options: args.options,
@ -502,7 +535,7 @@ impl ManagerServer {
ttl_seconds: args.ttl_seconds,
})
.await;
match resp {
let s = match resp {
Ok(SocketReply::QuestionQueued(id)) => format!(
"question queued (id={id}); operator's answer will arrive as a system \
`operator_answered` event in your inbox"
@ -510,7 +543,8 @@ impl ManagerServer {
Ok(SocketReply::Err(m)) => format!("ask_operator failed: {m}"),
Ok(other) => format!("ask_operator unexpected response: {other:?}"),
Err(e) => format!("ask_operator transport error: {e:#}"),
}
};
annotate_retries(s, retries)
})
.await
}
@ -528,17 +562,20 @@ impl ManagerServer {
let agent = args.agent.clone();
let commit_ref = args.commit_ref.clone();
run_tool_envelope("request_apply_commit", log, async move {
let resp = self
let (resp, retries) = self
.dispatch(hive_sh4re::ManagerRequest::RequestApplyCommit {
agent: args.agent,
commit_ref: args.commit_ref,
description: args.description,
})
.await;
format_ack(
resp,
"request_apply_commit",
format!("apply approval queued for {agent} @ {commit_ref}"),
annotate_retries(
format_ack(
resp,
"request_apply_commit",
format!("apply approval queued for {agent} @ {commit_ref}"),
),
retries,
)
})
.await