manager events: Spawned/Rebuilt/Killed/Destroyed + start button
This commit is contained in:
parent
06ea0cf283
commit
37c6504462
9 changed files with 165 additions and 70 deletions
|
|
@ -132,8 +132,11 @@ async fn serve(socket: &Path, interval: Duration, bus: Bus) -> Result<()> {
|
|||
match recv {
|
||||
Ok(ManagerResponse::Message { from, body }) => {
|
||||
if from == SYSTEM_SENDER {
|
||||
// Helper events (ApprovalResolved, etc.) — log + surface
|
||||
// in live view but don't burn a claude turn on them.
|
||||
// Helper events (ApprovalResolved / Spawned / Rebuilt /
|
||||
// Killed / Destroyed) — these are FYI for the manager;
|
||||
// we surface them in the live view and forward them as
|
||||
// a normal claude turn so the manager can react (e.g.
|
||||
// greet a newly-spawned agent, retry a failed rebuild).
|
||||
let parsed = serde_json::from_str::<HelperEvent>(&body).ok();
|
||||
if let Some(event) = parsed {
|
||||
tracing::info!(?event, "helper event");
|
||||
|
|
@ -141,7 +144,9 @@ async fn serve(socket: &Path, interval: Duration, bus: Bus) -> Result<()> {
|
|||
tracing::info!(%from, %body, "system message");
|
||||
}
|
||||
bus.emit(LiveEvent::Note(format!("[system] {body}")));
|
||||
continue;
|
||||
// Fall through: drive a turn with the event in the wake
|
||||
// prompt body so claude sees it. Sender stays "system"
|
||||
// so the wake prompt can label it as such.
|
||||
}
|
||||
tracing::info!(%from, %body, "manager inbox");
|
||||
bus.emit(LiveEvent::TurnStart {
|
||||
|
|
@ -172,6 +177,15 @@ async fn serve(socket: &Path, interval: Duration, bus: Bus) -> Result<()> {
|
|||
/// prompt doesn't have access to, and points the manager at its own
|
||||
/// editable config repo for self-modification.
|
||||
fn format_wake_prompt(label: &str, from: &str, body: &str) -> String {
|
||||
let from_note = if from == SYSTEM_SENDER {
|
||||
"\n The sender `system` means this is a hyperhive helper event \
|
||||
(JSON body, `event` field discriminates): `approval_resolved`, \
|
||||
`spawned`, `rebuilt`, `killed`, `destroyed`. Use these to react to \
|
||||
lifecycle changes — e.g. greet a freshly-spawned agent, retry a \
|
||||
failed rebuild, or note the change to the operator.\n"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
format!(
|
||||
"You are the hyperhive manager `{label}` in a multi-agent system. You \
|
||||
coordinate sub-agents and relay between them and the operator.\n\
|
||||
|
|
@ -179,7 +193,7 @@ fn format_wake_prompt(label: &str, from: &str, body: &str) -> String {
|
|||
Incoming message from `{from}`:\n\
|
||||
---\n\
|
||||
{body}\n\
|
||||
---\n\
|
||||
---\n{from_note}
|
||||
\n\
|
||||
Tools (hyperhive surface):\n\
|
||||
- `mcp__hyperhive__recv()` — drain one more message from your inbox.\n\
|
||||
|
|
|
|||
|
|
@ -147,6 +147,11 @@
|
|||
form('/kill/' + c.name, 'btn-stop', '■ ST0P', 'stop ' + c.name + '?'),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
li.append(
|
||||
' ',
|
||||
form('/start/' + c.name, 'btn-start', '▶ ST4RT', 'start ' + c.name + '?'),
|
||||
);
|
||||
}
|
||||
li.append(
|
||||
' ',
|
||||
|
|
|
|||
|
|
@ -100,6 +100,7 @@ ul form.inline { display: inline-block; }
|
|||
.btn-rebuild { color: var(--amber); border-color: var(--amber); font-size: 0.75em; padding: 0.15em 0.5em; margin-left: 0.6em; }
|
||||
.btn-restart { color: var(--cyan); border-color: var(--cyan); font-size: 0.75em; padding: 0.15em 0.5em; margin-left: 0.6em; }
|
||||
.btn-stop { color: var(--pink); border-color: var(--pink); font-size: 0.75em; padding: 0.15em 0.5em; margin-left: 0.6em; }
|
||||
.btn-start { color: var(--green); border-color: var(--green); font-size: 0.75em; padding: 0.15em 0.5em; margin-left: 0.6em; }
|
||||
.btn-talk { color: var(--cyan); border-color: var(--cyan); }
|
||||
.btn-spawn { color: var(--amber); border-color: var(--amber); }
|
||||
.spawnform { display: flex; gap: 0.6em; align-items: stretch; margin: 0.5em 0; }
|
||||
|
|
|
|||
|
|
@ -6,9 +6,7 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use hive_sh4re::{
|
||||
ApprovalKind, ApprovalStatus, HelperEvent, MANAGER_AGENT, Message, SYSTEM_SENDER,
|
||||
};
|
||||
use hive_sh4re::{ApprovalKind, ApprovalStatus, HelperEvent, MANAGER_AGENT};
|
||||
|
||||
use crate::coordinator::{Coordinator, TransientKind};
|
||||
use crate::lifecycle::{self, MANAGER_NAME};
|
||||
|
|
@ -90,36 +88,39 @@ fn finish_approval(
|
|||
approval: &hive_sh4re::Approval,
|
||||
result: Result<()>,
|
||||
) -> Result<()> {
|
||||
match result {
|
||||
Ok(()) => {
|
||||
notify_manager(
|
||||
coord,
|
||||
&HelperEvent::ApprovalResolved {
|
||||
id: approval.id,
|
||||
agent: approval.agent.clone(),
|
||||
commit_ref: approval.commit_ref.clone(),
|
||||
status: ApprovalStatus::Approved,
|
||||
note: None,
|
||||
},
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
let (status, note, ok) = match &result {
|
||||
Ok(()) => (ApprovalStatus::Approved, None, true),
|
||||
Err(e) => {
|
||||
let note = format!("{e:#}");
|
||||
let _ = coord.approvals.mark_failed(approval.id, ¬e);
|
||||
notify_manager(
|
||||
coord,
|
||||
&HelperEvent::ApprovalResolved {
|
||||
(ApprovalStatus::Failed, Some(note), false)
|
||||
}
|
||||
};
|
||||
coord.notify_manager(&HelperEvent::ApprovalResolved {
|
||||
id: approval.id,
|
||||
agent: approval.agent.clone(),
|
||||
commit_ref: approval.commit_ref.clone(),
|
||||
status: ApprovalStatus::Failed,
|
||||
note: Some(note),
|
||||
},
|
||||
);
|
||||
Err(e)
|
||||
}
|
||||
status,
|
||||
note: note.clone(),
|
||||
});
|
||||
// For spawn/rebuild approvals, also surface the underlying action so
|
||||
// the manager knows whether the container actually came up. The
|
||||
// ApprovalResolved event already carries the same `ok` signal but
|
||||
// separating it lets the manager react to the lifecycle change
|
||||
// without having to special-case approvals.
|
||||
match approval.kind {
|
||||
ApprovalKind::Spawn => coord.notify_manager(&HelperEvent::Spawned {
|
||||
agent: approval.agent.clone(),
|
||||
ok,
|
||||
note,
|
||||
}),
|
||||
ApprovalKind::ApplyCommit => coord.notify_manager(&HelperEvent::Rebuilt {
|
||||
agent: approval.agent.clone(),
|
||||
ok,
|
||||
note,
|
||||
}),
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Tear down a sub-agent container. By default this is non-destructive to
|
||||
|
|
@ -144,6 +145,9 @@ pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> {
|
|||
let _ = coord
|
||||
.approvals
|
||||
.fail_pending_for_agent(name, "agent destroyed");
|
||||
coord.notify_manager(&HelperEvent::Destroyed {
|
||||
agent: name.to_owned(),
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -152,33 +156,13 @@ pub fn deny(coord: &Coordinator, id: i64) -> Result<()> {
|
|||
coord.approvals.mark_denied(id)?;
|
||||
tracing::info!(%id, "approval denied");
|
||||
if let Some(a) = approval {
|
||||
notify_manager(
|
||||
coord,
|
||||
&HelperEvent::ApprovalResolved {
|
||||
coord.notify_manager(&HelperEvent::ApprovalResolved {
|
||||
id: a.id,
|
||||
agent: a.agent,
|
||||
commit_ref: a.commit_ref,
|
||||
status: ApprovalStatus::Denied,
|
||||
note: None,
|
||||
},
|
||||
);
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn notify_manager(coord: &Coordinator, event: &HelperEvent) {
|
||||
let body = match serde_json::to_string(event) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
tracing::warn!(error = ?e, "failed to encode helper event");
|
||||
return;
|
||||
}
|
||||
};
|
||||
if let Err(e) = coord.broker.send(&Message {
|
||||
from: SYSTEM_SENDER.to_owned(),
|
||||
to: MANAGER_AGENT.to_owned(),
|
||||
body,
|
||||
}) {
|
||||
tracing::warn!(error = ?e, "failed to push helper event to manager");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
|
|||
.with_context(|| format!("ensure_runtime {name}"))?;
|
||||
let applied_dir = Coordinator::agent_applied_dir(name);
|
||||
let claude_dir = Coordinator::agent_claude_dir(name);
|
||||
lifecycle::rebuild(
|
||||
let result = lifecycle::rebuild(
|
||||
name,
|
||||
&coord.hyperhive_flake,
|
||||
&agent_dir,
|
||||
|
|
@ -66,10 +66,27 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
|
|||
&claude_dir,
|
||||
coord.dashboard_port,
|
||||
)
|
||||
.await?;
|
||||
std::fs::write(rev_marker_path(name), current_rev)
|
||||
.with_context(|| format!("write rev marker for {name}"))?;
|
||||
Ok(())
|
||||
.await;
|
||||
match &result {
|
||||
Ok(()) => {
|
||||
if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {
|
||||
tracing::warn!(%name, error = ?e, "write rev marker failed");
|
||||
}
|
||||
coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt {
|
||||
agent: name.to_owned(),
|
||||
ok: true,
|
||||
note: None,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
coord.notify_manager(&hive_sh4re::HelperEvent::Rebuilt {
|
||||
agent: name.to_owned(),
|
||||
ok: false,
|
||||
note: Some(format!("{e:#}")),
|
||||
});
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Auto-create the manager container on startup if it isn't already there.
|
||||
|
|
|
|||
|
|
@ -107,6 +107,27 @@ impl Coordinator {
|
|||
self.transient.lock().unwrap().clone()
|
||||
}
|
||||
|
||||
/// Push a `HelperEvent` into the manager's inbox. Encoded as JSON in
|
||||
/// `Message::body`; sender = `SYSTEM_SENDER`. The manager harness
|
||||
/// recognises the sender and parses the body. Best-effort: a serde or
|
||||
/// broker error is logged but does not propagate.
|
||||
pub fn notify_manager(&self, event: &hive_sh4re::HelperEvent) {
|
||||
let body = match serde_json::to_string(event) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
tracing::warn!(error = ?e, "failed to encode helper event");
|
||||
return;
|
||||
}
|
||||
};
|
||||
if let Err(e) = self.broker.send(&hive_sh4re::Message {
|
||||
from: hive_sh4re::SYSTEM_SENDER.to_owned(),
|
||||
to: hive_sh4re::MANAGER_AGENT.to_owned(),
|
||||
body,
|
||||
}) {
|
||||
tracing::warn!(error = ?e, "failed to push helper event to manager");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn agent_dir(name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("{AGENT_RUNTIME_ROOT}/{name}"))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
|||
.route("/destroy/{name}", post(post_destroy))
|
||||
.route("/kill/{name}", post(post_kill))
|
||||
.route("/restart/{name}", post(post_restart))
|
||||
.route("/start/{name}", post(post_start))
|
||||
.route("/rebuild/{name}", post(post_rebuild))
|
||||
.route("/update-all", post(post_update_all))
|
||||
.route("/request-spawn", post(post_request_spawn))
|
||||
|
|
@ -313,6 +314,11 @@ async fn post_kill(State(state): State<AppState>, AxumPath(name): AxumPath<Strin
|
|||
match lifecycle::kill(&logical).await {
|
||||
Ok(()) => {
|
||||
state.coord.unregister_agent(&logical);
|
||||
state
|
||||
.coord
|
||||
.notify_manager(&hive_sh4re::HelperEvent::Killed {
|
||||
agent: logical.clone(),
|
||||
});
|
||||
Redirect::to("/").into_response()
|
||||
}
|
||||
Err(e) => error_response(&format!("kill {logical} failed: {e:#}")),
|
||||
|
|
@ -327,6 +333,14 @@ async fn post_restart(State(_state): State<AppState>, AxumPath(name): AxumPath<S
|
|||
}
|
||||
}
|
||||
|
||||
async fn post_start(State(_state): State<AppState>, AxumPath(name): AxumPath<String>) -> Response {
|
||||
let logical = strip_container_prefix(&name);
|
||||
match lifecycle::start(&logical).await {
|
||||
Ok(()) => Redirect::to("/").into_response(),
|
||||
Err(e) => error_response(&format!("start {logical} failed: {e:#}")),
|
||||
}
|
||||
}
|
||||
|
||||
async fn post_update_all(State(state): State<AppState>) -> Response {
|
||||
let Some(current_rev) = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake)
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
|
|||
let proposed_dir = Coordinator::agent_proposed_dir(name);
|
||||
let applied_dir = Coordinator::agent_applied_dir(name);
|
||||
let claude_dir = Coordinator::agent_claude_dir(name);
|
||||
if let Err(e) = lifecycle::spawn(
|
||||
match lifecycle::spawn(
|
||||
name,
|
||||
&coord.hyperhive_flake,
|
||||
&agent_dir,
|
||||
|
|
@ -76,10 +76,24 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
|
|||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
coord.notify_manager(&hive_sh4re::HelperEvent::Spawned {
|
||||
agent: name.clone(),
|
||||
ok: true,
|
||||
note: None,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
// Roll back socket registration if container creation failed.
|
||||
coord.unregister_agent(name);
|
||||
coord.notify_manager(&hive_sh4re::HelperEvent::Spawned {
|
||||
agent: name.clone(),
|
||||
ok: false,
|
||||
note: Some(format!("{e:#}")),
|
||||
});
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
HostResponse::success()
|
||||
}
|
||||
HostRequest::RequestSpawn { name } => {
|
||||
|
|
@ -94,6 +108,9 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
|
|||
tracing::info!(%name, "kill");
|
||||
lifecycle::kill(name).await?;
|
||||
coord.unregister_agent(name);
|
||||
coord.notify_manager(&hive_sh4re::HelperEvent::Killed {
|
||||
agent: name.clone(),
|
||||
});
|
||||
HostResponse::success()
|
||||
}
|
||||
HostRequest::Destroy { name } => {
|
||||
|
|
|
|||
|
|
@ -196,8 +196,8 @@ pub const SYSTEM_SENDER: &str = "system";
|
|||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "event", rename_all = "snake_case")]
|
||||
pub enum HelperEvent {
|
||||
/// An approval was approved or denied; if approved, the rebuild has
|
||||
/// already run (status = Approved on success, Failed on error).
|
||||
/// An approval was approved/denied/failed; if approved, the underlying
|
||||
/// action (rebuild or spawn) has already run by the time this lands.
|
||||
ApprovalResolved {
|
||||
id: i64,
|
||||
agent: String,
|
||||
|
|
@ -206,6 +206,28 @@ pub enum HelperEvent {
|
|||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
note: Option<String>,
|
||||
},
|
||||
/// A new container was spawned (post-approval or via the admin CLI
|
||||
/// bypass path). `ok=false` means the spawn failed.
|
||||
Spawned {
|
||||
agent: String,
|
||||
ok: bool,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
note: Option<String>,
|
||||
},
|
||||
/// A container was rebuilt (auto-update on flake rev change, or a
|
||||
/// manual rebuild from CLI/dashboard).
|
||||
Rebuilt {
|
||||
agent: String,
|
||||
ok: bool,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
note: Option<String>,
|
||||
},
|
||||
/// A sub-agent's container was stopped (the systemd unit is down;
|
||||
/// persistent state is unchanged).
|
||||
Killed { agent: String },
|
||||
/// A sub-agent's container was torn down (container removed; state
|
||||
/// dirs preserved per `destroy` semantics).
|
||||
Destroyed { agent: String },
|
||||
}
|
||||
|
||||
/// Requests on the manager socket. Manager has the agent surface (send/recv)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue