rebuild_queue: switch dashboard / auto-update / manager call sites to enqueue

This commit is contained in:
damocles 2026-05-23 11:53:14 +02:00 committed by Mara
parent 37f6bc4b6b
commit 11db5c2a8f
3 changed files with 48 additions and 139 deletions

View file

@ -206,10 +206,9 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
} }
}; };
let current_rev = let _current_rev = current_flake_rev(&coord.hyperhive_flake).unwrap_or_default();
current_flake_rev(&coord.hyperhive_flake).unwrap_or_default();
tracing::info!(agents = containers.len(), "auto-update: rebuilding all on startup"); tracing::info!(agents = containers.len(), "auto-update: queueing all on startup");
for container in containers { for container in containers {
let logical = if container == MANAGER_NAME { let logical = if container == MANAGER_NAME {
Some(MANAGER_NAME.to_owned()) Some(MANAGER_NAME.to_owned())
@ -217,9 +216,14 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
container.strip_prefix(AGENT_PREFIX).map(str::to_owned) container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
}; };
let Some(name) = logical else { continue }; let Some(name) = logical else { continue };
if let Err(e) = rebuild_agent(&coord, &name, &current_rev).await { coord.rebuild_queue.enqueue(
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed"); crate::rebuild_queue::QueueKind::Rebuild,
} name,
crate::rebuild_queue::QueueSource::AutoUpdate,
"startup sweep".to_owned(),
None,
);
} }
coord.emit_rebuild_queue_snapshot();
Ok(()) Ok(())
} }

View file

@ -1571,82 +1571,18 @@ async fn post_meta_update(
if inputs.is_empty() { if inputs.is_empty() {
return error_response("meta-update: no inputs selected"); return error_response("meta-update: no inputs selected");
} }
let coord = state.coord.clone(); state.coord.rebuild_queue.enqueue_with_inputs(
let inputs_clone = inputs.clone(); crate::rebuild_queue::QueueKind::MetaUpdate,
tokio::spawn(async move { "hyperhive".to_owned(),
run_meta_update(&coord, &inputs_clone).await; crate::rebuild_queue::QueueSource::Manual,
// Lock file changed — emit so dashboards refresh the format!("meta-update via dashboard ({})", inputs.join(", ")),
// meta-inputs panel without a snapshot poll. None,
emit_meta_inputs_snapshot(&coord); inputs,
}); );
state.coord.emit_rebuild_queue_snapshot();
(StatusCode::OK, "ok").into_response() (StatusCode::OK, "ok").into_response()
} }
/// Background task: run `nix flake update <inputs>` in meta + commit,
/// then rebuild every agent whose input was touched (or all agents
/// when `hyperhive` was bumped, since that's the shared base). Each
/// rebuild fires `Rebuilt { ok, note, ... }` to the manager so the
/// operator and manager get the same feedback they'd see from an
/// auto-update / manual dashboard rebuild.
///
/// `inputs` are slash-paths from the meta root (`hyperhive`,
/// `hyperhive/nixpkgs`, `agent-coder`, `agent-coder/mcp-matrix`, ...).
/// Several inputs may name the same agent; each agent is rebuilt at
/// most once per run, in the order it was first mentioned.
///
/// Failures are logged and never propagated: a failed lock update
/// aborts the run, a failed per-agent rebuild only skips that agent.
async fn run_meta_update(coord: &Arc<crate::coordinator::Coordinator>, inputs: &[String]) {
    // Held for the whole run (incl. the early `return` on lock failure):
    // emits `MetaUpdateRunning { running: true }` now and `false` on
    // drop so the META INPUTS panel shows progress (issue #259).
    let _progress = coord.meta_update_guard();
    tracing::info!(?inputs, "meta-update: starting");
    if let Err(e) = crate::meta::lock_update(inputs).await {
        tracing::warn!(error = ?e, "meta-update: lock_update failed");
        return;
    }
    // Decide which agents to rebuild. Anything in the hyperhive
    // subtree affects every agent (shared base); anything in
    // `agent-<n>/...` only the named agent.
    let touched_hyperhive = inputs
        .iter()
        .any(|i| i == "hyperhive" || i.starts_with("hyperhive/"));
    let agents_to_rebuild: Vec<String> = if touched_hyperhive {
        crate::lifecycle::list()
            .await
            .unwrap_or_default()
            .into_iter()
            .filter_map(|c| {
                if c == crate::lifecycle::MANAGER_NAME {
                    Some(crate::lifecycle::MANAGER_NAME.to_owned())
                } else {
                    c.strip_prefix(crate::lifecycle::AGENT_PREFIX)
                        .map(str::to_owned)
                }
            })
            .collect()
    } else {
        // `agent-coder` and `agent-coder/mcp-matrix` both resolve to
        // `coder`; collapse them so the agent is not rebuilt twice.
        // A linear `any` scan is fine: the list is a handful of names.
        let mut touched_agents: Vec<String> = Vec::new();
        for input in inputs {
            let Some(rest) = input.strip_prefix("agent-") else {
                continue;
            };
            let agent = rest.split('/').next().unwrap_or(rest);
            if !touched_agents.iter().any(|seen| seen == agent) {
                touched_agents.push(agent.to_owned());
            }
        }
        touched_agents
    };
    let current_rev =
        crate::auto_update::current_flake_rev(&coord.hyperhive_flake).unwrap_or_default();
    // Sequential rebuild loop — the META_LOCK guards meta-side
    // races but parallel nix builds also serialise via nix-daemon,
    // so sequential is just as fast in practice and keeps logs
    // readable.
    for name in agents_to_rebuild {
        tracing::info!(%name, "meta-update: rebuilding agent");
        if let Err(e) = crate::auto_update::rebuild_agent(coord, &name, &current_rev).await {
            tracing::warn!(%name, error = ?e, "meta-update: rebuild failed");
            // continue: surface each per-agent failure via its own
            // Rebuilt event; don't abort the whole batch.
        }
    }
    tracing::info!("meta-update: done");
}
async fn post_op_send(State(state): State<AppState>, Form(form): Form<OpSendForm>) -> Response { async fn post_op_send(State(state): State<AppState>, Form(form): Form<OpSendForm>) -> Response {
let to = form.to.trim().to_owned(); let to = form.to.trim().to_owned();
let body = form.body.trim().to_owned(); let body = form.body.trim().to_owned();
@ -1708,28 +1644,16 @@ async fn post_request_spawn(
} }
async fn post_rebuild(State(state): State<AppState>, AxumPath(name): AxumPath<String>) -> Response { async fn post_rebuild(State(state): State<AppState>, AxumPath(name): AxumPath<String>) -> Response {
let Some(current_rev) = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake) let logical = strip_container_prefix(&name);
else { state.coord.rebuild_queue.enqueue(
return error_response( crate::rebuild_queue::QueueKind::Rebuild,
"rebuild: hyperhive_flake has no canonical path; manual rebuild only via `hive-c0re rebuild`", logical,
); crate::rebuild_queue::QueueSource::Manual,
}; "manual via dashboard ↻ R3BU1LD button".to_owned(),
let coord = state.coord.clone(); None,
lifecycle_action( );
&state, state.coord.emit_rebuild_queue_snapshot();
&name, (StatusCode::OK, "ok").into_response()
crate::coordinator::TransientKind::Rebuilding,
"rebuild",
move |n| {
let coord = coord.clone();
let rev = current_rev.clone();
async move { crate::auto_update::rebuild_agent(&coord, &n, &rev).await }
},
// rebuild_agent fires kick_agent on success itself, so the
// extra-closure is a no-op here.
|_, _| {},
)
.await
} }
/// Common shape for the simple lifecycle action handlers (start / /// Common shape for the simple lifecycle action handlers (start /
@ -1816,12 +1740,7 @@ async fn post_start(State(state): State<AppState>, AxumPath(name): AxumPath<Stri
} }
async fn post_update_all(State(state): State<AppState>) -> Response { async fn post_update_all(State(state): State<AppState>) -> Response {
let Some(current_rev) = crate::auto_update::current_flake_rev(&state.coord.hyperhive_flake)
else {
return error_response("update-all: hyperhive_flake has no canonical path");
};
let containers = lifecycle::list().await.unwrap_or_default(); let containers = lifecycle::list().await.unwrap_or_default();
let mut errors = Vec::new();
for container in containers { for container in containers {
let logical = if container == lifecycle::MANAGER_NAME { let logical = if container == lifecycle::MANAGER_NAME {
lifecycle::MANAGER_NAME.to_owned() lifecycle::MANAGER_NAME.to_owned()
@ -1830,21 +1749,16 @@ async fn post_update_all(State(state): State<AppState>) -> Response {
} else { } else {
continue; continue;
}; };
if let Err(e) = state.coord.rebuild_queue.enqueue(
crate::auto_update::rebuild_agent(&state.coord, &logical, &current_rev).await crate::rebuild_queue::QueueKind::Rebuild,
{ logical,
errors.push(format!("{logical}: {e:#}")); crate::rebuild_queue::QueueSource::Manual,
} "manual via dashboard 🌀 UPDATE ALL".to_owned(),
} None,
if errors.is_empty() { );
// Each rebuild_agent rescanned; no extra refetch needed.
(StatusCode::OK, "ok").into_response()
} else {
error_response(&format!(
"update-all partial failure:\n{}",
errors.join("\n")
))
} }
state.coord.emit_rebuild_queue_snapshot();
(StatusCode::OK, "ok").into_response()
} }
fn transient_label(k: crate::coordinator::TransientKind) -> &'static str { fn transient_label(k: crate::coordinator::TransientKind) -> &'static str {

View file

@ -291,25 +291,16 @@ async fn dispatch(req: &ManagerRequest, coord: &Arc<Coordinator>) -> ManagerResp
} }
} }
ManagerRequest::Update { name } => { ManagerRequest::Update { name } => {
tracing::info!(%name, "manager: update"); tracing::info!(%name, "manager: enqueue update");
let Some(current_rev) = crate::auto_update::current_flake_rev(&coord.hyperhive_flake) coord.rebuild_queue.enqueue(
else { crate::rebuild_queue::QueueKind::Rebuild,
return ManagerResponse::Err { name.to_owned(),
message: "update: hyperhive_flake has no canonical path".into(), crate::rebuild_queue::QueueSource::Manual,
}; "manager `update` tool".to_owned(),
}; None,
let guard = coord.transient_guard(name, crate::coordinator::TransientKind::Rebuilding); );
let result = crate::auto_update::rebuild_agent(coord, name, &current_rev).await; coord.emit_rebuild_queue_snapshot();
drop(guard); ManagerResponse::Ok
match result {
Ok(()) => {
coord.kick_agent(name, "container rebuilt");
ManagerResponse::Ok
}
Err(e) => ManagerResponse::Err {
message: format!("{e:#}"),
},
}
} }
ManagerRequest::RequestUpdateMetaInputs { ManagerRequest::RequestUpdateMetaInputs {
inputs, inputs,