dashboard: meta flake inputs UI + sequential rebuild loop

new section 'M3T4 1NPUTS' between approvals and message flow:
one row per input in meta/flake.lock (hyperhive first, then
agent-<n> alphabetically). each row shows the input name, the
first 12 chars of the locked sha, a relative timestamp from
locked.lastModified, and the original.url when available.
checkbox per row; submit button is disabled until at least one
box is checked; submitting confirms then POSTs the selected
names to /meta-update.

backend:
- meta::lock_update(inputs: &[String]) — runs 'nix flake update
  <names>' in the meta dir, commits the lock change with a
  combined message ('lock update: hyperhive, agent-coder').
  preserves the existing META_LOCK serialization. existing
  lock_update_for_rebuild / lock_update_hyperhive stay for
  their single-input callers.
- POST /meta-update — comma-separated 'inputs' form field
  (JS joins checkboxes since axum::Form doesn't natively
  decode repeated keys); spawns a background task that runs
  the lock update + per-agent rebuild loop. hyperhive
  selection fans out to all agents; agent-<n> selection only
  rebuilds <n>. each rebuild fires Rebuilt to the manager
  exactly like dashboard / admin-CLI / auto-update.

rebuild loop is sequential — auto_update::run is now sequential
too (it was parallel via tokio::spawn). parallel rebuilds collide
on nix-store's sqlite cache ('sqlite db busy, not using cache')
and contend for the meta META_LOCK. nix-daemon serializes the
heavy build steps anyway, so this isn't a throughput loss.
This commit is contained in:
müde 2026-05-16 03:38:07 +02:00
parent 891223219e
commit 266c2c7a77
6 changed files with 331 additions and 18 deletions

View file

@ -57,6 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/api/agent-config/{name}", get(get_agent_config))
.route("/request-spawn", post(post_request_spawn))
.route("/op-send", post(post_op_send))
.route("/meta-update", post(post_meta_update))
.route("/messages/stream", get(messages_stream))
.with_state(AppState { coord });
let addr = SocketAddr::from(([0, 0, 0, 0], port));
@ -154,6 +155,9 @@ struct StateSnapshot {
/// least one other agent. Operator resolves by renaming. The
/// dashboard renders a banner at the top listing each cluster.
port_conflicts: Vec<PortConflict>,
/// Inputs in `meta/flake.lock` the operator can selectively
/// `nix flake update`. Hyperhive first, then `agent-<n>` rows.
meta_inputs: Vec<MetaInputView>,
}
#[derive(Serialize)]
@ -280,6 +284,7 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
transients,
approvals,
approval_history,
meta_inputs: read_meta_inputs(),
operator_inbox,
questions,
tombstones,
@ -360,7 +365,33 @@ async fn build_container_views(
/// yields an empty map so the dashboard degrades gracefully when the
/// meta repo hasn't been seeded yet.
fn read_meta_locked_revs() -> std::collections::HashMap<String, String> {
    // Thin projection over `read_meta_inputs()`: keep only the
    // (input name, locked rev) pair of every row. An empty input
    // list collects into an empty map, so callers never see an error.
    read_meta_inputs()
        .into_iter()
        .map(|input| (input.name, input.rev))
        .collect()
}
/// One locked input of the meta flake, as serialized to the
/// dashboard for the meta-inputs section.
#[derive(Debug, Clone, Serialize)]
struct MetaInputView {
    /// Input key in meta's `flake.nix` — `hyperhive`, `agent-<n>`, etc.
    name: String,
    /// Full locked sha (`locked.rev`). Not displayed verbatim; the
    /// dashboard truncates to the first 12 chars for the chip.
    rev: String,
    /// Unix seconds — `locked.lastModified`. Drives the relative
    /// "2h ago" timestamp on each input row. `0` when the lock entry
    /// carries no `lastModified`.
    last_modified: i64,
    /// `original.url` if available, for the tooltip / row meta text.
    /// Left out of the serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    url: Option<String>,
}
/// Walk `flake.lock`'s `nodes` map → `Vec<MetaInputView>`. Only
/// includes nodes the root depends on (i.e. real inputs), skipping
/// the synthetic `root` entry. Sorted with `hyperhive` first then
/// alphabetically so the UI's top entry is the swarm-wide base.
fn read_meta_inputs() -> Vec<MetaInputView> {
let mut out = Vec::new();
let Ok(raw) = std::fs::read_to_string("/var/lib/hyperhive/meta/flake.lock") else {
return out;
};
@ -370,15 +401,48 @@ fn read_meta_locked_revs() -> std::collections::HashMap<String, String> {
let Some(nodes) = json.get("nodes").and_then(|v| v.as_object()) else {
return out;
};
let Some(root_name) = json.get("root").and_then(|v| v.as_str()) else {
return out;
};
let root_inputs: std::collections::BTreeSet<String> = nodes
.get(root_name)
.and_then(|n| n.get("inputs"))
.and_then(|v| v.as_object())
.map(|m| m.keys().cloned().collect())
.unwrap_or_default();
for (name, node) in nodes {
if let Some(rev) = node
.get("locked")
if !root_inputs.contains(name) {
continue;
}
let locked = node.get("locked");
let Some(rev) = locked
.and_then(|v| v.get("rev"))
.and_then(|v| v.as_str())
{
out.insert(name.clone(), rev.to_owned());
}
else {
continue;
};
let last_modified = locked
.and_then(|v| v.get("lastModified"))
.and_then(serde_json::Value::as_i64)
.unwrap_or(0);
let url = node
.get("original")
.and_then(|v| v.get("url"))
.and_then(|v| v.as_str())
.map(str::to_owned);
out.push(MetaInputView {
name: name.clone(),
rev: rev.to_owned(),
last_modified,
url,
});
}
// hyperhive first, then alphabetical.
out.sort_by(|a, b| match (a.name.as_str(), b.name.as_str()) {
("hyperhive", _) => std::cmp::Ordering::Less,
(_, "hyperhive") => std::cmp::Ordering::Greater,
_ => a.name.cmp(&b.name),
});
out
}
@ -784,6 +848,96 @@ struct OpSendForm {
body: String,
}
/// Form for `POST /meta-update`. Inputs ride in as a comma-separated
/// list under the `inputs` field — the JS submitter joins the
/// checked boxes since axum's `Form` extractor doesn't natively
/// decode repeated keys without a helper.
#[derive(Deserialize)]
struct MetaUpdateForm {
/// Comma-separated meta flake input names, e.g.
/// `hyperhive,agent-coder`. The handler splits on `,`, trims each
/// entry and drops blanks.
inputs: String,
}
/// Bulk-update selected meta flake inputs, then rebuild the affected
/// agents in the background.
///
/// `form.inputs` is a comma-separated list of input names. Entries
/// are trimmed, blanks are dropped and duplicates are collapsed
/// (first occurrence wins). A name starting with `-` is rejected,
/// because the names end up as arguments to `nix flake update` and
/// must not be readable as options.
///
/// Responds with `error_response(..)` when the selection is empty or
/// contains an invalid name. Otherwise spawns `run_meta_update` as a
/// detached task and redirects to `/` right away; the dashboard
/// polls for progress via container `pending` spinners + the
/// meta-inputs row sha update.
///
/// NOTE(review): whether selecting an input that is already at its
/// latest sha ends up as a no-op (no commit, no rebuild) is decided
/// by `meta::lock_update` / `auto_update::rebuild_agent`, not here —
/// this handler always queues the task.
async fn post_meta_update(
    State(state): State<AppState>,
    Form(form): Form<MetaUpdateForm>,
) -> Response {
    let mut inputs: Vec<String> = Vec::new();
    for name in form.inputs.split(',').map(str::trim) {
        if name.is_empty() || inputs.iter().any(|seen| seen == name) {
            continue;
        }
        // Defence in depth against option injection into the nix
        // command line; no real input name starts with a dash.
        if name.starts_with('-') {
            return error_response("meta-update: invalid input name");
        }
        inputs.push(name.to_owned());
    }
    if inputs.is_empty() {
        return error_response("meta-update: no inputs selected");
    }
    let coord = state.coord.clone();
    // `inputs` is moved into the task; nothing below needs it.
    tokio::spawn(async move {
        run_meta_update(&coord, &inputs).await;
    });
    Redirect::to("/").into_response()
}
/// Background task: run `nix flake update <inputs>` in meta + commit,
/// then rebuild every agent whose input was touched (or all agents
/// when `hyperhive` was bumped, since that's the shared base). Each
/// rebuild fires `Rebuilt { ok, note, ... }` to the manager so the
/// operator and manager get the same feedback they'd see from an
/// auto-update / manual dashboard rebuild.
async fn run_meta_update(coord: &Arc<crate::coordinator::Coordinator>, inputs: &[String]) {
tracing::info!(?inputs, "meta-update: starting");
if let Err(e) = crate::meta::lock_update(inputs).await {
tracing::warn!(error = ?e, "meta-update: lock_update failed");
return;
}
// Decide which agents to rebuild.
let touched_hyperhive = inputs.iter().any(|i| i == "hyperhive");
let touched_agents: Vec<String> = inputs
.iter()
.filter_map(|i| i.strip_prefix("agent-").map(str::to_owned))
.collect();
let agents_to_rebuild: Vec<String> = if touched_hyperhive {
crate::lifecycle::list()
.await
.unwrap_or_default()
.into_iter()
.filter_map(|c| {
if c == crate::lifecycle::MANAGER_NAME {
Some(crate::lifecycle::MANAGER_NAME.to_owned())
} else {
c.strip_prefix(crate::lifecycle::AGENT_PREFIX)
.map(str::to_owned)
}
})
.collect()
} else {
touched_agents
};
let current_rev = crate::auto_update::current_flake_rev(&coord.hyperhive_flake)
.unwrap_or_default();
// Sequential rebuild loop — the META_LOCK guards meta-side
// races but parallel nix builds also serialise via nix-daemon,
// so sequential is just as fast in practice and keeps logs
// readable.
for name in agents_to_rebuild {
tracing::info!(%name, "meta-update: rebuilding agent");
if let Err(e) = crate::auto_update::rebuild_agent(coord, &name, &current_rev).await {
tracing::warn!(%name, error = ?e, "meta-update: rebuild failed");
// continue: surface each per-agent failure via its own
// Rebuilt event; don't abort the whole batch.
}
}
tracing::info!("meta-update: done");
}
async fn post_op_send(State(state): State<AppState>, Form(form): Form<OpSendForm>) -> Response {
let to = form.to.trim().to_owned();
let body = form.body.trim().to_owned();