dashboard: meta flake inputs UI + sequential rebuild loop

new section 'M3T4 1NPUTS' between approvals and message flow:
one row per input in meta/flake.lock (hyperhive first, then
agent-<n> alphabetically). each row shows the input name, the
first 12 chars of the locked sha, a relative timestamp from
locked.lastModified, and the original.url when available.
checkbox per row; submit button is disabled until at least one
box is checked; submitting confirms then POSTs the selected
names to /meta-update.

backend:
- meta::lock_update(inputs: &[String]) — runs 'nix flake update
  <names>' in the meta dir, commits the lock change with a
  combined message ('lock update: hyperhive, agent-coder').
  preserves the existing META_LOCK serialization. existing
  lock_update_for_rebuild / lock_update_hyperhive stay for
  their single-input callers.
- POST /meta-update — comma-separated 'inputs' form field
  (JS joins checkboxes since axum::Form doesn't natively
  decode repeated keys); spawns a background task that runs
  the lock update + per-agent rebuild loop. hyperhive
  selection fans out to all agents; agent-<n> selection only
  rebuilds <n>. each rebuild fires Rebuilt to the manager
  exactly like dashboard / admin-CLI / auto-update.

rebuild loop is sequential — auto_update::run too (was
parallel via tokio::spawn). parallel rebuilds collide on
nix-store's sqlite cache ('sqlite db busy, not using cache')
and contend on meta's META_LOCK. nix-daemon serializes the
heavy build steps anyway, so this isn't a throughput loss.
This commit is contained in:
müde 2026-05-16 03:38:07 +02:00
parent 891223219e
commit 266c2c7a77
6 changed files with 331 additions and 18 deletions

View file

@ -760,6 +760,77 @@
return Math.floor(ageSec / 86400) + 'd ago';
}
// Render the "meta inputs" form into #meta-inputs-section: one
// checkbox row per entry of `s.meta_inputs` (name, first 12 chars of
// the rev, relative timestamp, optional truncated url) plus a submit
// button that POSTs the checked names to /meta-update.
//
// The section is rebuilt from scratch on every call, so the boxes
// the operator has ticked are captured before the old DOM is dropped
// and re-applied to the fresh rows. Without that, a re-render
// between ticking a box and pressing submit would clear the
// selection and re-disable the button.
function renderMetaInputs(s) {
  const root = $('meta-inputs-section');
  if (!root) return;
  // Names that were ticked in the DOM we are about to throw away.
  const wasChecked = new Set(
    Array.from(root.querySelectorAll('input[data-meta-input]:checked'))
      .map((b) => b.dataset.metaInput),
  );
  root.innerHTML = '';
  const inputs = s.meta_inputs || [];
  if (!inputs.length) {
    root.append(el('p', { class: 'empty' }, 'meta repo not seeded yet'));
    return;
  }
  const form = el('form', {
    method: 'POST',
    action: '/meta-update',
    class: 'meta-inputs-form',
    'data-async': '',
    'data-confirm': 'update selected meta flake inputs + rebuild affected agents?',
  });
  const ul = el('ul', { class: 'meta-inputs' });
  for (const inp of inputs) {
    const li = el('li');
    // DOM ids get anything outside [a-z0-9-] replaced with '_'.
    const id = 'meta-input-' + inp.name.replace(/[^a-z0-9-]/gi, '_');
    const cb = el('input', {
      type: 'checkbox',
      name: 'meta_input_' + inp.name,
      id,
      value: inp.name,
      'data-meta-input': inp.name,
    });
    // Carry the previous selection over to the rebuilt row. Inputs
    // that vanished from the lock simply drop out of the selection.
    if (wasChecked.has(inp.name)) cb.checked = true;
    const label = el('label', { for: id });
    label.append(
      cb,
      el('span', { class: 'meta-input-name' }, inp.name), ' ',
      el('code', { class: 'meta-input-rev' }, inp.rev.slice(0, 12)), ' ',
      el('span', { class: 'meta-input-ts' }, fmtAgo(inp.last_modified)),
    );
    if (inp.url) {
      label.append(' ', el('span', { class: 'meta-input-url', title: inp.url },
        '· ' + truncate(inp.url, 48)));
    }
    li.append(label);
    ul.append(li);
  }
  form.append(ul);
  // Hidden input the POST handler reads — populated at submit
  // time from the checkbox states. axum's Form extractor doesn't
  // natively decode repeated keys, so we join into one CSV.
  const hidden = el('input', { type: 'hidden', name: 'inputs', value: '' });
  form.append(hidden);
  const btn = el('button', {
    type: 'submit',
    class: 'btn btn-meta-update',
    disabled: '',
  }, '◆ UPD4TE & R3BU1LD');
  form.append(btn);
  // The button is only clickable while at least one box is ticked.
  function refreshDisabled() {
    const any = form.querySelectorAll('input[data-meta-input]:checked').length > 0;
    if (any) btn.removeAttribute('disabled');
    else btn.setAttribute('disabled', '');
  }
  form.addEventListener('change', refreshDisabled);
  form.addEventListener('submit', () => {
    const selected = Array.from(form.querySelectorAll('input[data-meta-input]:checked'))
      .map((b) => b.dataset.metaInput);
    hidden.value = selected.join(',');
  });
  // Sync the button with any selection restored above.
  refreshDisabled();
  root.append(form);
}
// Shorten `s` for display: strings of length <= n come back
// untouched; longer ones keep their first n - 1 characters and get
// a single '…' appended.
function truncate(s, n) {
  if (s.length <= n) {
    return s;
  }
  const head = s.slice(0, n - 1);
  return head + '…';
}
// ─── state polling ──────────────────────────────────────────────────────
let pollTimer = null;
// Sections whose innerHTML gets blown away on each refresh. If the
@ -771,6 +842,7 @@
'questions-section',
'inbox-section',
'approvals-section',
'meta-inputs-section',
];
// <details> sections that should survive a refresh need a stable
// `data-restore-key` attribute. snapshotOpenDetails walks managed
@ -833,6 +905,7 @@
renderQuestions(s);
renderInbox(s);
renderApprovals(s);
renderMetaInputs(s);
restoreOpenDetails(openDetails);
notifyDeltas(s);
// Auto-refresh: fast (2s) while a spawn or a per-container

View file

@ -288,6 +288,56 @@ code {
.glyph-approved { color: var(--green); }
.glyph-denied { color: var(--red); }
.glyph-failed { color: var(--amber); }
/* ─── meta inputs section ─────────────────────────────────────────── */
/* Row list: an unstyled <ul> laid out as a grid so rows stack with a
   small uniform gap. */
.meta-inputs {
list-style: none;
padding: 0;
margin: 0 0 0.8em;
display: grid;
gap: 0.2em;
}
/* One bordered chip per flake input. */
.meta-inputs li {
padding: 0.25em 0.6em;
border: 1px solid var(--border);
background: rgba(24, 24, 37, 0.6);
}
/* The whole row is the <label>, so clicking anywhere on it toggles
   the checkbox. Children sit on a shared text baseline. */
.meta-inputs label {
display: flex;
align-items: baseline;
gap: 0.5em;
cursor: pointer;
font-size: 0.9em;
}
/* Per-field text styles: name is emphasised, rev/timestamp muted. */
.meta-input-name { color: var(--amber); font-weight: bold; }
.meta-input-rev { color: var(--muted); }
.meta-input-ts { color: var(--muted); font-size: 0.85em; }
/* URL is pushed to the right edge (margin-left: auto) and clipped to
   a single line with an ellipsis when it does not fit. */
.meta-input-url {
color: var(--muted);
font-size: 0.85em;
margin-left: auto;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
/* Submit button for the meta-inputs form. */
.btn-meta-update {
background: rgba(203, 166, 247, 0.12);
border: 1px solid var(--purple);
color: var(--purple);
text-shadow: 0 0 4px currentColor;
padding: 0.3em 1em;
font: inherit;
font-size: 0.85em;
letter-spacing: 0.08em;
cursor: pointer;
transition: box-shadow 0.15s ease, background 0.15s ease;
}
/* Hover glow only applies while the button is enabled. */
.btn-meta-update:hover:not([disabled]) {
background: rgba(203, 166, 247, 0.22);
box-shadow: 0 0 10px -2px currentColor;
}
/* Disabled state: dimmed, with a not-allowed cursor. */
.btn-meta-update[disabled] {
opacity: 0.35;
cursor: not-allowed;
}
.history-note {
margin-left: 1.8em;
margin-top: 0.2em;

View file

@ -47,6 +47,13 @@
<p class="meta">loading…</p>
</div>
<h2>◆ M3T4 1NPUTS ◆</h2>
<div class="divider">══════════════════════════════════════════════════════════════</div>
<p class="meta">select inputs to <code>nix flake update</code> in <code>/meta/</code>. selected agents rebuild in sequence after the lock bump; manager learns each outcome via the usual <code>rebuilt</code> system event.</p>
<div id="meta-inputs-section">
<p class="meta">loading…</p>
</div>
<h2>◆ MESS4GE FL0W ◆</h2>
<div class="divider">══════════════════════════════════════════════════════════════</div>
<p class="meta">live tail — newest at the top. tap on every <code>send</code> / <code>recv</code> through the broker. compose below: <code>@name</code> picks the recipient (sticky until you @ someone else); <code>tab</code> completes.</p>

View file

@ -182,7 +182,12 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
}
};
let mut tasks = Vec::new();
// Sequential, one agent at a time. Parallel rebuilds collide on
// nix-store's sqlite cache (the "sqlite db busy, not using
// cache" warning) and also race the meta-lock mutex; the
// resulting log interleave was bad enough on its own. Builds
// serialize on nix-daemon internally anyway, so this isn't a
// throughput loss in practice.
for container in containers {
// Manager and sub-agents share the same lifecycle now; both go
// through rebuild_agent with name-derived paths.
@ -198,17 +203,9 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
tracing::debug!(%name, "auto-update: up-to-date");
continue;
}
let coord = coord.clone();
let current_rev = current_rev.clone();
tasks.push(tokio::spawn(async move {
if let Err(e) = rebuild_agent(&coord, &name, &current_rev).await {
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
}
}));
}
for t in tasks {
let _ = t.await;
}
Ok(())
}

View file

@ -57,6 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/api/agent-config/{name}", get(get_agent_config))
.route("/request-spawn", post(post_request_spawn))
.route("/op-send", post(post_op_send))
.route("/meta-update", post(post_meta_update))
.route("/messages/stream", get(messages_stream))
.with_state(AppState { coord });
let addr = SocketAddr::from(([0, 0, 0, 0], port));
@ -154,6 +155,9 @@ struct StateSnapshot {
/// least one other agent. Operator resolves by renaming. The
/// dashboard renders a banner at the top listing each cluster.
port_conflicts: Vec<PortConflict>,
/// Inputs in `meta/flake.lock` the operator can selectively
/// `nix flake update`. Hyperhive first, then `agent-<n>` rows.
meta_inputs: Vec<MetaInputView>,
}
#[derive(Serialize)]
@ -280,6 +284,7 @@ async fn api_state(headers: HeaderMap, State(state): State<AppState>) -> axum::J
transients,
approvals,
approval_history,
meta_inputs: read_meta_inputs(),
operator_inbox,
questions,
tombstones,
@ -360,7 +365,33 @@ async fn build_container_views(
/// yields an empty map so the dashboard degrades gracefully when the
/// meta repo hasn't been seeded yet.
fn read_meta_locked_revs() -> std::collections::HashMap<String, String> {
let mut out = std::collections::HashMap::new();
read_meta_inputs()
.into_iter()
.map(|i| (i.name, i.rev))
.collect()
}
/// One selectable row of the dashboard's meta-inputs section: a
/// single root-level input taken from meta's `flake.lock`. Produced
/// by `read_meta_inputs` and serialized as part of the state
/// snapshot's `meta_inputs` list.
#[derive(Serialize, Clone)]
struct MetaInputView {
/// Input key in meta's `flake.nix` — `hyperhive`, `agent-<n>`, etc.
name: String,
/// Full locked sha (`locked.rev`). Not displayed verbatim; the
/// dashboard truncates to the first 12 chars for the chip.
rev: String,
/// Unix seconds — `locked.lastModified`. Drives the relative
/// "2h ago" timestamp on each input row. Falls back to `0` when
/// the lock entry has no integer `lastModified`.
last_modified: i64,
/// `original.url` if available, for the tooltip / row meta text.
/// Left out of the serialized output entirely when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
url: Option<String>,
}
/// Walk `flake.lock`'s `nodes` map → `Vec<MetaInputView>`. Only
/// includes nodes the root depends on (i.e. real inputs), skipping
/// the synthetic `root` entry. Sorted with `hyperhive` first then
/// alphabetically so the UI's top entry is the swarm-wide base.
fn read_meta_inputs() -> Vec<MetaInputView> {
let mut out = Vec::new();
let Ok(raw) = std::fs::read_to_string("/var/lib/hyperhive/meta/flake.lock") else {
return out;
};
@ -370,15 +401,48 @@ fn read_meta_locked_revs() -> std::collections::HashMap<String, String> {
let Some(nodes) = json.get("nodes").and_then(|v| v.as_object()) else {
return out;
};
let Some(root_name) = json.get("root").and_then(|v| v.as_str()) else {
return out;
};
let root_inputs: std::collections::BTreeSet<String> = nodes
.get(root_name)
.and_then(|n| n.get("inputs"))
.and_then(|v| v.as_object())
.map(|m| m.keys().cloned().collect())
.unwrap_or_default();
for (name, node) in nodes {
if let Some(rev) = node
.get("locked")
if !root_inputs.contains(name) {
continue;
}
let locked = node.get("locked");
let Some(rev) = locked
.and_then(|v| v.get("rev"))
.and_then(|v| v.as_str())
{
out.insert(name.clone(), rev.to_owned());
}
else {
continue;
};
let last_modified = locked
.and_then(|v| v.get("lastModified"))
.and_then(serde_json::Value::as_i64)
.unwrap_or(0);
let url = node
.get("original")
.and_then(|v| v.get("url"))
.and_then(|v| v.as_str())
.map(str::to_owned);
out.push(MetaInputView {
name: name.clone(),
rev: rev.to_owned(),
last_modified,
url,
});
}
// hyperhive first, then alphabetical.
out.sort_by(|a, b| match (a.name.as_str(), b.name.as_str()) {
("hyperhive", _) => std::cmp::Ordering::Less,
(_, "hyperhive") => std::cmp::Ordering::Greater,
_ => a.name.cmp(&b.name),
});
out
}
@ -784,6 +848,96 @@ struct OpSendForm {
body: String,
}
/// Form for `POST /meta-update`. Inputs ride in as a comma-separated
/// list under the `inputs` field — the JS submitter joins the
/// checked boxes since axum's `Form` extractor doesn't natively
/// decode repeated keys without a helper.
#[derive(Deserialize)]
struct MetaUpdateForm {
/// Comma-separated flake input names, e.g. `hyperhive,agent-coder`.
/// The handler splits on `,`, trims each piece and drops empty
/// ones, so stray whitespace or a trailing comma is tolerated.
inputs: String,
}
/// Bulk-update selected meta flake inputs, then rebuild the affected
/// agents in the background. Idempotent w.r.t. selection — choosing
/// an input that's already at the latest sha is a no-op (no commit,
/// no rebuild ripple). Returns immediately after queueing the work;
/// dashboard polls for progress via container `pending` spinners +
/// the meta-inputs row sha update.
async fn post_meta_update(
State(state): State<AppState>,
Form(form): Form<MetaUpdateForm>,
) -> Response {
let inputs: Vec<String> = form
.inputs
.split(',')
.map(|s| s.trim().to_owned())
.filter(|s| !s.is_empty())
.collect();
if inputs.is_empty() {
return error_response("meta-update: no inputs selected");
}
let coord = state.coord.clone();
let inputs_clone = inputs.clone();
tokio::spawn(async move {
run_meta_update(&coord, &inputs_clone).await;
});
Redirect::to("/").into_response()
}
/// Background task: run `nix flake update <inputs>` in meta + commit
/// (via `meta::lock_update`), then rebuild agents one at a time.
///
/// Which agents get rebuilt:
/// - if `hyperhive` is among `inputs`, every container returned by
///   `lifecycle::list()` that is either the manager or carries the
///   agent prefix (that input is the shared base);
/// - otherwise, only the `<n>` of each selected `agent-<n>` input,
///   de-duplicated in selection order.
///
/// A failing `lock_update` or container listing is logged and aborts
/// the whole run. A failing individual rebuild is logged and the loop
/// moves on to the next agent. The rebuild loop runs whenever
/// `lock_update` returns `Ok`, which includes the case where the lock
/// file did not change.
///
/// NOTE(review): `rebuild_agent` is expected to report each outcome
/// to the manager (the `Rebuilt` event) — that happens outside this
/// function; confirm there.
async fn run_meta_update(coord: &Arc<crate::coordinator::Coordinator>, inputs: &[String]) {
    tracing::info!(?inputs, "meta-update: starting");
    if let Err(e) = crate::meta::lock_update(inputs).await {
        tracing::warn!(error = ?e, "meta-update: lock_update failed");
        return;
    }
    // Decide which agents to rebuild.
    let agents_to_rebuild: Vec<String> = if inputs.iter().any(|i| i == "hyperhive") {
        // Don't let a listing failure masquerade as "no agents":
        // log it so the operator can see why nothing was rebuilt.
        let containers = match crate::lifecycle::list().await {
            Ok(containers) => containers,
            Err(e) => {
                tracing::warn!(
                    error = ?e,
                    "meta-update: listing containers failed; no agents rebuilt"
                );
                return;
            }
        };
        containers
            .into_iter()
            .filter_map(|c| {
                if c == crate::lifecycle::MANAGER_NAME {
                    Some(crate::lifecycle::MANAGER_NAME.to_owned())
                } else {
                    c.strip_prefix(crate::lifecycle::AGENT_PREFIX)
                        .map(str::to_owned)
                }
            })
            .collect()
    } else {
        // A name selected twice must not trigger two rebuilds.
        let mut seen = std::collections::HashSet::new();
        inputs
            .iter()
            .filter_map(|i| i.strip_prefix("agent-"))
            .filter(|name| seen.insert(*name))
            .map(str::to_owned)
            .collect()
    };
    let current_rev = crate::auto_update::current_flake_rev(&coord.hyperhive_flake)
        .unwrap_or_default();
    // Sequential rebuild loop — the META_LOCK guards meta-side
    // races but parallel nix builds also serialise via nix-daemon,
    // so sequential is just as fast in practice and keeps logs
    // readable.
    for name in agents_to_rebuild {
        tracing::info!(%name, "meta-update: rebuilding agent");
        if let Err(e) = crate::auto_update::rebuild_agent(coord, &name, &current_rev).await {
            // Keep going: one agent failing must not abort the batch.
            tracing::warn!(%name, error = ?e, "meta-update: rebuild failed");
        }
    }
    tracing::info!("meta-update: done");
}
async fn post_op_send(State(state): State<AppState>, Form(form): Form<OpSendForm>) -> Response {
let to = form.to.trim().to_owned();
let body = form.body.trim().to_owned();

View file

@ -184,6 +184,38 @@ pub async fn lock_update_for_rebuild(name: &str) -> Result<()> {
git_commit(&dir, &format!("rebuild {name}: lock update")).await
}
/// Update one or more named inputs in the meta flake and commit
/// the resulting lock change with a single combined message
/// (`lock update: <a>, <b>, ...`).
///
/// Used by the dashboard's "update meta inputs" form so the
/// operator can bulk-bump `hyperhive` + selected agents in one
/// shot. Each input name is passed verbatim to
/// `nix flake update`; the caller is responsible for passing only
/// real input keys (e.g. names read out of the lock file).
///
/// Holds `META_LOCK` for the whole update + commit. An empty
/// `inputs` slice returns `Ok(())` without taking the lock, and no
/// commit is made when the repo is still clean after the update.
///
/// # Errors
///
/// Propagates any failure from the `nix` / `git` helper calls.
pub async fn lock_update(inputs: &[String]) -> Result<()> {
    if inputs.is_empty() {
        return Ok(());
    }
    let _guard = META_LOCK.lock().await;
    let dir = meta_dir();
    let mut args: Vec<&str> = vec!["flake", "update"];
    args.extend(inputs.iter().map(String::as_str));
    nix(&dir, &args).await?;
    if git_is_clean(&dir).await? {
        // Everything was already up to date — nothing to commit.
        return Ok(());
    }
    git(&dir, &["add", "flake.lock"]).await?;
    // `join` on a single element yields just that element, so one
    // format covers both the single- and multi-input case.
    git_commit(&dir, &format!("lock update: {}", inputs.join(", "))).await
}
/// One-shot used by the auto-update path: pin the latest hyperhive
/// rev, commit if the lock changed. Cheaper than `sync_agents`
/// because the per-agent inputs aren't touched.