rebuild_queue: wire worker into coordinator + dashboard event
This commit is contained in:
parent
5890e6796a
commit
37f6bc4b6b
4 changed files with 229 additions and 0 deletions
|
|
@ -169,6 +169,12 @@ pub struct QueueEntry {
|
|||
/// string (already truncated to a reasonable length by the caller).
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
/// `MetaUpdate`-only payload: the list of meta flake inputs to run
|
||||
/// through `nix flake update`. Empty / absent on `Rebuild` /
|
||||
/// `Spawn` / `Destroy` entries; absent on the wire (never
|
||||
/// serialised) when the entry kind doesn't have meaningful inputs.
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub inputs: Vec<String>,
|
||||
}
|
||||
|
||||
/// How many terminal-state entries (`Done` / `Failed` / `Cancelled`)
|
||||
|
|
@ -226,6 +232,21 @@ impl RebuildQueue {
|
|||
source: QueueSource,
|
||||
reason: String,
|
||||
parent_id: Option<u64>,
|
||||
) -> u64 {
|
||||
self.enqueue_with_inputs(kind, agent, source, reason, parent_id, Vec::new())
|
||||
}
|
||||
|
||||
/// Same as `enqueue` but carries an `inputs` payload — used by
|
||||
/// `MetaUpdate` enqueues to tell the worker which meta-flake
|
||||
/// inputs to bump.
|
||||
pub fn enqueue_with_inputs(
|
||||
&self,
|
||||
kind: QueueKind,
|
||||
agent: String,
|
||||
source: QueueSource,
|
||||
reason: String,
|
||||
parent_id: Option<u64>,
|
||||
inputs: Vec<String>,
|
||||
) -> u64 {
|
||||
let mut inner = self.inner.lock().expect("rebuild_queue mutex poisoned");
|
||||
// Dedup against a pending entry with the same (kind, agent).
|
||||
|
|
@ -251,6 +272,7 @@ impl RebuildQueue {
|
|||
started_at: None,
|
||||
finished_at: None,
|
||||
error: None,
|
||||
inputs,
|
||||
};
|
||||
inner.entries.push_back(entry);
|
||||
// Wake the worker. `notify_one` is a no-op when there's no
|
||||
|
|
@ -336,6 +358,169 @@ impl RebuildQueue {
|
|||
}
|
||||
}
|
||||
|
||||
/// Background worker that drains the queue. Spawned once at hive-c0re
|
||||
/// startup from `main.rs`. Loops forever:
|
||||
/// 1. Pop the next `Queued` entry (`take_next` marks it `Running` and
|
||||
/// fires a `RebuildQueueChanged` snapshot via the caller).
|
||||
/// 2. Dispatch by kind — single-agent rebuild, meta-update + cascade,
|
||||
/// or first-spawn.
|
||||
/// 3. Mark the entry terminal (`finish`) and emit another snapshot.
|
||||
/// 4. When the queue is empty, `await` on `notify` until something
|
||||
/// new lands.
|
||||
///
|
||||
/// Shutdown semantics: subscribes to `coord.shutdown_rx()`. On a true
|
||||
/// signal the worker exits after its current entry finishes; pending
|
||||
/// `Queued` entries are dropped (they'll either be replayed by the
|
||||
/// startup sweep on next boot or left for an operator to re-queue).
|
||||
pub async fn run_worker(coord: std::sync::Arc<crate::coordinator::Coordinator>) {
|
||||
let mut shutdown = coord.shutdown_rx();
|
||||
loop {
|
||||
// Drain everything available now.
|
||||
while let Some(entry) = coord.rebuild_queue.take_next() {
|
||||
coord.emit_rebuild_queue_snapshot();
|
||||
tracing::info!(
|
||||
id = entry.id,
|
||||
kind = entry.kind.as_str(),
|
||||
agent = %entry.agent,
|
||||
source = entry.source.as_str(),
|
||||
"rebuild_queue: running"
|
||||
);
|
||||
let result = dispatch(&coord, &entry).await;
|
||||
match result {
|
||||
Ok(()) => {
|
||||
coord.rebuild_queue.finish(entry.id, QueueState::Done, None);
|
||||
tracing::info!(id = entry.id, "rebuild_queue: done");
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = format!("{e:#}");
|
||||
let truncated = if msg.len() > 2_000 {
|
||||
format!("{}…", &msg[..2_000])
|
||||
} else {
|
||||
msg.clone()
|
||||
};
|
||||
coord
|
||||
.rebuild_queue
|
||||
.finish(entry.id, QueueState::Failed, Some(truncated));
|
||||
tracing::warn!(id = entry.id, error = %msg, "rebuild_queue: failed");
|
||||
}
|
||||
}
|
||||
coord.emit_rebuild_queue_snapshot();
|
||||
}
|
||||
// Park until something new is enqueued OR shutdown fires.
|
||||
tokio::select! {
|
||||
biased;
|
||||
res = shutdown.changed() => {
|
||||
if res.is_err() || *shutdown.borrow() {
|
||||
tracing::info!("rebuild_queue: worker exiting on shutdown");
|
||||
return;
|
||||
}
|
||||
}
|
||||
_ = coord.rebuild_queue.notify.notified() => {
|
||||
// New entry — back to the drain loop.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run a single queue entry to completion. Kind-dispatched; failures
|
||||
/// bubble up to the worker which marks the entry `Failed`.
|
||||
async fn dispatch(
|
||||
coord: &std::sync::Arc<crate::coordinator::Coordinator>,
|
||||
entry: &QueueEntry,
|
||||
) -> anyhow::Result<()> {
|
||||
match entry.kind {
|
||||
QueueKind::Rebuild => {
|
||||
let current_rev = crate::auto_update::current_flake_rev(&coord.hyperhive_flake)
|
||||
.unwrap_or_default();
|
||||
crate::auto_update::rebuild_agent(coord, &entry.agent, ¤t_rev).await
|
||||
}
|
||||
QueueKind::MetaUpdate => run_meta_update(coord, entry).await,
|
||||
QueueKind::Spawn => {
|
||||
// First-deploy spawns route through `actions::approve_spawn`
|
||||
// / `actions::approve_apply_commit` today; they enqueue a
|
||||
// Spawn entry only to claim the queue slot, the actual
|
||||
// spawn work runs inside those handlers before completion.
|
||||
// Keeping this arm a no-op so we don't double-run.
|
||||
tracing::debug!(
|
||||
id = entry.id,
|
||||
agent = %entry.agent,
|
||||
"rebuild_queue: Spawn entry is a queue claim; actual work elsewhere"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
QueueKind::Destroy => {
|
||||
// Reserved for future `destroy --purge` integration.
|
||||
anyhow::bail!("Destroy kind not yet implemented in rebuild_queue worker");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run one `MetaUpdate` entry: bump the meta flake's locks for the
|
||||
/// requested inputs, then enqueue a cascade of `Rebuild` entries
|
||||
/// (with `parent_id` set to this entry's id) for every agent affected
|
||||
/// by the bump. Mirrors the previous `dashboard::run_meta_update`
|
||||
/// semantics; that path now enqueues into this queue rather than
|
||||
/// running the bump + rebuild loop inline.
|
||||
async fn run_meta_update(
|
||||
coord: &std::sync::Arc<crate::coordinator::Coordinator>,
|
||||
entry: &QueueEntry,
|
||||
) -> anyhow::Result<()> {
|
||||
let _progress = coord.meta_update_guard();
|
||||
let inputs = entry.inputs.clone();
|
||||
tracing::info!(?inputs, parent = entry.id, "rebuild_queue: meta-update starting");
|
||||
if inputs.is_empty() {
|
||||
crate::meta::lock_update(&[]).await?;
|
||||
} else {
|
||||
crate::meta::lock_update(&inputs).await?;
|
||||
}
|
||||
|
||||
// Decide which agents to rebuild. Same logic as the previous
|
||||
// `run_meta_update` — anything in the hyperhive subtree affects
|
||||
// every agent; anything in `agent-<n>/...` only the named agent.
|
||||
let touched_hyperhive = inputs
|
||||
.iter()
|
||||
.any(|i| i == "hyperhive" || i.starts_with("hyperhive/"));
|
||||
let touched_agents: Vec<String> = inputs
|
||||
.iter()
|
||||
.filter_map(|i| i.strip_prefix("agent-"))
|
||||
.map(|rest| rest.split('/').next().unwrap_or(rest).to_owned())
|
||||
.collect();
|
||||
let agents_to_rebuild: Vec<String> = if touched_hyperhive || inputs.is_empty() {
|
||||
crate::lifecycle::list()
|
||||
.await
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|c| {
|
||||
if c == crate::lifecycle::MANAGER_NAME {
|
||||
Some(crate::lifecycle::MANAGER_NAME.to_owned())
|
||||
} else {
|
||||
c.strip_prefix(crate::lifecycle::AGENT_PREFIX).map(str::to_owned)
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
touched_agents
|
||||
};
|
||||
|
||||
let reason_hint = if inputs.is_empty() {
|
||||
"meta-update cascade (all inputs)".to_owned()
|
||||
} else {
|
||||
format!("meta-update cascade ({})", inputs.join(", "))
|
||||
};
|
||||
for name in agents_to_rebuild {
|
||||
coord.rebuild_queue.enqueue(
|
||||
QueueKind::Rebuild,
|
||||
name,
|
||||
QueueSource::MetaUpdate,
|
||||
reason_hint.clone(),
|
||||
Some(entry.id),
|
||||
);
|
||||
}
|
||||
// Lock file changed — meta-inputs panel re-renders.
|
||||
crate::dashboard::emit_meta_inputs_snapshot(coord.as_ref());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Current unix timestamp in seconds. `now()` calls are pulled into a
|
||||
/// helper so tests can swap them out later.
|
||||
fn now_unix() -> i64 {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue