new DashboardEvent::TombstonesChanged + MetaInputsChanged carry full snapshots (lists are tiny; snapshot beats diff for race avoidance). Coordinator-side helpers emit_tombstones_snapshot + emit_meta_inputs_snapshot fire from every mutation site: actions::destroy + post_purge_tombstone + actions::approve (spawn finalise consumes tombstone) + run_meta_update + auto_update::rebuild_agent (lock bumps). client adds derived stores + apply* handlers + drops the post-submit refetch on PURG3 (container row + tombstone row) and meta-update. after this commit /api/state is fetched exactly once per page session (cold load); every other change rides the SSE channel.
448 lines
18 KiB
Rust
448 lines
18 KiB
Rust
//! Operations that are exposed through more than one surface (the host admin
|
|
//! socket *and* the dashboard's POST endpoints). Each function takes a
|
|
//! `&Coordinator` and the request parameters; callers stitch the response
|
|
//! shape they want (HTTP redirect vs JSON).
|
|
|
|
use std::sync::Arc;
|
|
|
|
use anyhow::{Result, bail};
|
|
use hive_sh4re::{ApprovalKind, ApprovalStatus, HelperEvent, MANAGER_AGENT};
|
|
|
|
use crate::coordinator::{Coordinator, TransientKind};
|
|
use crate::lifecycle::{self, MANAGER_NAME};
|
|
|
|
/// Approve a pending request and run the underlying action. Dispatches on the
|
|
/// approval kind:
|
|
/// - `ApplyCommit`: read agent.nix at the approval's commit from the proposed
|
|
/// repo, copy into the applied repo, commit there, rebuild the container.
|
|
/// Synchronous — returns once the rebuild completes.
|
|
/// - `Spawn`: create + start a brand-new sub-agent container. Runs in a
|
|
/// background task so the operator's approve click returns immediately;
|
|
/// the dashboard surfaces a transient `Spawning` state until the container
|
|
/// is up. On failure, the approval is marked failed.
|
|
///
|
|
/// In all cases an `ApprovalResolved` helper event lands in the manager's
|
|
/// inbox when the work resolves.
|
|
pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
|
|
let approval = coord.approvals.mark_approved(id)?;
|
|
tracing::info!(
|
|
%approval.id,
|
|
%approval.agent,
|
|
kind = ?approval.kind,
|
|
%approval.commit_ref,
|
|
"approval: running action",
|
|
);
|
|
|
|
let agent_dir = coord.ensure_runtime(&approval.agent)?;
|
|
let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent);
|
|
let applied_dir = Coordinator::agent_applied_dir(&approval.agent);
|
|
let claude_dir = Coordinator::agent_claude_dir(&approval.agent);
|
|
let notes_dir = Coordinator::agent_notes_dir(&approval.agent);
|
|
|
|
match approval.kind {
|
|
ApprovalKind::ApplyCommit => {
|
|
let (result, terminal_tag) = run_apply_commit(
|
|
&coord,
|
|
&approval,
|
|
&agent_dir,
|
|
&applied_dir,
|
|
&claude_dir,
|
|
¬es_dir,
|
|
)
|
|
.await;
|
|
finish_approval(&coord, &approval, result, terminal_tag)
|
|
}
|
|
ApprovalKind::Spawn => {
|
|
// Run the spawn in the background so the approve POST returns
|
|
// immediately. The dashboard reads `transient` to render a spinner.
|
|
// Guard is created synchronously here (so the spinner appears
|
|
// the moment the operator clicks approve) and moved into the
|
|
// task; it auto-clears even if the runtime drops the task.
|
|
let coord_bg = coord.clone();
|
|
let approval_bg = approval.clone();
|
|
let guard = coord_bg.transient_guard(&approval_bg.agent, TransientKind::Spawning);
|
|
tokio::spawn(async move {
|
|
let guard = guard;
|
|
let agent_bg = approval_bg.agent.clone();
|
|
let result = lifecycle::spawn(
|
|
&approval_bg.agent,
|
|
&coord_bg.hyperhive_flake,
|
|
&agent_dir,
|
|
&proposed_dir,
|
|
&applied_dir,
|
|
&claude_dir,
|
|
¬es_dir,
|
|
coord_bg.dashboard_port,
|
|
&coord_bg.operator_pronouns,
|
|
)
|
|
.await;
|
|
drop(guard);
|
|
if result.is_ok()
|
|
&& let Err(e) = crate::forge::ensure_user_for(&agent_bg).await
|
|
{
|
|
tracing::warn!(agent = %agent_bg, error = ?e, "forge: ensure_user after spawn failed");
|
|
}
|
|
if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) {
|
|
tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed");
|
|
}
|
|
// New container row appeared (or didn't, on failure
|
|
// before nixos-container create completed) — rescan so
|
|
// dashboards reflect the post-spawn state. Spawn can
|
|
// also consume a tombstone of the same name; emit the
|
|
// fresh list so the operator's dormant-state pane
|
|
// updates without a refetch.
|
|
coord_bg.rescan_containers_and_emit().await;
|
|
crate::dashboard::emit_tombstones_snapshot(&coord_bg).await;
|
|
});
|
|
Ok(())
|
|
}
|
|
}
|
|
}
|
|
|
|
fn finish_approval(
|
|
coord: &Coordinator,
|
|
approval: &hive_sh4re::Approval,
|
|
result: Result<()>,
|
|
terminal_tag: Option<String>,
|
|
) -> Result<()> {
|
|
let (status, note, ok) = match &result {
|
|
Ok(()) => (ApprovalStatus::Approved, None, true),
|
|
Err(e) => {
|
|
let note = format!("{e:#}");
|
|
let _ = coord.approvals.mark_failed(approval.id, ¬e);
|
|
(ApprovalStatus::Failed, Some(note), false)
|
|
}
|
|
};
|
|
coord.notify_manager(&HelperEvent::ApprovalResolved {
|
|
id: approval.id,
|
|
agent: approval.agent.clone(),
|
|
commit_ref: approval.commit_ref.clone(),
|
|
status,
|
|
note: note.clone(),
|
|
sha: approval.fetched_sha.clone(),
|
|
tag: terminal_tag.clone(),
|
|
});
|
|
// Phase 5b: also fire on the dashboard event channel so the
|
|
// browser moves the row out of pending into history without a
|
|
// snapshot refetch. `approved` rows that succeed get the
|
|
// approval's logged resolved_at indirectly via `now_unix()`;
|
|
// failures already wrote it via mark_failed above.
|
|
let approval_kind = match approval.kind {
|
|
ApprovalKind::Spawn => "spawn",
|
|
ApprovalKind::ApplyCommit => "apply_commit",
|
|
};
|
|
let sha_short = approval
|
|
.fetched_sha
|
|
.as_deref()
|
|
.map(|s| s[..s.len().min(12)].to_owned());
|
|
let status_str = if ok { "approved" } else { "failed" };
|
|
coord.emit_approval_resolved(
|
|
approval.id,
|
|
&approval.agent,
|
|
approval_kind,
|
|
sha_short,
|
|
status_str,
|
|
note.clone(),
|
|
approval.description.clone(),
|
|
);
|
|
// For spawn/rebuild approvals, also surface the underlying action so
|
|
// the manager knows whether the container actually came up. The
|
|
// ApprovalResolved event already carries the same `ok` signal but
|
|
// separating it lets the manager react to the lifecycle change
|
|
// without having to special-case approvals.
|
|
match approval.kind {
|
|
ApprovalKind::Spawn => coord.notify_manager(&HelperEvent::Spawned {
|
|
agent: approval.agent.clone(),
|
|
ok,
|
|
note,
|
|
sha: approval.fetched_sha.clone(),
|
|
}),
|
|
ApprovalKind::ApplyCommit => coord.notify_manager(&HelperEvent::Rebuilt {
|
|
agent: approval.agent.clone(),
|
|
ok,
|
|
note,
|
|
sha: approval.fetched_sha.clone(),
|
|
tag: terminal_tag,
|
|
}),
|
|
}
|
|
result
|
|
}
|
|
|
|
/// Tag-driven `ApplyCommit` handler. Walks the approval through the tag
|
|
/// state machine documented in `docs/approvals.md`: stamp
|
|
/// `approved/<id>` and `building/<id>` first so the audit trail
|
|
/// captures intent, then drop the candidate tree into the working dir
|
|
/// without moving HEAD, run the rebuild, and either fast-forward
|
|
/// `applied/main` to the proposal commit on success
|
|
/// (`deployed/<id>`) or annotate `failed/<id>` with the build error
|
|
/// and reset the working tree back to the last known-good main. main
|
|
/// never advances on a failed build, so a crash-and-recover doesn't
|
|
/// leave the agent pointing at a tree it can't evaluate.
|
|
async fn run_apply_commit(
|
|
coord: &Arc<Coordinator>,
|
|
approval: &hive_sh4re::Approval,
|
|
agent_dir: &std::path::Path,
|
|
applied_dir: &std::path::Path,
|
|
claude_dir: &std::path::Path,
|
|
notes_dir: &std::path::Path,
|
|
) -> (Result<()>, Option<String>) {
|
|
let id = approval.id;
|
|
let proposal_ref = format!("refs/tags/proposal/{id}");
|
|
|
|
// Defensive: submit-time should have planted proposal/<id>, but if
|
|
// the row was migrated from an older schema or the tag got pruned
|
|
// we fail early with a clear note rather than building a stale
|
|
// tree.
|
|
if let Err(e) = lifecycle::git_rev_parse(applied_dir, &proposal_ref).await {
|
|
return (
|
|
Err(anyhow::anyhow!(
|
|
"missing proposal tag {proposal_ref}: {e:#}"
|
|
)),
|
|
None,
|
|
);
|
|
}
|
|
|
|
// Capture the currently-deployed sha so we can roll applied/main
|
|
// (and the meta lock indirectly) back if the build fails.
|
|
let prev_main_sha = match lifecycle::git_rev_parse(applied_dir, "refs/heads/main").await {
|
|
Ok(s) => s,
|
|
Err(e) => return (Err(anyhow::anyhow!("read applied/main: {e:#}")), None),
|
|
};
|
|
|
|
if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await
|
|
{
|
|
return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None);
|
|
}
|
|
if let Err(e) = lifecycle::git_tag(applied_dir, &format!("building/{id}"), &proposal_ref).await
|
|
{
|
|
return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None);
|
|
}
|
|
|
|
// Fast-forward applied/main to proposal/<id> + sync the working
|
|
// tree. Meta input pins `?ref=main`, so this is what makes nix
|
|
// re-lock to the proposal commit on the prepare_deploy step
|
|
// below. On build failure we roll main back to prev_main_sha so
|
|
// a crash leaves the agent on its last-good tree.
|
|
if let Err(e) = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await {
|
|
return (
|
|
Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")),
|
|
None,
|
|
);
|
|
}
|
|
if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
|
|
// main is ahead; working tree didn't sync. Roll main back to
|
|
// keep the two consistent before bailing.
|
|
let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await;
|
|
return (Err(anyhow::anyhow!("read-tree to main: {e:#}")), None);
|
|
}
|
|
|
|
// Phase 1 of the meta two-phase deploy: relock without committing.
|
|
if let Err(e) = crate::meta::prepare_deploy(&approval.agent).await {
|
|
let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await;
|
|
let _ = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await;
|
|
return (Err(anyhow::anyhow!("meta prepare_deploy: {e:#}")), None);
|
|
}
|
|
|
|
// Container-level rebuild against meta#<name>.
|
|
let build_result = lifecycle::rebuild_no_meta(
|
|
&approval.agent,
|
|
agent_dir,
|
|
applied_dir,
|
|
claude_dir,
|
|
notes_dir,
|
|
)
|
|
.await;
|
|
|
|
match build_result {
|
|
Ok(()) => {
|
|
let tag = format!("deployed/{id}");
|
|
if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await {
|
|
tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed");
|
|
}
|
|
if let Err(e) = crate::meta::finalize_deploy(
|
|
&approval.agent,
|
|
approval.fetched_sha.as_deref().unwrap_or(&proposal_ref),
|
|
&tag,
|
|
)
|
|
.await
|
|
{
|
|
// The build itself succeeded — meta lock landed but
|
|
// couldn't be committed. Surface as a soft warn so the
|
|
// operator can git-commit by hand if they care.
|
|
tracing::warn!(agent = %approval.agent, %id, error = ?e, "meta finalize_deploy failed");
|
|
}
|
|
// Wake the agent on its next turn so claude sees the
|
|
// config change took effect. Same hint pattern as
|
|
// auto_update::rebuild_agent — manager approved a
|
|
// proposal, agent picks up where it left off with the
|
|
// new env / packages.
|
|
coord.kick_agent(&approval.agent, "config update applied");
|
|
(Ok(()), Some(tag))
|
|
}
|
|
Err(e) => {
|
|
let tag = format!("failed/{id}");
|
|
let body = format!("{e:#}");
|
|
if let Err(te) =
|
|
lifecycle::git_tag_annotated(applied_dir, &tag, &proposal_ref, &body).await
|
|
{
|
|
tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed");
|
|
}
|
|
// Roll main back to last known-good so the on-disk state
|
|
// matches what nixos-container last successfully built.
|
|
if let Err(re) =
|
|
lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await
|
|
{
|
|
tracing::warn!(agent = %approval.agent, %id, error = ?re, "main rollback failed");
|
|
}
|
|
if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
|
|
tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed");
|
|
}
|
|
// Drop the staged meta lock change so the deploy log
|
|
// only ever shows successes.
|
|
if let Err(ae) = crate::meta::abort_deploy().await {
|
|
tracing::warn!(agent = %approval.agent, %id, error = ?ae, "meta abort_deploy failed");
|
|
}
|
|
let _ = coord;
|
|
(Err(e), Some(tag))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Tear down a sub-agent container. By default this is non-destructive to
|
|
/// persistent state: the proposed/applied config repos and the Claude
|
|
/// credentials dir under `/var/lib/hyperhive/{agents,applied}/<name>/` are
|
|
/// kept, so recreating an agent of the same name reuses prior config + creds
|
|
/// (no re-login). The ephemeral runtime dir under `/run/hyperhive/agents/`
|
|
/// is cleared because its contents (the mcp socket) don't survive restarts
|
|
/// anyway. With `purge=true` the persistent trees are also wiped — config
|
|
/// history, claude creds, notes — there is no undo.
|
|
/// Refuses the manager (declarative; would fight with the host's nixos config).
|
|
pub async fn destroy(coord: &Arc<Coordinator>, name: &str, purge: bool) -> Result<()> {
|
|
if name == MANAGER_NAME || name == MANAGER_AGENT {
|
|
bail!("refusing to destroy the manager ({name})");
|
|
}
|
|
tracing::info!(%name, purge, "destroy");
|
|
// Guard auto-clears on the success path's final scope exit and on
|
|
// every early-return / cancellation along the way.
|
|
let _guard = coord.transient_guard(name, TransientKind::Destroying);
|
|
lifecycle::destroy(name).await?;
|
|
coord.unregister_agent(name);
|
|
let runtime = Coordinator::agent_dir(name);
|
|
if runtime.exists() {
|
|
let _ = std::fs::remove_dir_all(&runtime);
|
|
}
|
|
if purge {
|
|
for dir in [
|
|
Coordinator::agent_state_root(name),
|
|
Coordinator::agent_applied_dir(name),
|
|
] {
|
|
if dir.exists()
|
|
&& let Err(e) = std::fs::remove_dir_all(&dir)
|
|
{
|
|
tracing::warn!(error = ?e, dir = %dir.display(), "purge: remove failed");
|
|
}
|
|
}
|
|
}
|
|
// Meta flake: drop the agent's input + nixosConfiguration so a
|
|
// future spawn under the same name re-seeds cleanly, and so the
|
|
// meta lock doesn't reference a vanished applied repo. Log + keep
|
|
// going on failure — destroy already succeeded at the
|
|
// nixos-container level, the meta repo is just bookkeeping.
|
|
if let Err(e) = sync_meta_after_lifecycle(coord).await {
|
|
tracing::warn!(error = ?e, %name, "meta sync after destroy failed");
|
|
}
|
|
let _ = coord.approvals.fail_pending_for_agent(
|
|
name,
|
|
if purge {
|
|
"agent purged"
|
|
} else {
|
|
"agent destroyed"
|
|
},
|
|
);
|
|
drop(_guard);
|
|
coord.notify_manager(&HelperEvent::Destroyed {
|
|
agent: name.to_owned(),
|
|
});
|
|
// Container row disappeared — rescan so the dashboard fires
|
|
// `ContainerRemoved` for the gone row, then emit the
|
|
// tombstones snapshot (gained one on destroy, lost one on
|
|
// purge — recompute either way).
|
|
coord.rescan_containers_and_emit().await;
|
|
crate::dashboard::emit_tombstones_snapshot(coord).await;
|
|
Ok(())
|
|
}
|
|
|
|
/// Rerender the meta flake from whatever containers still exist on
|
|
/// disk. Called after lifecycle ops that change the agent set (today:
|
|
/// destroy). Idempotent — a no-op when nothing changed.
|
|
async fn sync_meta_after_lifecycle(coord: &Coordinator) -> Result<()> {
|
|
let agents = lifecycle::agents_for_meta_listing().await?;
|
|
crate::meta::sync_agents(
|
|
&coord.hyperhive_flake,
|
|
coord.dashboard_port,
|
|
&coord.operator_pronouns,
|
|
&agents,
|
|
)
|
|
.await
|
|
}
|
|
|
|
pub async fn deny(coord: &Coordinator, id: i64, note: Option<&str>) -> Result<()> {
|
|
let approval = coord.approvals.get(id)?;
|
|
coord.approvals.mark_denied(id, note)?;
|
|
tracing::info!(%id, note, "approval denied");
|
|
let mut tag = None;
|
|
if let Some(a) = approval {
|
|
let sha = a.fetched_sha.clone();
|
|
// ApplyCommit approvals leave a `denied/<id>` tag on the
|
|
// proposal commit so rejected configs are first-class git
|
|
// objects — `git show denied/<id>` in the manager's applied
|
|
// mount yields both the tree the operator rejected and (in
|
|
// the annotated body) the reason. Spawn approvals have no
|
|
// commit to tag, so they fall through unannotated.
|
|
if matches!(a.kind, ApprovalKind::ApplyCommit) {
|
|
let applied_dir = Coordinator::agent_applied_dir(&a.agent);
|
|
let proposal_ref = format!("refs/tags/proposal/{id}");
|
|
if lifecycle::git_rev_parse(&applied_dir, &proposal_ref)
|
|
.await
|
|
.is_ok()
|
|
{
|
|
let tag_name = format!("denied/{id}");
|
|
let body = note.unwrap_or("").to_owned();
|
|
if let Err(e) =
|
|
lifecycle::git_tag_annotated(&applied_dir, &tag_name, &proposal_ref, &body)
|
|
.await
|
|
{
|
|
tracing::warn!(%id, error = ?e, "plant denied tag failed");
|
|
} else {
|
|
tag = Some(tag_name);
|
|
}
|
|
}
|
|
}
|
|
let approval_kind = match a.kind {
|
|
ApprovalKind::Spawn => "spawn",
|
|
ApprovalKind::ApplyCommit => "apply_commit",
|
|
};
|
|
let sha_short = sha.as_deref().map(|s| s[..s.len().min(12)].to_owned());
|
|
let description = a.description.clone();
|
|
let agent_owned = a.agent.clone();
|
|
coord.notify_manager(&HelperEvent::ApprovalResolved {
|
|
id: a.id,
|
|
agent: a.agent,
|
|
commit_ref: a.commit_ref,
|
|
status: ApprovalStatus::Denied,
|
|
note: note.map(String::from),
|
|
sha,
|
|
tag,
|
|
});
|
|
coord.emit_approval_resolved(
|
|
id,
|
|
&agent_owned,
|
|
approval_kind,
|
|
sha_short,
|
|
"denied",
|
|
note.map(String::from),
|
|
description,
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|