//! Operations that are exposed through more than one surface (the host admin //! socket *and* the dashboard's POST endpoints). Each function takes a //! `&Coordinator` and the request parameters; callers stitch the response //! shape they want (HTTP redirect vs JSON). use std::sync::Arc; use anyhow::{Result, bail}; use hive_sh4re::{ApprovalKind, ApprovalStatus, HelperEvent, MANAGER_AGENT}; use crate::coordinator::{Coordinator, TransientKind}; use crate::lifecycle::{self, MANAGER_NAME}; /// Approve a pending request and run the underlying action. Dispatches on the /// approval kind: /// - `ApplyCommit`: read agent.nix at the approval's commit from the proposed /// repo, copy into the applied repo, commit there, rebuild the container. /// Synchronous — returns once the rebuild completes. /// - `Spawn`: create + start a brand-new sub-agent container. Runs in a /// background task so the operator's approve click returns immediately; /// the dashboard surfaces a transient `Spawning` state until the container /// is up. On failure, the approval is marked failed. /// /// In all cases an `ApprovalResolved` helper event lands in the manager's /// inbox when the work resolves. pub async fn approve(coord: Arc, id: i64) -> Result<()> { let approval = coord.approvals.mark_approved(id)?; tracing::info!( %approval.id, %approval.agent, kind = ?approval.kind, %approval.commit_ref, "approval: running action", ); let agent_dir = coord.ensure_runtime(&approval.agent)?; let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent); let applied_dir = Coordinator::agent_applied_dir(&approval.agent); let claude_dir = Coordinator::agent_claude_dir(&approval.agent); let notes_dir = Coordinator::agent_notes_dir(&approval.agent); match approval.kind { ApprovalKind::ApplyCommit => { let (result, terminal_tag) = run_apply_commit( &coord, &approval, &agent_dir, &applied_dir, &claude_dir, ¬es_dir, ) .await; finish_approval(&coord, &approval, result, terminal_tag) } ApprovalKind::Spawn => { // Run the spawn in the background so the approve POST returns // immediately. The dashboard reads `transient` to render a spinner. // Guard is created synchronously here (so the spinner appears // the moment the operator clicks approve) and moved into the // task; it auto-clears even if the runtime drops the task. let coord_bg = coord.clone(); let approval_bg = approval.clone(); let guard = coord_bg.transient_guard(&approval_bg.agent, TransientKind::Spawning); tokio::spawn(async move { let guard = guard; let agent_bg = approval_bg.agent.clone(); let result = lifecycle::spawn( &approval_bg.agent, &coord_bg.hyperhive_flake, &agent_dir, &proposed_dir, &applied_dir, &claude_dir, ¬es_dir, coord_bg.dashboard_port, &coord_bg.operator_pronouns, ) .await; drop(guard); if result.is_ok() && let Err(e) = crate::forge::ensure_user_for(&agent_bg).await { tracing::warn!(agent = %agent_bg, error = ?e, "forge: ensure_user after spawn failed"); } if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) { tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed"); } }); Ok(()) } } } fn finish_approval( coord: &Coordinator, approval: &hive_sh4re::Approval, result: Result<()>, terminal_tag: Option, ) -> Result<()> { let (status, note, ok) = match &result { Ok(()) => (ApprovalStatus::Approved, None, true), Err(e) => { let note = format!("{e:#}"); let _ = coord.approvals.mark_failed(approval.id, ¬e); (ApprovalStatus::Failed, Some(note), false) } }; coord.notify_manager(&HelperEvent::ApprovalResolved { id: approval.id, agent: approval.agent.clone(), commit_ref: approval.commit_ref.clone(), status, note: note.clone(), sha: approval.fetched_sha.clone(), tag: terminal_tag.clone(), }); // For spawn/rebuild approvals, also surface the underlying action so // the manager knows whether the container actually came up. The // ApprovalResolved event already carries the same `ok` signal but // separating it lets the manager react to the lifecycle change // without having to special-case approvals. match approval.kind { ApprovalKind::Spawn => coord.notify_manager(&HelperEvent::Spawned { agent: approval.agent.clone(), ok, note, sha: approval.fetched_sha.clone(), }), ApprovalKind::ApplyCommit => coord.notify_manager(&HelperEvent::Rebuilt { agent: approval.agent.clone(), ok, note, sha: approval.fetched_sha.clone(), tag: terminal_tag, }), } result } /// Tag-driven `ApplyCommit` handler. Walks the approval through the tag /// state machine documented in `docs/approvals.md`: stamp /// `approved/` and `building/` first so the audit trail /// captures intent, then drop the candidate tree into the working dir /// without moving HEAD, run the rebuild, and either fast-forward /// `applied/main` to the proposal commit on success /// (`deployed/`) or annotate `failed/` with the build error /// and reset the working tree back to the last known-good main. main /// never advances on a failed build, so a crash-and-recover doesn't /// leave the agent pointing at a tree it can't evaluate. async fn run_apply_commit( coord: &Arc, approval: &hive_sh4re::Approval, agent_dir: &std::path::Path, applied_dir: &std::path::Path, claude_dir: &std::path::Path, notes_dir: &std::path::Path, ) -> (Result<()>, Option) { let id = approval.id; let proposal_ref = format!("refs/tags/proposal/{id}"); // Defensive: submit-time should have planted proposal/, but if // the row was migrated from an older schema or the tag got pruned // we fail early with a clear note rather than building a stale // tree. if let Err(e) = lifecycle::git_rev_parse(applied_dir, &proposal_ref).await { return ( Err(anyhow::anyhow!( "missing proposal tag {proposal_ref}: {e:#}" )), None, ); } // Capture the currently-deployed sha so we can roll applied/main // (and the meta lock indirectly) back if the build fails. let prev_main_sha = match lifecycle::git_rev_parse(applied_dir, "refs/heads/main").await { Ok(s) => s, Err(e) => return (Err(anyhow::anyhow!("read applied/main: {e:#}")), None), }; if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await { return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None); } if let Err(e) = lifecycle::git_tag(applied_dir, &format!("building/{id}"), &proposal_ref).await { return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None); } // Fast-forward applied/main to proposal/ + sync the working // tree. Meta input pins `?ref=main`, so this is what makes nix // re-lock to the proposal commit on the prepare_deploy step // below. On build failure we roll main back to prev_main_sha so // a crash leaves the agent on its last-good tree. if let Err(e) = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await { return ( Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")), None, ); } if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await { // main is ahead; working tree didn't sync. Roll main back to // keep the two consistent before bailing. let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await; return ( Err(anyhow::anyhow!("read-tree to main: {e:#}")), None, ); } // Phase 1 of the meta two-phase deploy: relock without committing. if let Err(e) = crate::meta::prepare_deploy(&approval.agent).await { let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await; let _ = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await; return ( Err(anyhow::anyhow!("meta prepare_deploy: {e:#}")), None, ); } // Container-level rebuild against meta#. let build_result = lifecycle::rebuild_no_meta( &approval.agent, agent_dir, applied_dir, claude_dir, notes_dir, ) .await; match build_result { Ok(()) => { let tag = format!("deployed/{id}"); if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await { tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed"); } if let Err(e) = crate::meta::finalize_deploy( &approval.agent, approval.fetched_sha.as_deref().unwrap_or(&proposal_ref), &tag, ) .await { // The build itself succeeded — meta lock landed but // couldn't be committed. Surface as a soft warn so the // operator can git-commit by hand if they care. tracing::warn!(agent = %approval.agent, %id, error = ?e, "meta finalize_deploy failed"); } // Wake the agent on its next turn so claude sees the // config change took effect. Same hint pattern as // auto_update::rebuild_agent — manager approved a // proposal, agent picks up where it left off with the // new env / packages. coord.kick_agent(&approval.agent, "config update applied"); (Ok(()), Some(tag)) } Err(e) => { let tag = format!("failed/{id}"); let body = format!("{e:#}"); if let Err(te) = lifecycle::git_tag_annotated(applied_dir, &tag, &proposal_ref, &body).await { tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed"); } // Roll main back to last known-good so the on-disk state // matches what nixos-container last successfully built. if let Err(re) = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await { tracing::warn!(agent = %approval.agent, %id, error = ?re, "main rollback failed"); } if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await { tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed"); } // Drop the staged meta lock change so the deploy log // only ever shows successes. if let Err(ae) = crate::meta::abort_deploy().await { tracing::warn!(agent = %approval.agent, %id, error = ?ae, "meta abort_deploy failed"); } let _ = coord; (Err(e), Some(tag)) } } } /// Tear down a sub-agent container. By default this is non-destructive to /// persistent state: the proposed/applied config repos and the Claude /// credentials dir under `/var/lib/hyperhive/{agents,applied}//` are /// kept, so recreating an agent of the same name reuses prior config + creds /// (no re-login). The ephemeral runtime dir under `/run/hyperhive/agents/` /// is cleared because its contents (the mcp socket) don't survive restarts /// anyway. With `purge=true` the persistent trees are also wiped — config /// history, claude creds, notes — there is no undo. /// Refuses the manager (declarative; would fight with the host's nixos config). pub async fn destroy(coord: &Arc, name: &str, purge: bool) -> Result<()> { if name == MANAGER_NAME || name == MANAGER_AGENT { bail!("refusing to destroy the manager ({name})"); } tracing::info!(%name, purge, "destroy"); // Guard auto-clears on the success path's final scope exit and on // every early-return / cancellation along the way. let _guard = coord.transient_guard(name, TransientKind::Destroying); lifecycle::destroy(name).await?; coord.unregister_agent(name); let runtime = Coordinator::agent_dir(name); if runtime.exists() { let _ = std::fs::remove_dir_all(&runtime); } if purge { for dir in [ Coordinator::agent_state_root(name), Coordinator::agent_applied_dir(name), ] { if dir.exists() && let Err(e) = std::fs::remove_dir_all(&dir) { tracing::warn!(error = ?e, dir = %dir.display(), "purge: remove failed"); } } } // Meta flake: drop the agent's input + nixosConfiguration so a // future spawn under the same name re-seeds cleanly, and so the // meta lock doesn't reference a vanished applied repo. Log + keep // going on failure — destroy already succeeded at the // nixos-container level, the meta repo is just bookkeeping. if let Err(e) = sync_meta_after_lifecycle(coord).await { tracing::warn!(error = ?e, %name, "meta sync after destroy failed"); } let _ = coord.approvals.fail_pending_for_agent( name, if purge { "agent purged" } else { "agent destroyed" }, ); drop(_guard); coord.notify_manager(&HelperEvent::Destroyed { agent: name.to_owned(), }); Ok(()) } /// Rerender the meta flake from whatever containers still exist on /// disk. Called after lifecycle ops that change the agent set (today: /// destroy). Idempotent — a no-op when nothing changed. async fn sync_meta_after_lifecycle(coord: &Coordinator) -> Result<()> { let agents = lifecycle::agents_for_meta_listing().await?; crate::meta::sync_agents( &coord.hyperhive_flake, coord.dashboard_port, &coord.operator_pronouns, &agents, ) .await } pub async fn deny(coord: &Coordinator, id: i64, note: Option<&str>) -> Result<()> { let approval = coord.approvals.get(id)?; coord.approvals.mark_denied(id, note)?; tracing::info!(%id, note, "approval denied"); let mut tag = None; if let Some(a) = approval { let sha = a.fetched_sha.clone(); // ApplyCommit approvals leave a `denied/` tag on the // proposal commit so rejected configs are first-class git // objects — `git show denied/` in the manager's applied // mount yields both the tree the operator rejected and (in // the annotated body) the reason. Spawn approvals have no // commit to tag, so they fall through unannotated. if matches!(a.kind, ApprovalKind::ApplyCommit) { let applied_dir = Coordinator::agent_applied_dir(&a.agent); let proposal_ref = format!("refs/tags/proposal/{id}"); if lifecycle::git_rev_parse(&applied_dir, &proposal_ref) .await .is_ok() { let tag_name = format!("denied/{id}"); let body = note.unwrap_or("").to_owned(); if let Err(e) = lifecycle::git_tag_annotated( &applied_dir, &tag_name, &proposal_ref, &body, ) .await { tracing::warn!(%id, error = ?e, "plant denied tag failed"); } else { tag = Some(tag_name); } } } coord.notify_manager(&HelperEvent::ApprovalResolved { id: a.id, agent: a.agent, commit_ref: a.commit_ref, status: ApprovalStatus::Denied, note: note.map(String::from), sha, tag, }); } Ok(()) }