hyperhive/hive-c0re/src/actions.rs

//! Operations that are exposed through more than one surface (the host admin
//! socket *and* the dashboard's POST endpoints). Each function takes the
//! `Coordinator` (by reference or as an `Arc`) and the request parameters;
//! callers stitch the response shape they want (HTTP redirect vs JSON).

use std::sync::Arc;

use anyhow::{Result, bail};
use hive_sh4re::{ApprovalKind, ApprovalStatus, HelperEvent, MANAGER_AGENT};

use crate::coordinator::{Coordinator, TransientKind};
use crate::lifecycle::{self, MANAGER_NAME};

/// Approve a pending request and run the underlying action. Dispatches on the
/// approval kind:
/// - `ApplyCommit`: read agent.nix at the approval's commit from the proposed
///   repo, copy into the applied repo, commit there, rebuild the container.
///   Synchronous — returns once the rebuild completes.
/// - `Spawn`: create + start a brand-new sub-agent container. Runs in a
///   background task so the operator's approve click returns immediately;
///   the dashboard surfaces a transient `Spawning` state until the container
///   is up. On failure, the approval is marked failed.
///
/// In all cases an `ApprovalResolved` helper event lands in the manager's
/// inbox when the work resolves.
pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
    let approval = coord.approvals.mark_approved(id)?;
    tracing::info!(
        %approval.id,
        %approval.agent,
        kind = ?approval.kind,
        %approval.commit_ref,
        "approval: running action",
    );

    let agent_dir = coord.ensure_runtime(&approval.agent)?;
    let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent);
    let applied_dir = Coordinator::agent_applied_dir(&approval.agent);
    let claude_dir = Coordinator::agent_claude_dir(&approval.agent);
    let notes_dir = Coordinator::agent_notes_dir(&approval.agent);

    match approval.kind {
        ApprovalKind::ApplyCommit => {
            let (result, terminal_tag) = run_apply_commit(
                &coord,
                &approval,
                &agent_dir,
                &applied_dir,
                &claude_dir,
                &notes_dir,
            )
            .await;
            finish_approval(&coord, &approval, result, terminal_tag)
        }
        ApprovalKind::Spawn => {
            // Run the spawn in the background so the approve POST returns
            // immediately. The dashboard reads `transient` to render a spinner.
            coord.set_transient(&approval.agent, TransientKind::Spawning);
            let coord_bg = coord.clone();
            let approval_bg = approval.clone();
            tokio::spawn(async move {
                let agent_bg = approval_bg.agent.clone();
                let result = lifecycle::spawn(
                    &approval_bg.agent,
                    &coord_bg.hyperhive_flake,
                    &agent_dir,
                    &proposed_dir,
                    &applied_dir,
                    &claude_dir,
                    &notes_dir,
                    coord_bg.dashboard_port,
                )
                .await;
                coord_bg.clear_transient(&agent_bg);
                if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) {
                    tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed");
                }
            });
            Ok(())
        }
    }
}

fn finish_approval(
    coord: &Coordinator,
    approval: &hive_sh4re::Approval,
    result: Result<()>,
    terminal_tag: Option<String>,
) -> Result<()> {
    let (status, note, ok) = match &result {
        Ok(()) => (ApprovalStatus::Approved, None, true),
        Err(e) => {
            let note = format!("{e:#}");
            let _ = coord.approvals.mark_failed(approval.id, &note);
            (ApprovalStatus::Failed, Some(note), false)
        }
    };
    coord.notify_manager(&HelperEvent::ApprovalResolved {
        id: approval.id,
        agent: approval.agent.clone(),
        commit_ref: approval.commit_ref.clone(),
        status,
        note: note.clone(),
        sha: approval.fetched_sha.clone(),
        tag: terminal_tag.clone(),
    });
    // For spawn/rebuild approvals, also surface the underlying action so
    // the manager knows whether the container actually came up. The
    // ApprovalResolved event already carries the same `ok` signal but
    // separating it lets the manager react to the lifecycle change
    // without having to special-case approvals.
    match approval.kind {
        ApprovalKind::Spawn => coord.notify_manager(&HelperEvent::Spawned {
            agent: approval.agent.clone(),
            ok,
            note,
            sha: approval.fetched_sha.clone(),
        }),
        ApprovalKind::ApplyCommit => coord.notify_manager(&HelperEvent::Rebuilt {
            agent: approval.agent.clone(),
            ok,
            note,
            sha: approval.fetched_sha.clone(),
            tag: terminal_tag,
        }),
    }
    result
}

/// Tag-driven `ApplyCommit` handler. Walks the approval through the tag
/// state machine documented in `docs/approvals.md`: stamp
/// `approved/<id>` and `building/<id>` first so the audit trail
/// captures intent, then drop the candidate tree into the working dir
/// without moving HEAD, run the rebuild, and either fast-forward
/// `applied/main` to the proposal commit on success
/// (`deployed/<id>`) or annotate `failed/<id>` with the build error
/// and reset the working tree back to the last known-good main. main
/// never advances on a failed build, so a crash-and-recover doesn't
/// leave the agent pointing at a tree it can't evaluate.
async fn run_apply_commit(
    coord: &Arc<Coordinator>,
    approval: &hive_sh4re::Approval,
    agent_dir: &std::path::Path,
    applied_dir: &std::path::Path,
    claude_dir: &std::path::Path,
    notes_dir: &std::path::Path,
) -> (Result<()>, Option<String>) {
    let id = approval.id;
    let proposal_ref = format!("refs/tags/proposal/{id}");

    // Defensive: submit-time should have planted proposal/<id>, but if
    // the row was migrated from an older schema or the tag got pruned
    // we fail early with a clear note rather than building a stale
    // tree.
    if let Err(e) = lifecycle::git_rev_parse(applied_dir, &proposal_ref).await {
        return (
            Err(anyhow::anyhow!(
                "missing proposal tag {proposal_ref}: {e:#}"
            )),
            None,
        );
    }

    // Capture the currently-deployed sha so we can roll applied/main
    // (and the meta lock indirectly) back if the build fails.
    let prev_main_sha = match lifecycle::git_rev_parse(applied_dir, "refs/heads/main").await {
        Ok(s) => s,
        Err(e) => return (Err(anyhow::anyhow!("read applied/main: {e:#}")), None),
    };

    if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await
    {
        return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None);
    }
    if let Err(e) = lifecycle::git_tag(applied_dir, &format!("building/{id}"), &proposal_ref).await
    {
        return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None);
    }

    // Fast-forward applied/main to proposal/<id> + sync the working
    // tree. Meta input pins `?ref=main`, so this is what makes nix
    // re-lock to the proposal commit on the prepare_deploy step
    // below. On build failure we roll main back to prev_main_sha so
    // a crash leaves the agent on its last-good tree.
    if let Err(e) =
        lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await
    {
        return (
            Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")),
            None,
        );
    }
    if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
        // main is ahead; working tree didn't sync. Roll main back to
        // keep the two consistent before bailing.
        let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await;
        return (
            Err(anyhow::anyhow!("read-tree to main: {e:#}")),
            None,
        );
    }

    // Phase 1 of the meta two-phase deploy: relock without committing.
    if let Err(e) = crate::meta::prepare_deploy(&approval.agent).await {
        let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await;
        let _ = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await;
        return (
            Err(anyhow::anyhow!("meta prepare_deploy: {e:#}")),
            None,
        );
    }

    // Container-level rebuild against meta#<name>.
    let build_result = lifecycle::rebuild_no_meta(
        &approval.agent,
        agent_dir,
        applied_dir,
        claude_dir,
        notes_dir,
    )
    .await;

    match build_result {
        Ok(()) => {
            let tag = format!("deployed/{id}");
            if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await {
                tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed");
            }
            if let Err(e) = crate::meta::finalize_deploy(
                &approval.agent,
                approval.fetched_sha.as_deref().unwrap_or(&proposal_ref),
                &tag,
            )
            .await
            {
                // The build itself succeeded — meta lock landed but
                // couldn't be committed. Surface as a soft warn so the
                // operator can git-commit by hand if they care.
                tracing::warn!(agent = %approval.agent, %id, error = ?e, "meta finalize_deploy failed");
            }
            // Don't ignore the coord pointer — keeps the borrow alive
            // for future tracing additions without re-plumbing.
            let _ = coord;
            (Ok(()), Some(tag))
        }
        Err(e) => {
            let tag = format!("failed/{id}");
            let body = format!("{e:#}");
            if let Err(te) =
                lifecycle::git_tag_annotated(applied_dir, &tag, &proposal_ref, &body).await
            {
                tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed");
            }
            // Roll main back to last known-good so the on-disk state
            // matches what nixos-container last successfully built.
            if let Err(re) =
                lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await
            {
                tracing::warn!(agent = %approval.agent, %id, error = ?re, "main rollback failed");
            }
            if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
                tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed");
            }
            // Drop the staged meta lock change so the deploy log
            // only ever shows successes.
            if let Err(ae) = crate::meta::abort_deploy().await {
                tracing::warn!(agent = %approval.agent, %id, error = ?ae, "meta abort_deploy failed");
            }
            let _ = coord;
            (Err(e), Some(tag))
        }
    }
}

/// Tear down a sub-agent container. By default this is non-destructive to
/// persistent state: the proposed/applied config repos and the Claude
/// credentials dir under `/var/lib/hyperhive/{agents,applied}/<name>/` are
/// kept, so recreating an agent of the same name reuses prior config + creds
/// (no re-login). The ephemeral runtime dir under `/run/hyperhive/agents/`
/// is cleared because its contents (the mcp socket) don't survive restarts
/// anyway. With `purge=true` the persistent trees are also wiped — config
/// history, claude creds, notes — there is no undo.
/// Refuses the manager (declarative; would fight with the host's nixos config).
pub async fn destroy(coord: &Coordinator, name: &str, purge: bool) -> Result<()> {
    if name == MANAGER_NAME || name == MANAGER_AGENT {
        bail!("refusing to destroy the manager ({name})");
    }
    tracing::info!(%name, purge, "destroy");
    coord.set_transient(name, TransientKind::Destroying);
    let result = lifecycle::destroy(name).await;
    if result.is_err() {
        coord.clear_transient(name);
    }
    result?;
    coord.unregister_agent(name);
    let runtime = Coordinator::agent_dir(name);
    if runtime.exists() {
        let _ = std::fs::remove_dir_all(&runtime);
    }
    if purge {
        for dir in [
            Coordinator::agent_state_root(name),
            Coordinator::agent_applied_dir(name),
        ] {
            if dir.exists()
                && let Err(e) = std::fs::remove_dir_all(&dir)
            {
                tracing::warn!(error = ?e, dir = %dir.display(), "purge: remove failed");
            }
        }
    }
    // Meta flake: drop the agent's input + nixosConfiguration so a
    // future spawn under the same name re-seeds cleanly, and so the
    // meta lock doesn't reference a vanished applied repo. Log + keep
    // going on failure — destroy already succeeded at the
    // nixos-container level, the meta repo is just bookkeeping.
    if let Err(e) = sync_meta_after_lifecycle(coord).await {
        tracing::warn!(error = ?e, %name, "meta sync after destroy failed");
    }
    let _ = coord.approvals.fail_pending_for_agent(
        name,
        if purge {
            "agent purged"
        } else {
            "agent destroyed"
        },
    );
    coord.clear_transient(name);
    coord.notify_manager(&HelperEvent::Destroyed {
        agent: name.to_owned(),
    });
    Ok(())
}

/// Re-sync the meta flake with the current agent set.
///
/// Fetches the agent listing from `lifecycle::agents_for_meta_listing` and
/// hands it, together with the coordinator's flake reference and dashboard
/// port, to `meta::sync_agents`. Called after lifecycle operations that
/// change which agents exist (today: destroy). Expected to be a no-op when
/// the agent set is unchanged — that property lives in `meta::sync_agents`.
///
/// # Errors
///
/// Propagates a failure from either the listing or the sync step.
async fn sync_meta_after_lifecycle(coord: &Coordinator) -> Result<()> {
    let listing = lifecycle::agents_for_meta_listing().await?;
    let flake = &coord.hyperhive_flake;
    let port = coord.dashboard_port;
    crate::meta::sync_agents(flake, port, &listing).await
}

pub async fn deny(coord: &Coordinator, id: i64, note: Option<&str>) -> Result<()> {
    let approval = coord.approvals.get(id)?;
    coord.approvals.mark_denied(id, note)?;
    tracing::info!(%id, note, "approval denied");
    let mut tag = None;
    if let Some(a) = approval {
        let sha = a.fetched_sha.clone();
        // ApplyCommit approvals leave a `denied/<id>` tag on the
        // proposal commit so rejected configs are first-class git
        // objects — `git show denied/<id>` in the manager's applied
        // mount yields both the tree the operator rejected and (in
        // the annotated body) the reason. Spawn approvals have no
        // commit to tag, so they fall through unannotated.
        if matches!(a.kind, ApprovalKind::ApplyCommit) {
            let applied_dir = Coordinator::agent_applied_dir(&a.agent);
            let proposal_ref = format!("refs/tags/proposal/{id}");
            if lifecycle::git_rev_parse(&applied_dir, &proposal_ref)
                .await
                .is_ok()
            {
                let tag_name = format!("denied/{id}");
                let body = note.unwrap_or("").to_owned();
                if let Err(e) = lifecycle::git_tag_annotated(
                    &applied_dir,
                    &tag_name,
                    &proposal_ref,
                    &body,
                )
                .await
                {
                    tracing::warn!(%id, error = ?e, "plant denied tag failed");
                } else {
                    tag = Some(tag_name);
                }
            }
        }
        coord.notify_manager(&HelperEvent::ApprovalResolved {
            id: a.id,
            agent: a.agent,
            commit_ref: a.commit_ref,
            status: ApprovalStatus::Denied,
            note: note.map(String::from),
            sha,
            tag,
        });
    }
    Ok(())
}