actions: tag-driven approve(ApplyCommit) flow

run_apply_commit walks the approval through the tag state
machine in applied: approved/<id> + building/<id> stamped
before the build, then git read-tree --reset to proposal/<id>
populates the working dir without moving HEAD. on rebuild
success deployed/<id> is planted and refs/heads/main fast-
forwards to the proposal. on failure failed/<id> is annotated
with the build error and the working tree resets back to main
so the agent stays evaluable. helper events Rebuilt +
ApprovalResolved both carry the terminal tag so the manager
can git-show the exact tree (and read the failure note from
an annotated tag) against its read-only applied.git mount.
finish_approval grows a terminal_tag param; spawn path passes
None. lifecycle::apply_commit deleted.
This commit is contained in:
müde 2026-05-15 23:00:01 +02:00
parent 35b0edaf27
commit 315d4289c7
2 changed files with 110 additions and 35 deletions

View file

@ -41,21 +41,16 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
match approval.kind {
ApprovalKind::ApplyCommit => {
let result = async {
lifecycle::apply_commit(&applied_dir, &proposed_dir, &approval.commit_ref).await?;
lifecycle::rebuild(
&approval.agent,
&coord.hyperhive_flake,
&agent_dir,
&applied_dir,
&claude_dir,
&notes_dir,
coord.dashboard_port,
)
.await
}
let (result, terminal_tag) = run_apply_commit(
&coord,
&approval,
&agent_dir,
&applied_dir,
&claude_dir,
&notes_dir,
)
.await;
finish_approval(&coord, &approval, result)
finish_approval(&coord, &approval, result, terminal_tag)
}
ApprovalKind::Spawn => {
// Run the spawn in the background so the approve POST returns
@ -77,7 +72,7 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
)
.await;
coord_bg.clear_transient(&agent_bg);
if let Err(e) = finish_approval(&coord_bg, &approval_bg, result) {
if let Err(e) = finish_approval(&coord_bg, &approval_bg, result, None) {
tracing::warn!(agent = %agent_bg, error = ?e, "spawn approval failed");
}
});
@ -90,6 +85,7 @@ fn finish_approval(
coord: &Coordinator,
approval: &hive_sh4re::Approval,
result: Result<()>,
terminal_tag: Option<String>,
) -> Result<()> {
let (status, note, ok) = match &result {
Ok(()) => (ApprovalStatus::Approved, None, true),
@ -106,7 +102,7 @@ fn finish_approval(
status,
note: note.clone(),
sha: approval.fetched_sha.clone(),
tag: None,
tag: terminal_tag.clone(),
});
// For spawn/rebuild approvals, also surface the underlying action so
// the manager knows whether the container actually came up. The
@ -125,12 +121,109 @@ fn finish_approval(
ok,
note,
sha: approval.fetched_sha.clone(),
tag: None,
tag: terminal_tag,
}),
}
result
}
/// Tag-driven ApplyCommit handler. Walks the approval through the tag
/// state machine documented in `docs/approvals.md`: stamp `approved/<id>`
/// + `building/<id>` first so the audit trail captures intent, then
/// drop the candidate tree into the working dir without moving HEAD,
/// run the rebuild, and either fast-forward `applied/main` to the
/// proposal commit on success (`deployed/<id>`) or annotate
/// `failed/<id>` with the build error and reset the working tree back
/// to the last known-good main. main never advances on a failed
/// build, so a crash-and-recover doesn't leave the agent pointing at
/// a tree it can't evaluate.
async fn run_apply_commit(
coord: &Arc<Coordinator>,
approval: &hive_sh4re::Approval,
agent_dir: &std::path::Path,
applied_dir: &std::path::Path,
claude_dir: &std::path::Path,
notes_dir: &std::path::Path,
) -> (Result<()>, Option<String>) {
let id = approval.id;
let proposal_ref = format!("refs/tags/proposal/{id}");
// Defensive: submit-time should have planted proposal/<id>, but if
// the row was migrated from an older schema or the tag got pruned
// we fail early with a clear note rather than building a stale
// tree.
if let Err(e) = lifecycle::git_rev_parse(applied_dir, &proposal_ref).await {
return (
Err(anyhow::anyhow!(
"missing proposal tag {proposal_ref}: {e:#}"
)),
None,
);
}
if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await
{
return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None);
}
if let Err(e) = lifecycle::git_tag(applied_dir, &format!("building/{id}"), &proposal_ref).await
{
return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None);
}
if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, &proposal_ref).await {
return (
Err(anyhow::anyhow!("read-tree to {proposal_ref}: {e:#}")),
None,
);
}
let rebuild_result = lifecycle::rebuild(
&approval.agent,
&coord.hyperhive_flake,
agent_dir,
applied_dir,
claude_dir,
notes_dir,
coord.dashboard_port,
)
.await;
match rebuild_result {
Ok(()) => {
let tag = format!("deployed/{id}");
if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await {
tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed");
}
if let Err(e) =
lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await
{
// Working tree already matches proposal/<id>, but main
// didn't advance — surface as a build failure so the
// operator notices the desync.
return (
Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")),
Some(tag),
);
}
(Ok(()), Some(tag))
}
Err(e) => {
let tag = format!("failed/{id}");
let body = format!("{e:#}");
if let Err(te) =
lifecycle::git_tag_annotated(applied_dir, &tag, &proposal_ref, &body).await
{
tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed");
}
// Roll working tree back to last known-good main so the
// on-disk state matches what nixos-container last
// successfully built. main hasn't moved, so this is just
// a content reset.
if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed");
}
(Err(e), Some(tag))
}
}
}
/// Tear down a sub-agent container. By default this is non-destructive to
/// persistent state: the proposed/applied config repos and the Claude
/// credentials dir under `/var/lib/hyperhive/{agents,applied}/<name>/` are