From 06fdbac1ac13b7e3ec15eccb67ce871d3c059c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Sat, 16 May 2026 00:32:16 +0200 Subject: [PATCH] actions::run_apply_commit through meta two-phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit approval-driven deploys now walk the meta flake via prepare_deploy / finalize_deploy / abort_deploy so a failed build leaves no commit in meta's deploy log: 1. capture applied/main sha for rollback 2. tag approved/<id> + building/<id> 3. ff applied/main to proposal/<id>, read-tree sync working tree 4. meta::prepare_deploy(name) — nix flake lock --update-input agent-<name> without committing 5. lifecycle::rebuild_no_meta — container-level only (new extracted helper; public lifecycle::rebuild still wraps it with single-phase meta sync + commit for dashboard / auto_update callers that don't care about rollback) 6a. on success: tag deployed/<id>, meta::finalize_deploy commits the staged lock with 'deploy deployed/<id> ' 6b. on failure: tag failed/<id> annotated with the build error, git_update_ref applied/main back to prev sha, read-tree to main, meta::abort_deploy git-restores flake.lock meta's git log now records only successful deploys; failures + denials still live in applied as annotated tags. 
--- hive-c0re/src/actions.rs | 86 +++++++++++++++++++++++++++++--------- hive-c0re/src/lifecycle.rs | 35 +++++++++++----- 2 files changed, 92 insertions(+), 29 deletions(-) diff --git a/hive-c0re/src/actions.rs b/hive-c0re/src/actions.rs index bfa7904..4596743 100644 --- a/hive-c0re/src/actions.rs +++ b/hive-c0re/src/actions.rs @@ -147,6 +147,7 @@ async fn run_apply_commit( ) -> (Result<()>, Option) { let id = approval.id; let proposal_ref = format!("refs/tags/proposal/{id}"); + // Defensive: submit-time should have planted proposal/, but if // the row was migrated from an older schema or the tag got pruned // we fail early with a clear note rather than building a stale @@ -159,6 +160,14 @@ async fn run_apply_commit( None, ); } + + // Capture the currently-deployed sha so we can roll applied/main + // (and the meta lock indirectly) back if the build fails. + let prev_main_sha = match lifecycle::git_rev_parse(applied_dir, "refs/heads/main").await { + Ok(s) => s, + Err(e) => return (Err(anyhow::anyhow!("read applied/main: {e:#}")), None), + }; + if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await { return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None); @@ -167,41 +176,71 @@ async fn run_apply_commit( { return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None); } - if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, &proposal_ref).await { + + // Fast-forward applied/main to proposal/ + sync the working + // tree. Meta input pins `?ref=main`, so this is what makes nix + // re-lock to the proposal commit on the prepare_deploy step + // below. On build failure we roll main back to prev_main_sha so + // a crash leaves the agent on its last-good tree. 
+ if let Err(e) = + lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await + { return ( - Err(anyhow::anyhow!("read-tree to {proposal_ref}: {e:#}")), + Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")), + None, + ); + } + if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await { + // main is ahead; working tree didn't sync. Roll main back to + // keep the two consistent before bailing. + let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await; + return ( + Err(anyhow::anyhow!("read-tree to main: {e:#}")), None, ); } - let rebuild_result = lifecycle::rebuild( + // Phase 1 of the meta two-phase deploy: relock without committing. + if let Err(e) = crate::meta::prepare_deploy(&approval.agent).await { + let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await; + let _ = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await; + return ( + Err(anyhow::anyhow!("meta prepare_deploy: {e:#}")), + None, + ); + } + + // Container-level rebuild against meta#. + let build_result = lifecycle::rebuild_no_meta( &approval.agent, - &coord.hyperhive_flake, agent_dir, applied_dir, claude_dir, notes_dir, - coord.dashboard_port, ) .await; - match rebuild_result { + match build_result { Ok(()) => { let tag = format!("deployed/{id}"); if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await { tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed"); } - if let Err(e) = - lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await + if let Err(e) = crate::meta::finalize_deploy( + &approval.agent, + approval.fetched_sha.as_deref().unwrap_or(&proposal_ref), + &tag, + ) + .await { - // Working tree already matches proposal/, but main - // didn't advance — surface as a build failure so the - // operator notices the desync. 
- return ( - Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")), - Some(tag), - ); + // The build itself succeeded — meta lock landed but + // couldn't be committed. Surface as a soft warn so the + // operator can git-commit by hand if they care. + tracing::warn!(agent = %approval.agent, %id, error = ?e, "meta finalize_deploy failed"); } + // Don't ignore the coord pointer — keeps the borrow alive + // for future tracing additions without re-plumbing. + let _ = coord; (Ok(()), Some(tag)) } Err(e) => { @@ -212,13 +251,22 @@ async fn run_apply_commit( { tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed"); } - // Roll working tree back to last known-good main so the - // on-disk state matches what nixos-container last - // successfully built. main hasn't moved, so this is just - // a content reset. + // Roll main back to last known-good so the on-disk state + // matches what nixos-container last successfully built. + if let Err(re) = + lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await + { + tracing::warn!(agent = %approval.agent, %id, error = ?re, "main rollback failed"); + } if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await { tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed"); } + // Drop the staged meta lock change so the deploy log + // only ever shows successes. 
+ if let Err(ae) = crate::meta::abort_deploy().await { + tracing::warn!(agent = %approval.agent, %id, error = ?ae, "meta abort_deploy failed"); + } + let _ = coord; (Err(e), Some(tag)) } } diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index 1004c45..9307fd7 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -280,16 +280,6 @@ pub async fn rebuild( notes_dir: &Path, dashboard_port: u16, ) -> Result<()> { - validate(name)?; - if let Some(other) = port_collision(name).await { - bail!( - "port {} is already taken by '{other}' — rename one of them and retry", - agent_web_port(name) - ); - } - setup_applied(applied_dir, None, name).await?; - ensure_claude_dir(claude_dir)?; - ensure_state_dir(notes_dir)?; // Sync the meta flake (idempotent — no-op when the rendered // flake matches disk) so a manual rebuild from the dashboard // can also recover from a divergent meta repo (e.g. an agent @@ -301,6 +291,31 @@ pub async fn rebuild( // `applied//main` currently points at (deployed/). // Commits the lock if it changed. crate::meta::lock_update_for_rebuild(name).await?; + rebuild_no_meta(name, agent_dir, applied_dir, claude_dir, notes_dir).await +} + +/// Container-level rebuild without touching the meta repo. Callers +/// that own the meta side themselves (`actions::run_apply_commit` +/// drives meta through the two-phase prepare/finalize/abort flow) +/// use this directly. Public `rebuild` wraps it with idempotent meta +/// sync + lock-bump-and-commit. 
+pub async fn rebuild_no_meta( + name: &str, + agent_dir: &Path, + applied_dir: &Path, + claude_dir: &Path, + notes_dir: &Path, +) -> Result<()> { + validate(name)?; + if let Some(other) = port_collision(name).await { + bail!( + "port {} is already taken by '{other}' — rename one of them and retry", + agent_web_port(name) + ); + } + setup_applied(applied_dir, None, name).await?; + ensure_claude_dir(claude_dir)?; + ensure_state_dir(notes_dir)?; let container = container_name(name); let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display()); set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;