route actions::run_apply_commit through the meta two-phase deploy
approval-driven deploys now walk the meta flake via
prepare_deploy / finalize_deploy / abort_deploy so a failed
build leaves no commit in meta's deploy log:
1. capture applied/main sha for rollback
2. tag approved/<id> + building/<id>
3. ff applied/main to proposal/<id>, read-tree sync working tree
4. meta::prepare_deploy(name) — nix flake lock --update-input
agent-<n> without committing
5. lifecycle::rebuild_no_meta — container-level only (new
extracted helper; public lifecycle::rebuild still wraps it
with single-phase meta sync + commit for dashboard /
auto_update callers that don't care about rollback)
6a. on success: tag deployed/<id>, meta::finalize_deploy commits
the staged lock with 'deploy <n> deployed/<id> <sha12>'
6b. on failure: tag failed/<id> annotated with the build error,
git_update_ref applied/main back to prev sha, read-tree to
main, meta::abort_deploy git-restores flake.lock
meta's git log now records only successful deploys; failures
and denials still live in the applied repo as annotated tags.
This commit is contained in:
parent
22f35def8f
commit
06fdbac1ac
2 changed files with 92 additions and 29 deletions
|
|
@ -147,6 +147,7 @@ async fn run_apply_commit(
|
||||||
) -> (Result<()>, Option<String>) {
|
) -> (Result<()>, Option<String>) {
|
||||||
let id = approval.id;
|
let id = approval.id;
|
||||||
let proposal_ref = format!("refs/tags/proposal/{id}");
|
let proposal_ref = format!("refs/tags/proposal/{id}");
|
||||||
|
|
||||||
// Defensive: submit-time should have planted proposal/<id>, but if
|
// Defensive: submit-time should have planted proposal/<id>, but if
|
||||||
// the row was migrated from an older schema or the tag got pruned
|
// the row was migrated from an older schema or the tag got pruned
|
||||||
// we fail early with a clear note rather than building a stale
|
// we fail early with a clear note rather than building a stale
|
||||||
|
|
@ -159,6 +160,14 @@ async fn run_apply_commit(
|
||||||
None,
|
None,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Capture the currently-deployed sha so we can roll applied/main
|
||||||
|
// (and the meta lock indirectly) back if the build fails.
|
||||||
|
let prev_main_sha = match lifecycle::git_rev_parse(applied_dir, "refs/heads/main").await {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(e) => return (Err(anyhow::anyhow!("read applied/main: {e:#}")), None),
|
||||||
|
};
|
||||||
|
|
||||||
if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await
|
if let Err(e) = lifecycle::git_tag(applied_dir, &format!("approved/{id}"), &proposal_ref).await
|
||||||
{
|
{
|
||||||
return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None);
|
return (Err(anyhow::anyhow!("plant approved/{id}: {e:#}")), None);
|
||||||
|
|
@ -167,41 +176,71 @@ async fn run_apply_commit(
|
||||||
{
|
{
|
||||||
return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None);
|
return (Err(anyhow::anyhow!("plant building/{id}: {e:#}")), None);
|
||||||
}
|
}
|
||||||
if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, &proposal_ref).await {
|
|
||||||
|
// Fast-forward applied/main to proposal/<id> + sync the working
|
||||||
|
// tree. Meta input pins `?ref=main`, so this is what makes nix
|
||||||
|
// re-lock to the proposal commit on the prepare_deploy step
|
||||||
|
// below. On build failure we roll main back to prev_main_sha so
|
||||||
|
// a crash leaves the agent on its last-good tree.
|
||||||
|
if let Err(e) =
|
||||||
|
lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await
|
||||||
|
{
|
||||||
return (
|
return (
|
||||||
Err(anyhow::anyhow!("read-tree to {proposal_ref}: {e:#}")),
|
Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")),
|
||||||
|
None,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if let Err(e) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
|
||||||
|
// main is ahead; working tree didn't sync. Roll main back to
|
||||||
|
// keep the two consistent before bailing.
|
||||||
|
let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await;
|
||||||
|
return (
|
||||||
|
Err(anyhow::anyhow!("read-tree to main: {e:#}")),
|
||||||
None,
|
None,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let rebuild_result = lifecycle::rebuild(
|
// Phase 1 of the meta two-phase deploy: relock without committing.
|
||||||
|
if let Err(e) = crate::meta::prepare_deploy(&approval.agent).await {
|
||||||
|
let _ = lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await;
|
||||||
|
let _ = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await;
|
||||||
|
return (
|
||||||
|
Err(anyhow::anyhow!("meta prepare_deploy: {e:#}")),
|
||||||
|
None,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Container-level rebuild against meta#<name>.
|
||||||
|
let build_result = lifecycle::rebuild_no_meta(
|
||||||
&approval.agent,
|
&approval.agent,
|
||||||
&coord.hyperhive_flake,
|
|
||||||
agent_dir,
|
agent_dir,
|
||||||
applied_dir,
|
applied_dir,
|
||||||
claude_dir,
|
claude_dir,
|
||||||
notes_dir,
|
notes_dir,
|
||||||
coord.dashboard_port,
|
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
match rebuild_result {
|
match build_result {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
let tag = format!("deployed/{id}");
|
let tag = format!("deployed/{id}");
|
||||||
if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await {
|
if let Err(e) = lifecycle::git_tag(applied_dir, &tag, &proposal_ref).await {
|
||||||
tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed");
|
tracing::warn!(agent = %approval.agent, %id, error = ?e, "plant deployed tag failed");
|
||||||
}
|
}
|
||||||
if let Err(e) =
|
if let Err(e) = crate::meta::finalize_deploy(
|
||||||
lifecycle::git_update_ref(applied_dir, "refs/heads/main", &proposal_ref).await
|
&approval.agent,
|
||||||
|
approval.fetched_sha.as_deref().unwrap_or(&proposal_ref),
|
||||||
|
&tag,
|
||||||
|
)
|
||||||
|
.await
|
||||||
{
|
{
|
||||||
// Working tree already matches proposal/<id>, but main
|
// The build itself succeeded — meta lock landed but
|
||||||
// didn't advance — surface as a build failure so the
|
// couldn't be committed. Surface as a soft warn so the
|
||||||
// operator notices the desync.
|
// operator can git-commit by hand if they care.
|
||||||
return (
|
tracing::warn!(agent = %approval.agent, %id, error = ?e, "meta finalize_deploy failed");
|
||||||
Err(anyhow::anyhow!("ff main to {proposal_ref}: {e:#}")),
|
|
||||||
Some(tag),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
// Don't ignore the coord pointer — keeps the borrow alive
|
||||||
|
// for future tracing additions without re-plumbing.
|
||||||
|
let _ = coord;
|
||||||
(Ok(()), Some(tag))
|
(Ok(()), Some(tag))
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|
@ -212,13 +251,22 @@ async fn run_apply_commit(
|
||||||
{
|
{
|
||||||
tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed");
|
tracing::warn!(agent = %approval.agent, %id, error = ?te, "annotate failed tag failed");
|
||||||
}
|
}
|
||||||
// Roll working tree back to last known-good main so the
|
// Roll main back to last known-good so the on-disk state
|
||||||
// on-disk state matches what nixos-container last
|
// matches what nixos-container last successfully built.
|
||||||
// successfully built. main hasn't moved, so this is just
|
if let Err(re) =
|
||||||
// a content reset.
|
lifecycle::git_update_ref(applied_dir, "refs/heads/main", &prev_main_sha).await
|
||||||
|
{
|
||||||
|
tracing::warn!(agent = %approval.agent, %id, error = ?re, "main rollback failed");
|
||||||
|
}
|
||||||
if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
|
if let Err(re) = lifecycle::git_read_tree_reset(applied_dir, "refs/heads/main").await {
|
||||||
tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed");
|
tracing::warn!(agent = %approval.agent, %id, error = ?re, "rollback read-tree failed");
|
||||||
}
|
}
|
||||||
|
// Drop the staged meta lock change so the deploy log
|
||||||
|
// only ever shows successes.
|
||||||
|
if let Err(ae) = crate::meta::abort_deploy().await {
|
||||||
|
tracing::warn!(agent = %approval.agent, %id, error = ?ae, "meta abort_deploy failed");
|
||||||
|
}
|
||||||
|
let _ = coord;
|
||||||
(Err(e), Some(tag))
|
(Err(e), Some(tag))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -280,16 +280,6 @@ pub async fn rebuild(
|
||||||
notes_dir: &Path,
|
notes_dir: &Path,
|
||||||
dashboard_port: u16,
|
dashboard_port: u16,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
validate(name)?;
|
|
||||||
if let Some(other) = port_collision(name).await {
|
|
||||||
bail!(
|
|
||||||
"port {} is already taken by '{other}' — rename one of them and retry",
|
|
||||||
agent_web_port(name)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
setup_applied(applied_dir, None, name).await?;
|
|
||||||
ensure_claude_dir(claude_dir)?;
|
|
||||||
ensure_state_dir(notes_dir)?;
|
|
||||||
// Sync the meta flake (idempotent — no-op when the rendered
|
// Sync the meta flake (idempotent — no-op when the rendered
|
||||||
// flake matches disk) so a manual rebuild from the dashboard
|
// flake matches disk) so a manual rebuild from the dashboard
|
||||||
// can also recover from a divergent meta repo (e.g. an agent
|
// can also recover from a divergent meta repo (e.g. an agent
|
||||||
|
|
@ -301,6 +291,31 @@ pub async fn rebuild(
|
||||||
// `applied/<n>/main` currently points at (deployed/<latest>).
|
// `applied/<n>/main` currently points at (deployed/<latest>).
|
||||||
// Commits the lock if it changed.
|
// Commits the lock if it changed.
|
||||||
crate::meta::lock_update_for_rebuild(name).await?;
|
crate::meta::lock_update_for_rebuild(name).await?;
|
||||||
|
rebuild_no_meta(name, agent_dir, applied_dir, claude_dir, notes_dir).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Container-level rebuild without touching the meta repo. Callers
|
||||||
|
/// that own the meta side themselves (`actions::run_apply_commit`
|
||||||
|
/// drives meta through the two-phase prepare/finalize/abort flow)
|
||||||
|
/// use this directly. Public `rebuild` wraps it with idempotent meta
|
||||||
|
/// sync + lock-bump-and-commit.
|
||||||
|
pub async fn rebuild_no_meta(
|
||||||
|
name: &str,
|
||||||
|
agent_dir: &Path,
|
||||||
|
applied_dir: &Path,
|
||||||
|
claude_dir: &Path,
|
||||||
|
notes_dir: &Path,
|
||||||
|
) -> Result<()> {
|
||||||
|
validate(name)?;
|
||||||
|
if let Some(other) = port_collision(name).await {
|
||||||
|
bail!(
|
||||||
|
"port {} is already taken by '{other}' — rename one of them and retry",
|
||||||
|
agent_web_port(name)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
setup_applied(applied_dir, None, name).await?;
|
||||||
|
ensure_claude_dir(claude_dir)?;
|
||||||
|
ensure_state_dir(notes_dir)?;
|
||||||
let container = container_name(name);
|
let container = container_name(name);
|
||||||
let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display());
|
let flake_ref = format!("{}#{name}", crate::meta::meta_dir().display());
|
||||||
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
|
set_nspawn_flags(&container, agent_dir, claude_dir, notes_dir)?;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue