suppress crash_watch during background rebuilds + meta repoint
crash_watch fires ContainerCrash whenever it sees a previously-running container in a non-running state without a transient flag set. dashboard rebuilds already set Rebuilding via lifecycle_action; the two other rebuild paths didn't:
- migrate::repoint_container: phase 4 walks every container, and each nixos-container update activation briefly takes the systemd unit down. this previously fired ContainerCrash for every agent during the migration; the manager would then spuriously call start() on agents that were already coming back up.
- auto_update::rebuild_agent: the startup scan + admin-socket caller bypass lifecycle_action.
both paths now set the Rebuilding transient around the rebuild and clear it afterwards, matching what the dashboard does.
This commit is contained in:
parent
63e8a98df2
commit
d202f3785c
2 changed files with 15 additions and 1 deletions
|
|
@ -59,6 +59,11 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
|
||||||
let applied_dir = Coordinator::agent_applied_dir(name);
|
let applied_dir = Coordinator::agent_applied_dir(name);
|
||||||
let claude_dir = Coordinator::agent_claude_dir(name);
|
let claude_dir = Coordinator::agent_claude_dir(name);
|
||||||
let notes_dir = Coordinator::agent_notes_dir(name);
|
let notes_dir = Coordinator::agent_notes_dir(name);
|
||||||
|
// Suppress crash_watch during the stop+start window inside
|
||||||
|
// lifecycle::rebuild. Dashboard rebuilds already do this via
|
||||||
|
// lifecycle_action; this catches the auto-update scan + any
|
||||||
|
// other direct caller.
|
||||||
|
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
|
||||||
let result = lifecycle::rebuild(
|
let result = lifecycle::rebuild(
|
||||||
name,
|
name,
|
||||||
&coord.hyperhive_flake,
|
&coord.hyperhive_flake,
|
||||||
|
|
@ -69,6 +74,7 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
|
||||||
coord.dashboard_port,
|
coord.dashboard_port,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
coord.clear_transient(name);
|
||||||
match &result {
|
match &result {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {
|
if let Err(e) = std::fs::write(rev_marker_path(name), current_rev) {
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,15 @@ pub async fn run(coord: &Arc<Coordinator>) -> Result<()> {
|
||||||
}
|
}
|
||||||
let mut all_ok = true;
|
let mut all_ok = true;
|
||||||
for name in &names {
|
for name in &names {
|
||||||
if let Err(e) = repoint_container(name).await {
|
// Mark Rebuilding so the crash watcher skips this container
|
||||||
|
// during the brief stop+start window the nixos-container
|
||||||
|
// update activation triggers. Without this, crash_watch
|
||||||
|
// would fire ContainerCrash for every agent here and the
|
||||||
|
// manager would spuriously try to recover them.
|
||||||
|
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
|
||||||
|
let result = repoint_container(name).await;
|
||||||
|
coord.clear_transient(name);
|
||||||
|
if let Err(e) = result {
|
||||||
tracing::warn!(%name, error = ?e, "migration: container repoint failed");
|
tracing::warn!(%name, error = ?e, "migration: container repoint failed");
|
||||||
all_ok = false;
|
all_ok = false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue