auto-update agents on startup when hyperhive rev changes
This commit is contained in:
parent
1cbfacda20
commit
a4e1556f90
3 changed files with 141 additions and 0 deletions
13
CLAUDE.md
13
CLAUDE.md
|
|
@ -156,6 +156,19 @@ docs/damocles-migration.md options for moving damocles onto hyperhive
|
||||||
marks them `failed` with note `"agent state dir missing"` so they fall out
|
marks them `failed` with note `"agent state dir missing"` so they fall out
|
||||||
of `pending`. They stay in sqlite for audit.
|
of `pending`. They stay in sqlite for audit.
|
||||||
|
|
||||||
|
## Auto-update on startup
|
||||||
|
|
||||||
|
`hive-c0re serve` runs `auto_update::run` in a background task right after
|
||||||
|
opening the coordinator. It enumerates sub-agent containers (manager
|
||||||
|
excluded — its config comes from the host's NixOS module) and rebuilds any
|
||||||
|
whose recorded hyperhive rev differs from the current one. Rev = canonical
|
||||||
|
filesystem path of `cfg.hyperhiveFlake` (so `/etc/hyperhive` resolving to a
|
||||||
|
new `/nix/store/...-source` triggers a rebuild). Marker file:
|
||||||
|
`/var/lib/hyperhive/applied/.<name>.hyperhive-rev`. If the flake input has
|
||||||
|
no canonical path (e.g. a `github:` URL), auto-update is a no-op — rebuild
|
||||||
|
manually. The task is async and never blocks the admin socket; failures are
|
||||||
|
logged and don't take the daemon down.
|
||||||
|
|
||||||
## Build / deploy / test
|
## Build / deploy / test
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|
|
||||||
118
hive-c0re/src/auto_update.rs
Normal file
118
hive-c0re/src/auto_update.rs
Normal file
|
|
@ -0,0 +1,118 @@
|
||||||
|
//! Startup auto-update: on `hive-c0re serve` boot, rebuild any sub-agent
|
||||||
|
//! container whose recorded "hyperhive rev" differs from the current one,
|
||||||
|
//! then write the new rev as the marker. Skips rebuild when nothing changed
|
||||||
|
//! so warm restarts are near-free.
|
||||||
|
//!
|
||||||
|
//! "Rev" is the canonical filesystem path of the configured hyperhive flake
|
||||||
|
//! (e.g. `/nix/store/<hash>-source` when `/etc/hyperhive` is a symlink the
|
||||||
|
//! NixOS module wires up). For non-path flake URLs we don't have a cheap rev
|
||||||
|
//! signal, so auto-update is a no-op — operators rebuild manually.
|
||||||
|
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
|
||||||
|
use crate::coordinator::Coordinator;
|
||||||
|
use crate::lifecycle::{self, AGENT_PREFIX};
|
||||||
|
|
||||||
|
/// Location of the per-agent rev marker.
///
/// The marker stores the hyperhive rev that the agent's container was last
/// built against. It is a dot-file placed next to `applied/<name>/` instead
/// of inside it, so it stays out of the applied repo's git history.
fn rev_marker_path(name: &str) -> PathBuf {
    let mut marker = PathBuf::from("/var/lib/hyperhive/applied");
    marker.push(format!(".{name}.hyperhive-rev"));
    marker
}
|
||||||
|
|
||||||
|
/// Compute the current "rev" for `hyperhive_flake`.
///
/// When the string names something that exists on disk, the rev is its
/// canonical path with symlinks resolved, so re-pointing `/etc/hyperhive`
/// at a different `/nix/store/...` entry produces a different string.
/// Returns `None` when the path does not exist (e.g. a non-path flake URL)
/// or when canonicalization fails.
fn current_flake_rev(hyperhive_flake: &str) -> Option<String> {
    let flake_path = Path::new(hyperhive_flake);
    if flake_path.exists() {
        let resolved = std::fs::canonicalize(flake_path).ok()?;
        Some(resolved.display().to_string())
    } else {
        None
    }
}
|
||||||
|
|
||||||
|
/// Rebuild every sub-agent whose marker differs from the current rev. Logs
|
||||||
|
/// per-agent outcomes and continues past failures. Returns Ok even if some
|
||||||
|
/// rebuilds failed — startup shouldn't be blocked by a broken agent.
|
||||||
|
pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
|
||||||
|
let Some(current_rev) = current_flake_rev(&coord.hyperhive_flake) else {
|
||||||
|
tracing::info!(
|
||||||
|
flake = %coord.hyperhive_flake,
|
||||||
|
"auto-update: hyperhive_flake has no canonical path; skipping",
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
};
|
||||||
|
tracing::info!(rev = %current_rev, "auto-update: scanning agents");
|
||||||
|
|
||||||
|
let containers = match lifecycle::list().await {
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(error = ?e, "auto-update: nixos-container list failed");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
for container in containers {
|
||||||
|
let Some(name) = container.strip_prefix(AGENT_PREFIX) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let name = name.to_owned();
|
||||||
|
let marker = rev_marker_path(&name);
|
||||||
|
let prev = std::fs::read_to_string(&marker).ok();
|
||||||
|
if prev.as_deref().map(str::trim) == Some(current_rev.as_str()) {
|
||||||
|
tracing::debug!(%name, "auto-update: up-to-date");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let coord = coord.clone();
|
||||||
|
let current_rev = current_rev.clone();
|
||||||
|
tasks.push(tokio::spawn(async move {
|
||||||
|
tracing::info!(
|
||||||
|
%name,
|
||||||
|
prev = ?prev,
|
||||||
|
rev = %current_rev,
|
||||||
|
"auto-update: rebuilding agent",
|
||||||
|
);
|
||||||
|
let agent_dir = match coord.register_agent(&name) {
|
||||||
|
Ok(d) => d,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(%name, error = ?e, "auto-update: register_agent failed");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let applied_dir = Coordinator::agent_applied_dir(&name);
|
||||||
|
let claude_dir = Coordinator::agent_claude_dir(&name);
|
||||||
|
match lifecycle::rebuild(
|
||||||
|
&name,
|
||||||
|
&coord.hyperhive_flake,
|
||||||
|
&agent_dir,
|
||||||
|
&applied_dir,
|
||||||
|
&claude_dir,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(()) => {
|
||||||
|
if let Err(e) = std::fs::write(&marker, ¤t_rev) {
|
||||||
|
tracing::warn!(%name, error = ?e, "auto-update: write rev marker failed");
|
||||||
|
} else {
|
||||||
|
tracing::info!(%name, "auto-update: agent rebuilt");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(%name, error = ?e, "auto-update: rebuild failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
for t in tasks {
|
||||||
|
let _ = t.await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -8,6 +8,7 @@ use hive_sh4re::{HostRequest, HostResponse};
|
||||||
mod actions;
|
mod actions;
|
||||||
mod agent_server;
|
mod agent_server;
|
||||||
mod approvals;
|
mod approvals;
|
||||||
|
mod auto_update;
|
||||||
mod broker;
|
mod broker;
|
||||||
mod client;
|
mod client;
|
||||||
mod coordinator;
|
mod coordinator;
|
||||||
|
|
@ -85,6 +86,15 @@ async fn main() -> Result<()> {
|
||||||
dashboard_port,
|
dashboard_port,
|
||||||
} => {
|
} => {
|
||||||
let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?);
|
let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?);
|
||||||
|
// Run auto-update in the background — don't block service start.
|
||||||
|
// Operators sometimes need the admin socket up to debug a stuck
|
||||||
|
// agent, and the rebuild loop can take tens of seconds.
|
||||||
|
let update_coord = coord.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = auto_update::run(update_coord).await {
|
||||||
|
tracing::warn!(error = ?e, "auto-update task failed");
|
||||||
|
}
|
||||||
|
});
|
||||||
manager_server::start(coord.clone())?;
|
manager_server::start(coord.clone())?;
|
||||||
let dash_coord = coord.clone();
|
let dash_coord = coord.clone();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue