manager: same lifecycle as agents; auto-spawn on hive-c0re start
This commit is contained in:
parent
d81a845dbe
commit
f99ed3fe7a
8 changed files with 168 additions and 65 deletions
28
CLAUDE.md
28
CLAUDE.md
|
|
@ -156,6 +156,34 @@ docs/damocles-migration.md options for moving damocles onto hyperhive
|
|||
marks them `failed` with note `"agent state dir missing"` so they fall out
|
||||
of `pending`. They stay in sqlite for audit.
|
||||
|
||||
## Manager (hm1nd) is hive-c0re-managed
|
||||
|
||||
The manager container runs through the **same lifecycle as sub-agents** —
|
||||
no separate code path. On `hive-c0re serve` startup, if `nixos-container
|
||||
list` doesn't include `hm1nd`, hive-c0re creates it. The manager's flake
|
||||
lives at `/var/lib/hyperhive/applied/hm1nd/`; its proposed (manager-editable)
|
||||
config at `/var/lib/hyperhive/agents/hm1nd/config/`. Manager can edit its
|
||||
own `agent.nix` (visible inside the container at `/agents/hm1nd/config/`),
|
||||
commit, and submit `request-apply-commit hm1nd <sha>` for operator
|
||||
approval — same flow as for sub-agents.
|
||||
|
||||
Differences from sub-agents:
|
||||
- `flake.nix` extends `hyperhive.nixosConfigurations.manager` (vs
|
||||
`agent-base`).
|
||||
- Container name is `hm1nd` (no `h-` prefix).
|
||||
- Fixed web UI port (`MANAGER_PORT = 8000`).
|
||||
- `set_nspawn_flags` adds an extra bind: `/var/lib/hyperhive/agents` →
|
||||
`/agents` (RW), so the manager can edit per-agent proposed repos.
|
||||
- First-deploy spawn bypasses the approval queue (manager is required
|
||||
infrastructure).
|
||||
- Runtime socket lives under `/run/hyperhive/manager/` and is owned
|
||||
by `manager_server::start`. `coordinator::ensure_runtime` returns that
|
||||
path for manager and the usual `/run/hyperhive/agents/<name>/` for the
|
||||
rest.
|
||||
|
||||
**Migration note:** drop any `containers.hm1nd = { ... }` block from your
|
||||
host NixOS config. hyperhive creates and updates the manager itself now.
|
||||
|
||||
## Auto-update on startup
|
||||
|
||||
`hive-c0re serve` runs `auto_update::run` in a background task right after
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ pub async fn approve(coord: Arc<Coordinator>, id: i64) -> Result<()> {
|
|||
"approval: running action",
|
||||
);
|
||||
|
||||
let agent_dir = coord.register_agent(&approval.agent)?;
|
||||
let agent_dir = coord.ensure_runtime(&approval.agent)?;
|
||||
let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent);
|
||||
let applied_dir = Coordinator::agent_applied_dir(&approval.agent);
|
||||
let claude_dir = Coordinator::agent_claude_dir(&approval.agent);
|
||||
|
|
|
|||
|
|
@ -11,8 +11,7 @@
|
|||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result, bail};
|
||||
use tokio::process::Command;
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
use crate::coordinator::Coordinator;
|
||||
use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
|
||||
|
|
@ -55,8 +54,8 @@ pub fn agent_needs_update(name: &str, current_rev: &str) -> bool {
|
|||
pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &str) -> Result<()> {
|
||||
tracing::info!(%name, rev = %current_rev, "rebuild agent");
|
||||
let agent_dir = coord
|
||||
.register_agent(name)
|
||||
.with_context(|| format!("register_agent {name}"))?;
|
||||
.ensure_runtime(name)
|
||||
.with_context(|| format!("ensure_runtime {name}"))?;
|
||||
let applied_dir = Coordinator::agent_applied_dir(name);
|
||||
let claude_dir = Coordinator::agent_claude_dir(name);
|
||||
lifecycle::rebuild(
|
||||
|
|
@ -72,26 +71,34 @@ pub async fn rebuild_agent(coord: &Arc<Coordinator>, name: &str, current_rev: &s
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply the manager's host-declared config: `nixos-container update hm1nd`
|
||||
/// (no `--flake`) re-reads `/etc/nixos-containers/hm1nd.conf`, which the
|
||||
/// host's `nixos-rebuild switch` rewrites to point at the new `SYSTEM_PATH`.
|
||||
/// Idempotent when nothing has changed.
|
||||
pub async fn rebuild_manager(current_rev: &str) -> Result<()> {
|
||||
tracing::info!(rev = %current_rev, "rebuild manager (nixos-container update hm1nd)");
|
||||
let out = Command::new("nixos-container")
|
||||
.args(["update", MANAGER_NAME])
|
||||
.output()
|
||||
.await
|
||||
.context("invoke nixos-container update hm1nd")?;
|
||||
if !out.status.success() {
|
||||
bail!(
|
||||
"nixos-container update {MANAGER_NAME} failed ({}): {}",
|
||||
out.status,
|
||||
String::from_utf8_lossy(&out.stderr).trim()
|
||||
);
|
||||
|
||||
/// Auto-create the manager container on startup if it isn't already there.
|
||||
/// hive-c0re manages hm1nd end-to-end (Phase 8 follow-up): operators no
|
||||
/// longer declare `containers.hm1nd` in their host NixOS config. Bypasses
|
||||
/// the approval queue — manager is required infrastructure. Idempotent.
|
||||
pub async fn ensure_manager(coord: &Arc<Coordinator>) -> Result<()> {
|
||||
let existing = lifecycle::list().await.unwrap_or_default();
|
||||
if existing.iter().any(|c| c == MANAGER_NAME) {
|
||||
tracing::debug!("manager container already present");
|
||||
return Ok(());
|
||||
}
|
||||
tracing::info!("manager container missing — spawning");
|
||||
let runtime = coord.ensure_runtime(MANAGER_NAME)?;
|
||||
let proposed = Coordinator::agent_proposed_dir(MANAGER_NAME);
|
||||
let applied = Coordinator::agent_applied_dir(MANAGER_NAME);
|
||||
let claude_dir = Coordinator::agent_claude_dir(MANAGER_NAME);
|
||||
lifecycle::spawn(
|
||||
MANAGER_NAME,
|
||||
&coord.hyperhive_flake,
|
||||
&runtime,
|
||||
&proposed,
|
||||
&applied,
|
||||
&claude_dir,
|
||||
)
|
||||
.await?;
|
||||
if let Some(rev) = current_flake_rev(&coord.hyperhive_flake) {
|
||||
let _ = std::fs::write(rev_marker_path(MANAGER_NAME), rev);
|
||||
}
|
||||
std::fs::write(rev_marker_path(MANAGER_NAME), current_rev)
|
||||
.with_context(|| format!("write rev marker for {MANAGER_NAME}"))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -117,16 +124,17 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
|
|||
};
|
||||
|
||||
let mut tasks = Vec::new();
|
||||
let mut manager_present = false;
|
||||
for container in containers {
|
||||
if container == MANAGER_NAME {
|
||||
manager_present = true;
|
||||
continue;
|
||||
}
|
||||
let Some(name) = container.strip_prefix(AGENT_PREFIX) else {
|
||||
// Manager and sub-agents share the same lifecycle now; both go
|
||||
// through rebuild_agent with name-derived paths.
|
||||
let logical = if container == MANAGER_NAME {
|
||||
Some(MANAGER_NAME.to_owned())
|
||||
} else {
|
||||
container.strip_prefix(AGENT_PREFIX).map(str::to_owned)
|
||||
};
|
||||
let Some(name) = logical else {
|
||||
continue;
|
||||
};
|
||||
let name = name.to_owned();
|
||||
if !agent_needs_update(&name, &current_rev) {
|
||||
tracing::debug!(%name, "auto-update: up-to-date");
|
||||
continue;
|
||||
|
|
@ -140,19 +148,6 @@ pub async fn run(coord: Arc<Coordinator>) -> Result<()> {
|
|||
}));
|
||||
}
|
||||
|
||||
// Manager runs unconditionally when its marker differs: even if the host
|
||||
// hasn't been rebuilt yet, `nixos-container update hm1nd` is a no-op, so
|
||||
// there's no harm. The host's own activation already updates declarative
|
||||
// containers — this is belt-and-braces for hive-c0re restarts.
|
||||
if manager_present && agent_needs_update(MANAGER_NAME, &current_rev) {
|
||||
let current_rev = current_rev.clone();
|
||||
tasks.push(tokio::spawn(async move {
|
||||
if let Err(e) = rebuild_manager(&current_rev).await {
|
||||
tracing::warn!(error = ?e, "auto-update: manager rebuild failed");
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
let _ = t.await;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -118,6 +118,21 @@ impl Coordinator {
|
|||
Self::manager_dir().join("mcp.sock")
|
||||
}
|
||||
|
||||
/// Ensure a runtime dir + (for sub-agents) per-agent socket exists. For
|
||||
/// the manager, `manager_server::start` owns the socket — just return
|
||||
/// the dir. For sub-agents this is `register_agent` (creates a fresh
|
||||
/// listener bound to `socket_path(name)`). Source directory of the
|
||||
/// `/run/hive/mcp.sock` bind that ends up in `set_nspawn_flags`.
|
||||
pub fn ensure_runtime(&self, name: &str) -> Result<PathBuf> {
|
||||
if name == crate::lifecycle::MANAGER_NAME {
|
||||
let dir = Self::manager_dir();
|
||||
std::fs::create_dir_all(&dir)
|
||||
.with_context(|| format!("create manager dir {}", dir.display()))?;
|
||||
return Ok(dir);
|
||||
}
|
||||
self.register_agent(name)
|
||||
}
|
||||
|
||||
/// Per-agent state root (parent of `config/`, future `prompts/`, etc.).
|
||||
pub fn agent_state_root(name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("{AGENT_STATE_ROOT}/{name}"))
|
||||
|
|
|
|||
|
|
@ -172,11 +172,7 @@ async fn post_rebuild(State(state): State<AppState>, AxumPath(name): AxumPath<St
|
|||
"rebuild: hyperhive_flake has no canonical path; manual rebuild only via `hive-c0re rebuild`",
|
||||
);
|
||||
};
|
||||
let result = if name == lifecycle::MANAGER_NAME {
|
||||
crate::auto_update::rebuild_manager(&current_rev).await
|
||||
} else {
|
||||
crate::auto_update::rebuild_agent(&state.coord, &name, &current_rev).await
|
||||
};
|
||||
let result = crate::auto_update::rebuild_agent(&state.coord, &name, &current_rev).await;
|
||||
match result {
|
||||
Ok(()) => Redirect::to("/").into_response(),
|
||||
Err(e) => error_response(&format!("rebuild {name} failed: {e:#}")),
|
||||
|
|
|
|||
|
|
@ -10,9 +10,15 @@ use tokio::process::Command;
|
|||
/// name itself can be at most `MAX_AGENT_NAME` chars.
|
||||
pub const AGENT_PREFIX: &str = "h-";
|
||||
pub const MAX_AGENT_NAME: usize = 9;
|
||||
/// Container name of the manager (a separate slot from sub-agents).
|
||||
/// Container name of the manager. Lives in the same path scheme as sub-agents
|
||||
/// (`/var/lib/hyperhive/agents/hm1nd/`, `/var/lib/hyperhive/applied/hm1nd/`),
|
||||
/// but its container has no `h-` prefix and extends a different
|
||||
/// nixosConfiguration (`manager`, not `agent-base`).
|
||||
pub const MANAGER_NAME: &str = "hm1nd";
|
||||
|
||||
/// Web UI port reserved for the manager (sub-agents hash into 8100..8999).
|
||||
pub const MANAGER_PORT: u16 = 8000;
|
||||
|
||||
/// Mount point of the per-agent runtime directory inside the container.
|
||||
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
|
||||
|
||||
|
|
@ -35,9 +41,13 @@ const DEFAULT_MEMORY_MAX: &str = "2G";
|
|||
const DEFAULT_CPU_QUOTA: &str = "50%";
|
||||
|
||||
/// Returns the per-agent web UI port. Same hash on both sides — manager,
|
||||
/// dashboard, and agent harness all agree.
|
||||
/// dashboard, and agent harness all agree. Manager is fixed at
|
||||
/// `MANAGER_PORT`.
|
||||
#[must_use]
|
||||
pub fn agent_web_port(name: &str) -> u16 {
|
||||
if name == MANAGER_NAME {
|
||||
return MANAGER_PORT;
|
||||
}
|
||||
let mut hash: u32 = 2_166_136_261;
|
||||
for b in name.bytes() {
|
||||
hash ^= u32::from(b);
|
||||
|
|
@ -47,14 +57,34 @@ pub fn agent_web_port(name: &str) -> u16 {
|
|||
WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn container_name(name: &str) -> String {
|
||||
if name == MANAGER_NAME {
|
||||
MANAGER_NAME.to_owned()
|
||||
} else {
|
||||
format!("{AGENT_PREFIX}{name}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `name` is exactly `MANAGER_NAME`.
#[must_use]
pub fn is_manager(name: &str) -> bool {
    matches!(name, MANAGER_NAME)
}
|
||||
|
||||
/// The nixosConfiguration in the hyperhive flake the agent's `flake.nix`
|
||||
/// extends. Manager → `manager`; everyone else → `agent-base`.
|
||||
#[must_use]
|
||||
pub fn flake_base(name: &str) -> &'static str {
|
||||
if is_manager(name) { "manager" } else { "agent-base" }
|
||||
}
|
||||
|
||||
fn validate(name: &str) -> Result<()> {
|
||||
if name.is_empty() {
|
||||
bail!("agent name must not be empty");
|
||||
}
|
||||
if is_manager(name) {
|
||||
return Ok(());
|
||||
}
|
||||
if name.len() > MAX_AGENT_NAME {
|
||||
bail!(
|
||||
"agent name '{name}' is too long ({} chars); max {MAX_AGENT_NAME}",
|
||||
|
|
@ -180,14 +210,25 @@ pub async fn setup_applied(applied_dir: &Path, name: &str, hyperhive_flake: &str
|
|||
.with_context(|| format!("create {}", applied_dir.display()))?;
|
||||
|
||||
let port = agent_web_port(name);
|
||||
let base = flake_base(name);
|
||||
let service = if is_manager(name) {
|
||||
"hive-m1nd"
|
||||
} else {
|
||||
"hive-ag3nt"
|
||||
};
|
||||
let description = if is_manager(name) {
|
||||
format!("hyperhive manager {name}")
|
||||
} else {
|
||||
format!("hyperhive sub-agent {name}")
|
||||
};
|
||||
let flake_body = format!(
|
||||
r#"{{
|
||||
description = "hyperhive sub-agent {name}";
|
||||
description = "{description}";
|
||||
inputs.hyperhive.url = "{hyperhive_flake}";
|
||||
outputs =
|
||||
{{ hyperhive, ... }}:
|
||||
{{
|
||||
nixosConfigurations.default = hyperhive.nixosConfigurations.agent-base.extendModules {{
|
||||
nixosConfigurations.default = hyperhive.nixosConfigurations.{base}.extendModules {{
|
||||
modules = [
|
||||
./agent.nix
|
||||
{{
|
||||
|
|
@ -198,7 +239,7 @@ pub async fn setup_applied(applied_dir: &Path, name: &str, hyperhive_flake: &str
|
|||
[init]
|
||||
defaultBranch = main
|
||||
'';
|
||||
systemd.services.hive-ag3nt.environment = {{
|
||||
systemd.services.{service}.environment = {{
|
||||
HIVE_PORT = "{port}";
|
||||
HIVE_LABEL = "{name}";
|
||||
}};
|
||||
|
|
@ -372,14 +413,35 @@ async fn systemd_daemon_reload() -> Result<()> {
|
|||
/// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind).
|
||||
/// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
|
||||
/// `systemd-nspawn` command.
|
||||
fn set_nspawn_flags(container: &str, agent_dir: &Path, claude_dir: &Path) -> Result<()> {
|
||||
/// Where in the container's filesystem the manager sees its agents tree.
|
||||
/// Matches the `/agents` path that pre-Phase-8 hosts declared via
|
||||
/// `containers.hm1nd.bindMounts."/agents"`.
|
||||
pub const CONTAINER_MANAGER_AGENTS_MOUNT: &str = "/agents";
|
||||
|
||||
/// The on-host root that gets bind-mounted to `/agents` inside the manager.
|
||||
/// Hard-coded to match `AGENT_STATE_ROOT` in coordinator.rs (kept duplicated
|
||||
/// here so lifecycle stays usable as a leaf module).
|
||||
const HOST_AGENTS_ROOT: &str = "/var/lib/hyperhive/agents";
|
||||
|
||||
fn set_nspawn_flags(container: &str, runtime_dir: &Path, claude_dir: &Path) -> Result<()> {
|
||||
let path = format!("/etc/nixos-containers/{container}.conf");
|
||||
let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;
|
||||
let bind_flag = format!(
|
||||
"EXTRA_NSPAWN_FLAGS=\"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}\"",
|
||||
runtime = agent_dir.display(),
|
||||
let mut binds = format!(
|
||||
"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}",
|
||||
runtime = runtime_dir.display(),
|
||||
claude = claude_dir.display(),
|
||||
);
|
||||
if container == MANAGER_NAME {
|
||||
// Manager edits sub-agent proposed/ repos and its own. RW so it can
|
||||
// git-commit. Sub-agents see only their own /run/hive socket and
|
||||
// /root/.claude (no /agents).
|
||||
use std::fmt::Write as _;
|
||||
let _ = write!(
|
||||
binds,
|
||||
" --bind={HOST_AGENTS_ROOT}:{CONTAINER_MANAGER_AGENTS_MOUNT}"
|
||||
);
|
||||
}
|
||||
let bind_flag = format!("EXTRA_NSPAWN_FLAGS=\"{binds}\"");
|
||||
let mut lines: Vec<String> = original
|
||||
.lines()
|
||||
.filter(|line| {
|
||||
|
|
|
|||
|
|
@ -86,16 +86,23 @@ async fn main() -> Result<()> {
|
|||
dashboard_port,
|
||||
} => {
|
||||
let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?);
|
||||
// Run auto-update in the background — don't block service start.
|
||||
// Operators sometimes need the admin socket up to debug a stuck
|
||||
// agent, and the rebuild loop can take tens of seconds.
|
||||
manager_server::start(coord.clone())?;
|
||||
// Auto-create the manager container if it isn't there yet. Block
|
||||
// on this — without hm1nd the system has no manager harness.
|
||||
// Failures are logged but allowed: a broken auto-spawn shouldn't
|
||||
// make the dashboard unreachable for debugging.
|
||||
if let Err(e) = auto_update::ensure_manager(&coord).await {
|
||||
tracing::warn!(error = ?e, "auto-spawn manager failed");
|
||||
}
|
||||
// Auto-update in the background — don't block service start.
|
||||
// Sub-agent rebuilds can take tens of seconds; we want the admin
|
||||
// socket up immediately.
|
||||
let update_coord = coord.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = auto_update::run(update_coord).await {
|
||||
tracing::warn!(error = ?e, "auto-update task failed");
|
||||
}
|
||||
});
|
||||
manager_server::start(coord.clone())?;
|
||||
let dash_coord = coord.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = dashboard::serve(dashboard_port, dash_coord).await {
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
|
|||
Ok(match req {
|
||||
HostRequest::Spawn { name } => {
|
||||
tracing::info!(%name, "spawn");
|
||||
let agent_dir = coord.register_agent(name)?;
|
||||
let agent_dir = coord.ensure_runtime(name)?;
|
||||
let proposed_dir = Coordinator::agent_proposed_dir(name);
|
||||
let applied_dir = Coordinator::agent_applied_dir(name);
|
||||
let claude_dir = Coordinator::agent_claude_dir(name);
|
||||
|
|
@ -101,7 +101,7 @@ async fn dispatch(req: &HostRequest, coord: Arc<Coordinator>) -> HostResponse {
|
|||
}
|
||||
HostRequest::Rebuild { name } => {
|
||||
tracing::info!(%name, "rebuild");
|
||||
let agent_dir = coord.register_agent(name)?;
|
||||
let agent_dir = coord.ensure_runtime(name)?;
|
||||
let applied_dir = Coordinator::agent_applied_dir(name);
|
||||
let claude_dir = Coordinator::agent_claude_dir(name);
|
||||
lifecycle::rebuild(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue