Phase 5b: per-agent config flakes; approve validates + advances commit
This commit is contained in:
parent
22b65d35f3
commit
433c0d212e
6 changed files with 182 additions and 25 deletions
|
|
@ -14,22 +14,25 @@ use crate::broker::Broker;
|
||||||
|
|
||||||
const AGENT_RUNTIME_ROOT: &str = "/run/hyperhive/agents";
|
const AGENT_RUNTIME_ROOT: &str = "/run/hyperhive/agents";
|
||||||
const MANAGER_RUNTIME_ROOT: &str = "/run/hyperhive/manager";
|
const MANAGER_RUNTIME_ROOT: &str = "/run/hyperhive/manager";
|
||||||
|
const AGENT_STATE_ROOT: &str = "/var/lib/hyperhive/agents";
|
||||||
|
|
||||||
pub struct Coordinator {
|
pub struct Coordinator {
|
||||||
pub broker: Arc<Broker>,
|
pub broker: Arc<Broker>,
|
||||||
pub approvals: Arc<Approvals>,
|
pub approvals: Arc<Approvals>,
|
||||||
pub agent_flake: String,
|
/// URL of the hyperhive flake (no fragment). Inlined into per-agent
|
||||||
|
/// `flake.nix` files as `inputs.hyperhive.url`.
|
||||||
|
pub hyperhive_flake: String,
|
||||||
agents: Mutex<HashMap<String, AgentSocket>>,
|
agents: Mutex<HashMap<String, AgentSocket>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Coordinator {
|
impl Coordinator {
|
||||||
pub fn open(db_path: &Path, agent_flake: String) -> Result<Self> {
|
pub fn open(db_path: &Path, hyperhive_flake: String) -> Result<Self> {
|
||||||
let broker = Broker::open(db_path).context("open broker")?;
|
let broker = Broker::open(db_path).context("open broker")?;
|
||||||
let approvals = Approvals::open(db_path).context("open approvals")?;
|
let approvals = Approvals::open(db_path).context("open approvals")?;
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
broker: Arc::new(broker),
|
broker: Arc::new(broker),
|
||||||
approvals: Arc::new(approvals),
|
approvals: Arc::new(approvals),
|
||||||
agent_flake,
|
hyperhive_flake,
|
||||||
agents: Mutex::new(HashMap::new()),
|
agents: Mutex::new(HashMap::new()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
@ -69,4 +72,8 @@ impl Coordinator {
|
||||||
pub fn manager_socket_path() -> PathBuf {
|
pub fn manager_socket_path() -> PathBuf {
|
||||||
Self::manager_dir().join("mcp.sock")
|
Self::manager_dir().join("mcp.sock")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn agent_config_dir(name: &str) -> PathBuf {
|
||||||
|
PathBuf::from(format!("{AGENT_STATE_ROOT}/{name}/config"))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
//! Thin async wrappers over `nixos-container`.
|
//! `nixos-container` lifecycle + per-agent config flake generation.
|
||||||
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
|
@ -16,6 +16,9 @@ pub const MANAGER_NAME: &str = "hm1nd";
|
||||||
/// Mount point of the per-agent runtime directory inside the container.
|
/// Mount point of the per-agent runtime directory inside the container.
|
||||||
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
|
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
|
||||||
|
|
||||||
|
const GIT_NAME: &str = "hive-c0re";
|
||||||
|
const GIT_EMAIL: &str = "hive-c0re@hyperhive";
|
||||||
|
|
||||||
pub fn container_name(name: &str) -> String {
|
pub fn container_name(name: &str) -> String {
|
||||||
format!("{AGENT_PREFIX}{name}")
|
format!("{AGENT_PREFIX}{name}")
|
||||||
}
|
}
|
||||||
|
|
@ -33,10 +36,17 @@ fn validate(name: &str) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn spawn(name: &str, agent_flake: &str, agent_dir: &Path) -> Result<()> {
|
pub async fn spawn(
|
||||||
|
name: &str,
|
||||||
|
hyperhive_flake: &str,
|
||||||
|
agent_dir: &Path,
|
||||||
|
config_dir: &Path,
|
||||||
|
) -> Result<()> {
|
||||||
validate(name)?;
|
validate(name)?;
|
||||||
|
setup_config(config_dir, name, hyperhive_flake).await?;
|
||||||
let container = container_name(name);
|
let container = container_name(name);
|
||||||
run(&["create", &container, "--flake", agent_flake]).await?;
|
let flake_ref = format!("{}#default", config_dir.display());
|
||||||
|
run(&["create", &container, "--flake", &flake_ref]).await?;
|
||||||
set_nspawn_flags(&container, agent_dir)?;
|
set_nspawn_flags(&container, agent_dir)?;
|
||||||
run(&["start", &container]).await
|
run(&["start", &container]).await
|
||||||
}
|
}
|
||||||
|
|
@ -47,11 +57,18 @@ pub async fn kill(name: &str) -> Result<()> {
|
||||||
run(&["stop", &container]).await
|
run(&["stop", &container]).await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn rebuild(name: &str, agent_flake: &str, agent_dir: &Path) -> Result<()> {
|
pub async fn rebuild(
|
||||||
|
name: &str,
|
||||||
|
hyperhive_flake: &str,
|
||||||
|
agent_dir: &Path,
|
||||||
|
config_dir: &Path,
|
||||||
|
) -> Result<()> {
|
||||||
validate(name)?;
|
validate(name)?;
|
||||||
|
setup_config(config_dir, name, hyperhive_flake).await?;
|
||||||
let container = container_name(name);
|
let container = container_name(name);
|
||||||
|
let flake_ref = format!("{}#default", config_dir.display());
|
||||||
set_nspawn_flags(&container, agent_dir)?;
|
set_nspawn_flags(&container, agent_dir)?;
|
||||||
run(&["update", &container, "--flake", agent_flake]).await?;
|
run(&["update", &container, "--flake", &flake_ref]).await?;
|
||||||
// Restart so any nspawn-level changes (bind mounts, networking, etc.) apply.
|
// Restart so any nspawn-level changes (bind mounts, networking, etc.) apply.
|
||||||
run(&["stop", &container]).await?;
|
run(&["stop", &container]).await?;
|
||||||
run(&["start", &container]).await
|
run(&["start", &container]).await
|
||||||
|
|
@ -78,6 +95,113 @@ pub async fn list() -> Result<Vec<String>> {
|
||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Ensure `config_dir` exists as a git repo containing a per-agent flake. The
|
||||||
|
/// `flake.nix` is rewritten every call (so a new hyperhive store path
|
||||||
|
/// propagates on rebuild); `agent.nix` is written only the first time
|
||||||
|
/// (manager-editable thereafter).
|
||||||
|
pub async fn setup_config(config_dir: &Path, name: &str, hyperhive_flake: &str) -> Result<()> {
|
||||||
|
std::fs::create_dir_all(config_dir)
|
||||||
|
.with_context(|| format!("create {}", config_dir.display()))?;
|
||||||
|
|
||||||
|
let flake_path = config_dir.join("flake.nix");
|
||||||
|
let flake_body = format!(
|
||||||
|
r#"{{
|
||||||
|
description = "hyperhive sub-agent {name}";
|
||||||
|
inputs.hyperhive.url = "{hyperhive_flake}";
|
||||||
|
outputs =
|
||||||
|
{{ hyperhive, ... }}:
|
||||||
|
{{
|
||||||
|
nixosConfigurations.default = hyperhive.nixosConfigurations.agent-base.extendModules {{
|
||||||
|
modules = [ ./agent.nix ];
|
||||||
|
}};
|
||||||
|
}};
|
||||||
|
}}
|
||||||
|
"#,
|
||||||
|
);
|
||||||
|
std::fs::write(&flake_path, flake_body)
|
||||||
|
.with_context(|| format!("write {}", flake_path.display()))?;
|
||||||
|
|
||||||
|
let agent_path = config_dir.join("agent.nix");
|
||||||
|
if !agent_path.exists() {
|
||||||
|
let initial = format!(
|
||||||
|
"{{ ... }}:\n{{\n # Per-agent overrides for {name}. The manager edits this\n # file (and commits) to customise the agent's NixOS config.\n}}\n",
|
||||||
|
);
|
||||||
|
std::fs::write(&agent_path, initial)
|
||||||
|
.with_context(|| format!("write {}", agent_path.display()))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if !config_dir.join(".git").exists() {
|
||||||
|
git(config_dir, &["init", "--initial-branch=main"]).await?;
|
||||||
|
}
|
||||||
|
git(config_dir, &["add", "-A"]).await?;
|
||||||
|
let clean = git_status(config_dir, &["diff", "--cached", "--quiet"]).await?;
|
||||||
|
if !clean {
|
||||||
|
git(
|
||||||
|
config_dir,
|
||||||
|
&[
|
||||||
|
"-c",
|
||||||
|
&format!("user.name={GIT_NAME}"),
|
||||||
|
"-c",
|
||||||
|
&format!("user.email={GIT_EMAIL}"),
|
||||||
|
"commit",
|
||||||
|
"-m",
|
||||||
|
"hive-c0re sync",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Verify `commit_ref` exists in the config repo, advance `main` to it, and
|
||||||
|
/// reset the working tree. Caller is responsible for the subsequent rebuild.
|
||||||
|
pub async fn apply_commit(config_dir: &Path, commit_ref: &str) -> Result<()> {
|
||||||
|
let st = Command::new("git")
|
||||||
|
.current_dir(config_dir)
|
||||||
|
.args(["cat-file", "-e", commit_ref])
|
||||||
|
.status()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("git cat-file in {}", config_dir.display()))?;
|
||||||
|
if !st.success() {
|
||||||
|
bail!(
|
||||||
|
"commit {commit_ref} not found in {}",
|
||||||
|
config_dir.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
git(config_dir, &["update-ref", "refs/heads/main", commit_ref]).await?;
|
||||||
|
git(config_dir, &["reset", "--hard", commit_ref]).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn git(dir: &Path, args: &[&str]) -> Result<()> {
|
||||||
|
let out = Command::new("git")
|
||||||
|
.current_dir(dir)
|
||||||
|
.args(args)
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
|
||||||
|
if !out.status.success() {
|
||||||
|
bail!(
|
||||||
|
"git {} failed ({}): {}",
|
||||||
|
args.join(" "),
|
||||||
|
out.status,
|
||||||
|
String::from_utf8_lossy(&out.stderr).trim()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if the command exits 0.
|
||||||
|
async fn git_status(dir: &Path, args: &[&str]) -> Result<bool> {
|
||||||
|
let st = Command::new("git")
|
||||||
|
.current_dir(dir)
|
||||||
|
.args(args)
|
||||||
|
.status()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("git {} in {}", args.join(" "), dir.display()))?;
|
||||||
|
Ok(st.success())
|
||||||
|
}
|
||||||
|
|
||||||
/// Idempotently rewrite the `EXTRA_NSPAWN_FLAGS` line in
|
/// Idempotently rewrite the `EXTRA_NSPAWN_FLAGS` line in
|
||||||
/// `/etc/nixos-containers/<container>.conf`. The start script expands this
|
/// `/etc/nixos-containers/<container>.conf`. The start script expands this
|
||||||
/// variable unquoted into the `systemd-nspawn` command.
|
/// variable unquoted into the `systemd-nspawn` command.
|
||||||
|
|
|
||||||
|
|
@ -31,9 +31,10 @@ struct Cli {
|
||||||
enum Cmd {
|
enum Cmd {
|
||||||
/// Run the coordinator daemon.
|
/// Run the coordinator daemon.
|
||||||
Serve {
|
Serve {
|
||||||
/// Flake reference for the agent base template.
|
/// URL of the hyperhive flake. Inlined into each per-agent
|
||||||
#[arg(long, default_value = "/etc/hyperhive#agent-base")]
|
/// `flake.nix` as the `hyperhive` input.
|
||||||
agent_flake: String,
|
#[arg(long, default_value = "/etc/hyperhive")]
|
||||||
|
hyperhive_flake: String,
|
||||||
/// Path to the sqlite message store.
|
/// Path to the sqlite message store.
|
||||||
#[arg(long, default_value = "/var/lib/hyperhive/broker.sqlite")]
|
#[arg(long, default_value = "/var/lib/hyperhive/broker.sqlite")]
|
||||||
db: PathBuf,
|
db: PathBuf,
|
||||||
|
|
@ -65,8 +66,11 @@ async fn main() -> Result<()> {
|
||||||
|
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
match cli.cmd {
|
match cli.cmd {
|
||||||
Cmd::Serve { agent_flake, db } => {
|
Cmd::Serve {
|
||||||
let coord = Arc::new(Coordinator::open(&db, agent_flake)?);
|
hyperhive_flake,
|
||||||
|
db,
|
||||||
|
} => {
|
||||||
|
let coord = Arc::new(Coordinator::open(&db, hyperhive_flake)?);
|
||||||
manager_server::start(coord.clone())?;
|
manager_server::start(coord.clone())?;
|
||||||
server::serve(&cli.socket, coord).await
|
server::serve(&cli.socket, coord).await
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,10 @@ async fn dispatch(req: &ManagerRequest, coord: &Coordinator) -> ManagerResponse
|
||||||
tracing::info!(%name, "manager: spawn");
|
tracing::info!(%name, "manager: spawn");
|
||||||
let result: Result<()> = async {
|
let result: Result<()> = async {
|
||||||
let agent_dir = coord.register_agent(name)?;
|
let agent_dir = coord.register_agent(name)?;
|
||||||
if let Err(e) = lifecycle::spawn(name, &coord.agent_flake, &agent_dir).await {
|
let config_dir = Coordinator::agent_config_dir(name);
|
||||||
|
if let Err(e) =
|
||||||
|
lifecycle::spawn(name, &coord.hyperhive_flake, &agent_dir, &config_dir).await
|
||||||
|
{
|
||||||
coord.unregister_agent(name);
|
coord.unregister_agent(name);
|
||||||
return Err(e);
|
return Err(e);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ pub async fn serve(socket: &Path, coord: Arc<Coordinator>) -> Result<()> {
|
||||||
|
|
||||||
let listener = UnixListener::bind(socket)
|
let listener = UnixListener::bind(socket)
|
||||||
.with_context(|| format!("bind admin socket {}", socket.display()))?;
|
.with_context(|| format!("bind admin socket {}", socket.display()))?;
|
||||||
tracing::info!(socket = %socket.display(), agent_flake = %coord.agent_flake, "hive-c0re admin listening");
|
tracing::info!(socket = %socket.display(), hyperhive_flake = %coord.hyperhive_flake, "hive-c0re admin listening");
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let (stream, _) = listener.accept().await.context("accept connection")?;
|
let (stream, _) = listener.accept().await.context("accept connection")?;
|
||||||
|
|
@ -61,7 +61,10 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse {
|
||||||
HostRequest::Spawn { name } => {
|
HostRequest::Spawn { name } => {
|
||||||
tracing::info!(%name, "spawn");
|
tracing::info!(%name, "spawn");
|
||||||
let agent_dir = coord.register_agent(name)?;
|
let agent_dir = coord.register_agent(name)?;
|
||||||
if let Err(e) = lifecycle::spawn(name, &coord.agent_flake, &agent_dir).await {
|
let config_dir = Coordinator::agent_config_dir(name);
|
||||||
|
if let Err(e) =
|
||||||
|
lifecycle::spawn(name, &coord.hyperhive_flake, &agent_dir, &config_dir).await
|
||||||
|
{
|
||||||
// Roll back socket registration if container creation failed.
|
// Roll back socket registration if container creation failed.
|
||||||
coord.unregister_agent(name);
|
coord.unregister_agent(name);
|
||||||
return Err(e);
|
return Err(e);
|
||||||
|
|
@ -77,18 +80,29 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse {
|
||||||
HostRequest::Rebuild { name } => {
|
HostRequest::Rebuild { name } => {
|
||||||
tracing::info!(%name, "rebuild");
|
tracing::info!(%name, "rebuild");
|
||||||
let agent_dir = coord.register_agent(name)?;
|
let agent_dir = coord.register_agent(name)?;
|
||||||
lifecycle::rebuild(name, &coord.agent_flake, &agent_dir).await?;
|
let config_dir = Coordinator::agent_config_dir(name);
|
||||||
|
lifecycle::rebuild(name, &coord.hyperhive_flake, &agent_dir, &config_dir).await?;
|
||||||
HostResponse::success()
|
HostResponse::success()
|
||||||
}
|
}
|
||||||
HostRequest::List => HostResponse::list(lifecycle::list().await?),
|
HostRequest::List => HostResponse::list(lifecycle::list().await?),
|
||||||
HostRequest::Pending => HostResponse::pending(coord.approvals.pending()?),
|
HostRequest::Pending => HostResponse::pending(coord.approvals.pending()?),
|
||||||
HostRequest::Approve { id } => {
|
HostRequest::Approve { id } => {
|
||||||
let approval = coord.approvals.mark_approved(*id)?;
|
let approval = coord.approvals.mark_approved(*id)?;
|
||||||
tracing::info!(%approval.id, %approval.agent, %approval.commit_ref, "approval applied: rebuilding agent");
|
tracing::info!(%approval.id, %approval.agent, %approval.commit_ref, "approval applied: advancing main + rebuilding");
|
||||||
let agent_dir = coord.register_agent(&approval.agent)?;
|
let agent_dir = coord.register_agent(&approval.agent)?;
|
||||||
if let Err(e) =
|
let config_dir = Coordinator::agent_config_dir(&approval.agent);
|
||||||
lifecycle::rebuild(&approval.agent, &coord.agent_flake, &agent_dir).await
|
let result: anyhow::Result<()> = async {
|
||||||
{
|
lifecycle::apply_commit(&config_dir, &approval.commit_ref).await?;
|
||||||
|
lifecycle::rebuild(
|
||||||
|
&approval.agent,
|
||||||
|
&coord.hyperhive_flake,
|
||||||
|
&agent_dir,
|
||||||
|
&config_dir,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
.await;
|
||||||
|
if let Err(e) = result {
|
||||||
let note = format!("{e:#}");
|
let note = format!("{e:#}");
|
||||||
let _ = coord.approvals.mark_failed(approval.id, ¬e);
|
let _ = coord.approvals.mark_failed(approval.id, ¬e);
|
||||||
return Err(e);
|
return Err(e);
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,15 @@ in
|
||||||
defaultText = lib.literalExpression "pkgs.hyperhive";
|
defaultText = lib.literalExpression "pkgs.hyperhive";
|
||||||
description = "Package that provides /bin/hive-c0re.";
|
description = "Package that provides /bin/hive-c0re.";
|
||||||
};
|
};
|
||||||
agentFlake = lib.mkOption {
|
hyperhiveFlake = lib.mkOption {
|
||||||
type = lib.types.str;
|
type = lib.types.str;
|
||||||
default = "/etc/hyperhive#agent-base";
|
default = "/etc/hyperhive";
|
||||||
description = "Flake reference passed to `nixos-container create --flake` when spawning sub-agents.";
|
description = ''
|
||||||
|
URL of the hyperhive flake (no fragment). Inlined into each
|
||||||
|
per-agent `flake.nix` at `inputs.hyperhive.url`. The per-agent
|
||||||
|
flake then pulls `hyperhive.nixosConfigurations.agent-base` to
|
||||||
|
build the container.
|
||||||
|
'';
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -31,7 +36,7 @@ in
|
||||||
wantedBy = [ "multi-user.target" ];
|
wantedBy = [ "multi-user.target" ];
|
||||||
path = [ "/run/current-system/sw" ];
|
path = [ "/run/current-system/sw" ];
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
ExecStart = "${cfg.package}/bin/hive-c0re --socket /run/hyperhive/host.sock serve --agent-flake ${cfg.agentFlake}";
|
ExecStart = "${cfg.package}/bin/hive-c0re --socket /run/hyperhive/host.sock serve --hyperhive-flake ${cfg.hyperhiveFlake}";
|
||||||
Restart = "on-failure";
|
Restart = "on-failure";
|
||||||
RestartSec = 2;
|
RestartSec = 2;
|
||||||
RuntimeDirectory = "hyperhive";
|
RuntimeDirectory = "hyperhive";
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue