phase 8 step 1: per-agent claude creds bind + destroy keeps state

This commit is contained in:
müde 2026-05-15 12:39:22 +02:00
parent 0fc287c768
commit a42fdb3a5c
9 changed files with 158 additions and 24 deletions

View file

@ -21,6 +21,7 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> {
let agent_dir = coord.register_agent(&approval.agent)?;
let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent);
let applied_dir = Coordinator::agent_applied_dir(&approval.agent);
let claude_dir = Coordinator::agent_claude_dir(&approval.agent);
let result: Result<()> = async {
lifecycle::apply_commit(&applied_dir, &proposed_dir, &approval.commit_ref).await?;
lifecycle::rebuild(
@ -28,6 +29,7 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> {
&coord.hyperhive_flake,
&agent_dir,
&applied_dir,
&claude_dir,
)
.await
}
@ -64,8 +66,14 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> {
}
}
/// Fully tear down a sub-agent. Refuses the manager (declarative; would fight
/// with the host's nixos config).
/// Tear down a sub-agent container. By default this is non-destructive to
/// persistent state: the proposed/applied config repos and the Claude
/// credentials dir under `/var/lib/hyperhive/{agents,applied}/<name>/` are
/// kept, so recreating an agent of the same name reuses prior config + creds
/// (no re-login). The ephemeral runtime dir under `/run/hyperhive/agents/`
/// is cleared because its contents (the mcp socket) don't survive restarts
/// anyway. A future `--purge` path can wipe state when the operator opts in.
/// Refuses the manager (declarative; would fight with the host's nixos config).
pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> {
if name == MANAGER_NAME || name == MANAGER_AGENT {
bail!("refusing to destroy the manager ({name})");
@ -77,14 +85,6 @@ pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> {
if runtime.exists() {
let _ = std::fs::remove_dir_all(&runtime);
}
let state = Coordinator::agent_state_root(name);
if state.exists() {
let _ = std::fs::remove_dir_all(&state);
}
let applied = Coordinator::agent_applied_dir(name);
if applied.exists() {
let _ = std::fs::remove_dir_all(&applied);
}
let _ = coord
.approvals
.fail_pending_for_agent(name, "agent destroyed");

View file

@ -91,6 +91,14 @@ impl Coordinator {
Self::agent_state_root(name).join("config")
}
/// Host-side directory holding the Claude credentials of agent `name`.
///
/// It is the `claude` entry under the agent's state root and gets
/// bind-mounted read-write into the container at `/root/.claude`, which is
/// what lets OAuth state outlive a container destroy/recreate. Every agent
/// gets a directory of its own: a shared token lineage would break on the
/// first refresh-token rotation.
pub fn agent_claude_dir(name: &str) -> PathBuf {
let state_root = Self::agent_state_root(name);
state_root.join("claude")
}
/// Authoritative applied config repo. Hive-c0re-only.
pub fn agent_applied_dir(name: &str) -> PathBuf {
PathBuf::from(format!("{APPLIED_STATE_ROOT}/{name}"))

View file

@ -162,7 +162,7 @@ fn render_containers(containers: &[String], hostname: &str) -> String {
let port = lifecycle::agent_web_port(name);
let _ = writeln!(
out,
"<li><span class=\"glyph\">▒░▒░░</span> <a href=\"http://{hostname}:{port}/\">{name}</a> <span class=\"role role-ag3nt\">ag3nt</span> <span class=\"meta\">{container} :{port}</span>\n <form method=\"POST\" action=\"/destroy/{name}\" class=\"inline\" onsubmit=\"return confirm('destroy {name}? this wipes the agent\\'s state.');\"><button class=\"btn btn-destroy\" type=\"submit\">DESTR0Y</button></form>\n</li>",
"<li><span class=\"glyph\">▒░▒░░</span> <a href=\"http://{hostname}:{port}/\">{name}</a> <span class=\"role role-ag3nt\">ag3nt</span> <span class=\"meta\">{container} :{port}</span>\n <form method=\"POST\" action=\"/destroy/{name}\" class=\"inline\" onsubmit=\"return confirm('destroy {name}? container is removed; state + creds kept.');\"><button class=\"btn btn-destroy\" type=\"submit\">DESTR0Y</button></form>\n</li>",
);
}
}

View file

@ -16,6 +16,10 @@ pub const MANAGER_NAME: &str = "hm1nd";
/// Mount point of the per-agent runtime directory inside the container.
pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
/// Mount point of the per-agent Claude credentials dir inside the container.
/// Used as the target side of a `--bind=` entry in `EXTRA_NSPAWN_FLAGS`; the
/// host-side source dir is persistent across destroy/recreate so the OAuth
/// login survives.
pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude";
const GIT_NAME: &str = "hive-c0re";
const GIT_EMAIL: &str = "hive-c0re@hyperhive";
@ -66,14 +70,16 @@ pub async fn spawn(
agent_dir: &Path,
proposed_dir: &Path,
applied_dir: &Path,
claude_dir: &Path,
) -> Result<()> {
validate(name)?;
setup_proposed(proposed_dir, name).await?;
setup_applied(applied_dir, name, hyperhive_flake).await?;
ensure_claude_dir(claude_dir)?;
let container = container_name(name);
let flake_ref = format!("{}#default", applied_dir.display());
run(&["create", &container, "--flake", &flake_ref]).await?;
set_nspawn_flags(&container, agent_dir)?;
set_nspawn_flags(&container, agent_dir, claude_dir)?;
set_resource_limits(&container)?;
systemd_daemon_reload().await?;
run(&["start", &container]).await
@ -108,12 +114,14 @@ pub async fn rebuild(
hyperhive_flake: &str,
agent_dir: &Path,
applied_dir: &Path,
claude_dir: &Path,
) -> Result<()> {
validate(name)?;
setup_applied(applied_dir, name, hyperhive_flake).await?;
ensure_claude_dir(claude_dir)?;
let container = container_name(name);
let flake_ref = format!("{}#default", applied_dir.display());
set_nspawn_flags(&container, agent_dir)?;
set_nspawn_flags(&container, agent_dir, claude_dir)?;
set_resource_limits(&container)?;
systemd_daemon_reload().await?;
run(&["update", &container, "--flake", &flake_ref]).await?;
@ -248,6 +256,23 @@ pub async fn apply_commit(applied_dir: &Path, proposed_dir: &Path, commit_ref: &
Ok(())
}
/// Create the per-agent Claude credentials dir if missing. Mode 0700 — only
/// root inside the container reads/writes it. Idempotent: existing dirs are
/// left untouched (an agent's OAuth tokens survive `destroy`/recreate).
fn ensure_claude_dir(claude_dir: &Path) -> Result<()> {
if !claude_dir.exists() {
std::fs::create_dir_all(claude_dir)
.with_context(|| format!("create {}", claude_dir.display()))?;
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
std::fs::set_permissions(claude_dir, std::fs::Permissions::from_mode(0o700))
.with_context(|| format!("chmod {}", claude_dir.display()))?;
}
}
Ok(())
}
fn initial_agent_nix(name: &str) -> String {
format!(
"{{ ... }}:\n{{\n # Per-agent overrides for {name}. The manager edits this\n # file (and commits) to customise the agent's NixOS config.\n}}\n",
@ -347,12 +372,13 @@ async fn systemd_daemon_reload() -> Result<()> {
/// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir and
/// Claude-credentials binds).
/// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
/// `systemd-nspawn` command.
fn set_nspawn_flags(container: &str, agent_dir: &Path) -> Result<()> {
fn set_nspawn_flags(container: &str, agent_dir: &Path, claude_dir: &Path) -> Result<()> {
let path = format!("/etc/nixos-containers/{container}.conf");
let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;
let bind_flag = format!(
"EXTRA_NSPAWN_FLAGS=\"--bind={}:{CONTAINER_RUNTIME_MOUNT}\"",
agent_dir.display()
"EXTRA_NSPAWN_FLAGS=\"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}\"",
runtime = agent_dir.display(),
claude = claude_dir.display(),
);
let mut lines: Vec<String> = original
.lines()

View file

@ -97,12 +97,14 @@ async fn dispatch(req: &ManagerRequest, coord: &Coordinator) -> ManagerResponse
let agent_dir = coord.register_agent(name)?;
let proposed_dir = Coordinator::agent_proposed_dir(name);
let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name);
if let Err(e) = lifecycle::spawn(
name,
&coord.hyperhive_flake,
&agent_dir,
&proposed_dir,
&applied_dir,
&claude_dir,
)
.await
{

View file

@ -64,12 +64,14 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse {
let agent_dir = coord.register_agent(name)?;
let proposed_dir = Coordinator::agent_proposed_dir(name);
let applied_dir = Coordinator::agent_applied_dir(name);
let claude_dir = Coordinator::agent_claude_dir(name);
if let Err(e) = lifecycle::spawn(
name,
&coord.hyperhive_flake,
&agent_dir,
&proposed_dir,
&applied_dir,
&claude_dir,
)
.await
{
@ -93,7 +95,15 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse {
tracing::info!(%name, "rebuild");
let agent_dir = coord.register_agent(name)?;
let applied_dir = Coordinator::agent_applied_dir(name);
lifecycle::rebuild(name, &coord.hyperhive_flake, &agent_dir, &applied_dir).await?;
let claude_dir = Coordinator::agent_claude_dir(name);
lifecycle::rebuild(
name,
&coord.hyperhive_flake,
&agent_dir,
&applied_dir,
&claude_dir,
)
.await?;
HostResponse::success()
}
HostRequest::List => HostResponse::list(lifecycle::list().await?),