From a42fdb3a5c3b78393c25ee61e69aabb43b49a5b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?m=C3=BCde?= Date: Fri, 15 May 2026 12:39:22 +0200 Subject: [PATCH] phase 8 step 1: per-agent claude creds bind + destroy keeps state --- CLAUDE.md | 39 +++++++++++++++++++--- PLAN.md | 57 ++++++++++++++++++++++++++++++++- hive-c0re/src/actions.rs | 20 ++++++------ hive-c0re/src/coordinator.rs | 8 +++++ hive-c0re/src/dashboard.rs | 2 +- hive-c0re/src/lifecycle.rs | 36 ++++++++++++++++++--- hive-c0re/src/manager_server.rs | 2 ++ hive-c0re/src/server.rs | 12 ++++++- hive-sh4re/src/lib.rs | 6 ++-- 9 files changed, 158 insertions(+), 24 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1acd0c8..eb1a4de 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -140,10 +140,13 @@ docs/damocles-migration.md options for moving damocles onto hyperhive (stable lags too far). The overlay imports unstable with its own `allowUnfreePredicate` so the access inside the overlay doesn't itself trip. - **Claude credentials are stateful and per-container.** No `ANTHROPIC_API_KEY` - env var path. For now: `nixos-container root-login h-<name>` → `claude` - (interactive) → log in once. The harness falls back to echo replies when - `claude --print` fails. Future: bind-mount a shared `~/.claude` dir from the - host so creds survive container destroy/recreate. + env var path. Today's stopgap: `nixos-container root-login h-<name>` → + `claude` (interactive) → log in once. The harness falls back to echo + replies when `claude --print` fails. **Phase 8** moves this to a per-agent + persistent dir at `/var/lib/hyperhive/agents/<name>/claude/` bind-mounted + into the container, with the interactive login driven from the agent's web + UI. Sharing one `~/.claude` across agents is NOT viable — OAuth refresh + tokens rotate, so any sibling refresh invalidates all the others. - **Echo guard.** `hive-ag3nt serve` skips auto-reply when the incoming body starts with `"echo: "`. Prevents ping-pong loops when both sides fall back to echo. 
Real conversations between claude-backed agents *will* runaway — @@ -217,6 +220,34 @@ already. `set_nspawn_flags` so sub-agent web UI ports are reachable on the host - `HYPERHIVE_GIT` env var (absolute path) bypasses PATH ambiguity +## Phase 8 — real claude in containers + login UX (in progress) + +See PLAN.md → "Phase 8" for the full design. Summary: + +- **Per-agent persistent creds dir.** Bind + `/var/lib/hyperhive/agents/<name>/claude/` → `/root/.claude` (RW) in + `set_nspawn_flags`. One OAuth lineage per agent; refresh rotations stay + contained to that agent. +- **State dirs persist by default.** `destroy` keeps + `/var/lib/hyperhive/agents/<name>/` unless the operator passes an explicit + wipe flag. Recreating an agent of the same name reuses prior creds. +- **First spawn is approval-gated.** New agent names go through the same + approval queue as config edits. Dashboard shows a spinner during + `nixos-container create` + `update` + `start`. +- **"needs login" partial-run state.** No valid session in `~/.claude/` → + harness binds the web UI but does NOT start the turn loop. Dashboard + surfaces this state per-agent. +- **Login from the per-agent web UI.** Spawn `claude /login` with plain + stdio pipes (no PTY initially), surface the OAuth URL from stdout on the + page, accept the resulting code via a paste field, write it to the process + stdin. On success, harness transitions out of "needs login" and enters the + turn loop. If pipes turn out to be insufficient (claude refuses without a + TTY, raw-mode input, ANSI-only output) we redo the backend with a PTY. + +Implementation order: bind-mount/dir creation → approval-gated spawn + +spinner → "needs login" partial run → login endpoint (stdio pipes first; PTY only if pipes prove insufficient). The login UI has +nowhere to live until the partial-run mode exists, so don't ship it earlier. 
+ ## Approval flow End-to-end: manager edits per-agent `proposed` repo → commits → submits commit diff --git a/PLAN.md b/PLAN.md index 67b5813..8aecbbb 100644 --- a/PLAN.md +++ b/PLAN.md @@ -99,7 +99,13 @@ A multi-Claude-Code-agent setup on a single host: **Manager concurrency = event loop.** `hive-m1nd` pulls from a heterogeneous `next_event` stream: inbound agent messages, replies to sync sends, lifecycle events from `hive-c0re` (crash, OOM, approval-resolved), and dashboard signals. One queue, claude turn per event. -**Anthropic credentials.** Shared key on host, bind-mounted into every container. No per-agent keys in v1. +**Anthropic credentials.** ~~Shared key on host~~ — revised in Phase 8. +Per-agent persistent `~/.claude/` dir bind-mounted from +`/var/lib/hyperhive/agents/<name>/claude/`. OAuth refresh tokens rotate, so +sharing across agents is a non-starter (any sibling refresh invalidates all +the others). One interactive login per agent, ever; creds survive +`destroy`/recreate by default. Login flow runs from the per-agent web UI +(see Phase 8). **Workdir bootstrap.** Each agent's `state/` starts empty. Initial-task message tells the agent what to clone/set up. Manager can drop big artefacts into `state/` directly (it has RW) and pass the path as a message reference. @@ -230,6 +236,55 @@ The original open-decisions list, with what we picked: subcommand (`serve` / `spawn` / `kill` / `rebuild` / `list` / `pending` / `approve` / `deny`). +### ⏳ Phase 8 — real claude in containers + login UX + +Until this lands the harness falls back to the echo path; we've never run an +end-to-end turn with a real model in a real container. + +**Credential model.** Per-agent persistent dir at +`/var/lib/hyperhive/agents/<name>/claude/` bind-mounted RW to `/root/.claude` +inside the container. *Not* shared across agents: OAuth refresh tokens rotate, +and sharing one dir means the first refresh by any sibling invalidates all the +others. 
Each agent owns its own token lineage from first login onward. + +**State-dir persistence.** Agent state dirs (including the claude creds dir) +persist across `destroy`/recreate by default. The `destroy` verb only purges +state when given an explicit "wipe" flag from the operator — recreating an +agent of the same name reuses prior creds with no re-login. + +**First-deploy approval.** Spawning a brand-new agent name goes through the +existing approval queue (same path as config edits). The dashboard shows a +spinner while `nixos-container create` + `update` + `start` run. + +**"needs login" agent state.** If the bound `~/.claude/` has no valid session, +the harness boots in a partial mode: per-agent web UI is up, but the turn +loop does NOT start. Dashboard surfaces the state per-agent so the operator +knows where to click. + +**Login over the per-agent web UI.** No more `nixos-container root-login` for +the common case. The agent's web UI exposes a "log in" action that: +1. Spawns `claude /login` (or equivalent) inside the container with plain + stdio pipes — no PTY unless we discover we need one. +2. Reads the OAuth URL from the process stdout and shows it on the page. +3. Provides a paste field for the resulting code; writes it to the process + stdin. +4. On success, transitions out of "needs login" and starts the turn loop. + +If `claude` turns out to require a TTY (refuses on `!isatty()`, uses raw-mode +input, or only renders the URL with ANSI styling), redo the backend with a +PTY (e.g. `portable-pty`). Don't pre-build for that — start simple. + +**Sequence.** Ship in this order — don't do (4) before (3) or there's nowhere +for the login UI to live: (1) bind-mount + per-agent dir creation in +`lifecycle::set_nspawn_flags`, (2) approval-gated first spawn + dashboard +spinner, (3) harness "needs login" partial-run mode, (4) login +endpoint on the per-agent UI (stdio pipes first; PTY only if pipes prove insufficient). 
+ +**Exit:** spawn a new agent from the dashboard → approve → wait for spinner +→ click "log in" on the agent's page → complete OAuth in the browser → +paste code → agent enters the turn loop and replies to a T4LK message via +real `claude --print`. + ## Polish backlog (not phased) See CLAUDE.md → "Polish backlog" for the live list. Highlights: operator diff --git a/hive-c0re/src/actions.rs b/hive-c0re/src/actions.rs index 1d5683f..4c5899b 100644 --- a/hive-c0re/src/actions.rs +++ b/hive-c0re/src/actions.rs @@ -21,6 +21,7 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> { let agent_dir = coord.register_agent(&approval.agent)?; let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent); let applied_dir = Coordinator::agent_applied_dir(&approval.agent); + let claude_dir = Coordinator::agent_claude_dir(&approval.agent); let result: Result<()> = async { lifecycle::apply_commit(&applied_dir, &proposed_dir, &approval.commit_ref).await?; lifecycle::rebuild( @@ -28,6 +29,7 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> { &coord.hyperhive_flake, &agent_dir, &applied_dir, + &claude_dir, ) .await } @@ -64,8 +66,14 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> { } } -/// Fully tear down a sub-agent. Refuses the manager (declarative; would fight -/// with the host's nixos config). +/// Tear down a sub-agent container. By default this is non-destructive to +/// persistent state: the proposed/applied config repos and the Claude +/// credentials dir under `/var/lib/hyperhive/{agents,applied}//` are +/// kept, so recreating an agent of the same name reuses prior config + creds +/// (no re-login). The ephemeral runtime dir under `/run/hyperhive/agents/` +/// is cleared because its contents (the mcp socket) don't survive restarts +/// anyway. A future `--purge` path can wipe state when the operator opts in. +/// Refuses the manager (declarative; would fight with the host's nixos config). 
pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> { if name == MANAGER_NAME || name == MANAGER_AGENT { bail!("refusing to destroy the manager ({name})"); @@ -77,14 +85,6 @@ pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> { if runtime.exists() { let _ = std::fs::remove_dir_all(&runtime); } - let state = Coordinator::agent_state_root(name); - if state.exists() { - let _ = std::fs::remove_dir_all(&state); - } - let applied = Coordinator::agent_applied_dir(name); - if applied.exists() { - let _ = std::fs::remove_dir_all(&applied); - } let _ = coord .approvals .fail_pending_for_agent(name, "agent destroyed"); diff --git a/hive-c0re/src/coordinator.rs b/hive-c0re/src/coordinator.rs index 12a5056..3430685 100644 --- a/hive-c0re/src/coordinator.rs +++ b/hive-c0re/src/coordinator.rs @@ -91,6 +91,14 @@ impl Coordinator { Self::agent_state_root(name).join("config") } + /// Per-agent Claude credentials dir. Bind-mounted RW into the agent + /// container at `/root/.claude` so OAuth state survives container + /// destroy/recreate. Each agent owns its own token lineage — sharing + /// would break on the first refresh-token rotation. + pub fn agent_claude_dir(name: &str) -> PathBuf { + Self::agent_state_root(name).join("claude") + } + /// Authoritative applied config repo. Hive-c0re-only. pub fn agent_applied_dir(name: &str) -> PathBuf { PathBuf::from(format!("{APPLIED_STATE_ROOT}/{name}")) diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs index beca307..8f8cb31 100644 --- a/hive-c0re/src/dashboard.rs +++ b/hive-c0re/src/dashboard.rs @@ -162,7 +162,7 @@ fn render_containers(containers: &[String], hostname: &str) -> String { let port = lifecycle::agent_web_port(name); let _ = writeln!( out, - "
  • ▒░▒░░ {name} ag3nt {container} :{port}\n
    \n
  • ", + "
  • ▒░▒░░ {name} ag3nt {container} :{port}\n
    \n
  • ", ); } } diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs index eb4738c..cc80066 100644 --- a/hive-c0re/src/lifecycle.rs +++ b/hive-c0re/src/lifecycle.rs @@ -16,6 +16,10 @@ pub const MANAGER_NAME: &str = "hm1nd"; /// Mount point of the per-agent runtime directory inside the container. pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive"; +/// Mount point of the per-agent Claude credentials dir inside the container. +/// Persistent across destroy/recreate so OAuth login survives. +pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude"; + const GIT_NAME: &str = "hive-c0re"; const GIT_EMAIL: &str = "hive-c0re@hyperhive"; @@ -66,14 +70,16 @@ pub async fn spawn( agent_dir: &Path, proposed_dir: &Path, applied_dir: &Path, + claude_dir: &Path, ) -> Result<()> { validate(name)?; setup_proposed(proposed_dir, name).await?; setup_applied(applied_dir, name, hyperhive_flake).await?; + ensure_claude_dir(claude_dir)?; let container = container_name(name); let flake_ref = format!("{}#default", applied_dir.display()); run(&["create", &container, "--flake", &flake_ref]).await?; - set_nspawn_flags(&container, agent_dir)?; + set_nspawn_flags(&container, agent_dir, claude_dir)?; set_resource_limits(&container)?; systemd_daemon_reload().await?; run(&["start", &container]).await @@ -108,12 +114,14 @@ pub async fn rebuild( hyperhive_flake: &str, agent_dir: &Path, applied_dir: &Path, + claude_dir: &Path, ) -> Result<()> { validate(name)?; setup_applied(applied_dir, name, hyperhive_flake).await?; + ensure_claude_dir(claude_dir)?; let container = container_name(name); let flake_ref = format!("{}#default", applied_dir.display()); - set_nspawn_flags(&container, agent_dir)?; + set_nspawn_flags(&container, agent_dir, claude_dir)?; set_resource_limits(&container)?; systemd_daemon_reload().await?; run(&["update", &container, "--flake", &flake_ref]).await?; @@ -248,6 +256,23 @@ pub async fn apply_commit(applied_dir: &Path, proposed_dir: &Path, commit_ref: & Ok(()) } +/// 
Create the per-agent Claude credentials dir if missing. Mode 0700 — only +/// root inside the container reads/writes it. Idempotent: existing dirs are +/// left untouched (an agent's OAuth tokens survive `destroy`/recreate). +fn ensure_claude_dir(claude_dir: &Path) -> Result<()> { + if !claude_dir.exists() { + std::fs::create_dir_all(claude_dir) + .with_context(|| format!("create {}", claude_dir.display()))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(claude_dir, std::fs::Permissions::from_mode(0o700)) + .with_context(|| format!("chmod {}", claude_dir.display()))?; + } + } + Ok(()) +} + fn initial_agent_nix(name: &str) -> String { format!( "{{ ... }}:\n{{\n # Per-agent overrides for {name}. The manager edits this\n # file (and commits) to customise the agent's NixOS config.\n}}\n", @@ -347,12 +372,13 @@ async fn systemd_daemon_reload() -> Result<()> { /// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind). /// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the /// `systemd-nspawn` command. 
-fn set_nspawn_flags(container: &str, agent_dir: &Path) -> Result<()> { +fn set_nspawn_flags(container: &str, agent_dir: &Path, claude_dir: &Path) -> Result<()> { let path = format!("/etc/nixos-containers/{container}.conf"); let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?; let bind_flag = format!( - "EXTRA_NSPAWN_FLAGS=\"--bind={}:{CONTAINER_RUNTIME_MOUNT}\"", - agent_dir.display() + "EXTRA_NSPAWN_FLAGS=\"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}\"", + runtime = agent_dir.display(), + claude = claude_dir.display(), ); let mut lines: Vec = original .lines() diff --git a/hive-c0re/src/manager_server.rs b/hive-c0re/src/manager_server.rs index 408bc4a..674e59d 100644 --- a/hive-c0re/src/manager_server.rs +++ b/hive-c0re/src/manager_server.rs @@ -97,12 +97,14 @@ async fn dispatch(req: &ManagerRequest, coord: &Coordinator) -> ManagerResponse let agent_dir = coord.register_agent(name)?; let proposed_dir = Coordinator::agent_proposed_dir(name); let applied_dir = Coordinator::agent_applied_dir(name); + let claude_dir = Coordinator::agent_claude_dir(name); if let Err(e) = lifecycle::spawn( name, &coord.hyperhive_flake, &agent_dir, &proposed_dir, &applied_dir, + &claude_dir, ) .await { diff --git a/hive-c0re/src/server.rs b/hive-c0re/src/server.rs index 447f94c..bd89c6f 100644 --- a/hive-c0re/src/server.rs +++ b/hive-c0re/src/server.rs @@ -64,12 +64,14 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse { let agent_dir = coord.register_agent(name)?; let proposed_dir = Coordinator::agent_proposed_dir(name); let applied_dir = Coordinator::agent_applied_dir(name); + let claude_dir = Coordinator::agent_claude_dir(name); if let Err(e) = lifecycle::spawn( name, &coord.hyperhive_flake, &agent_dir, &proposed_dir, &applied_dir, + &claude_dir, ) .await { @@ -93,7 +95,15 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse { tracing::info!(%name, "rebuild"); 
let agent_dir = coord.register_agent(name)?; let applied_dir = Coordinator::agent_applied_dir(name); - lifecycle::rebuild(name, &coord.hyperhive_flake, &agent_dir, &applied_dir).await?; + let claude_dir = Coordinator::agent_claude_dir(name); + lifecycle::rebuild( + name, + &coord.hyperhive_flake, + &agent_dir, + &applied_dir, + &claude_dir, + ) + .await?; HostResponse::success() } HostRequest::List => HostResponse::list(lifecycle::list().await?), diff --git a/hive-sh4re/src/lib.rs b/hive-sh4re/src/lib.rs index d4d5650..26bef10 100644 --- a/hive-sh4re/src/lib.rs +++ b/hive-sh4re/src/lib.rs @@ -16,8 +16,10 @@ pub enum HostRequest { Spawn { name: String }, /// Stop a managed container (graceful). Kill { name: String }, - /// Fully tear down a sub-agent: stop, wipe state + applied repo, drop the - /// systemd drop-in, purge pending approvals. Manager not destroyable. + /// Tear down a sub-agent container: stop + remove + drop the systemd + /// drop-in, purge pending approvals. Persistent state (proposed/applied + /// repos, Claude credentials) is KEPT by default — recreating the agent + /// with the same name reuses prior config + login. Manager not destroyable. Destroy { name: String }, /// Apply pending config to a managed container. Rebuild { name: String },