From a42fdb3a5c3b78393c25ee61e69aabb43b49a5b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?m=C3=BCde?= <git@darkest.space>
Date: Fri, 15 May 2026 12:39:22 +0200
Subject: [PATCH] phase 8 step 1: per-agent claude creds bind + destroy keeps
 state

---
 CLAUDE.md                       | 39 +++++++++++++++++++---
 PLAN.md                         | 57 ++++++++++++++++++++++++++++++++-
 hive-c0re/src/actions.rs        | 20 ++++++------
 hive-c0re/src/coordinator.rs    |  8 +++++
 hive-c0re/src/dashboard.rs      |  2 +-
 hive-c0re/src/lifecycle.rs      | 36 ++++++++++++++++++---
 hive-c0re/src/manager_server.rs |  2 ++
 hive-c0re/src/server.rs         | 12 ++++++-
 hive-sh4re/src/lib.rs           |  6 ++--
 9 files changed, 158 insertions(+), 24 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 1acd0c8..eb1a4de 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -140,10 +140,13 @@ docs/damocles-migration.md   options for moving damocles onto hyperhive
   (stable lags too far). The overlay imports unstable with its own
   `allowUnfreePredicate` so the access inside the overlay doesn't itself trip.
 - **Claude credentials are stateful and per-container.** No `ANTHROPIC_API_KEY`
-  env var path. For now: `nixos-container root-login h-<name>` → `claude`
-  (interactive) → log in once. The harness falls back to echo replies when
-  `claude --print` fails. Future: bind-mount a shared `~/.claude` dir from the
-  host so creds survive container destroy/recreate.
+  env var path. Today's stopgap: `nixos-container root-login h-<name>` →
+  `claude` (interactive) → log in once. The harness falls back to echo
+  replies when `claude --print` fails. **Phase 8** moves this to a per-agent
+  persistent dir at `/var/lib/hyperhive/agents/<name>/claude/` bind-mounted
+  into the container, with the interactive login driven from the agent's web
+  UI. Sharing one `~/.claude` across agents is NOT viable — OAuth refresh
+  tokens rotate, so any sibling refresh invalidates all the others.
 - **Echo guard.** `hive-ag3nt serve` skips auto-reply when the incoming body
   starts with `"echo: "`. Prevents ping-pong loops when both sides fall back
   to echo. Real conversations between claude-backed agents *will* runaway —
@@ -217,6 +220,34 @@ already.
     `set_nspawn_flags` so sub-agent web UI ports are reachable on the host
   - `HYPERHIVE_GIT` env var (absolute path) bypasses PATH ambiguity
 
+## Phase 8 — real claude in containers + login UX (in progress)
+
+See PLAN.md → "Phase 8" for the full design. Summary:
+
+- **Per-agent persistent creds dir.** Bind
+  `/var/lib/hyperhive/agents/<name>/claude/` → `/root/.claude` (RW) in
+  `set_nspawn_flags`. One OAuth lineage per agent; refresh rotations stay
+  contained to that agent.
+- **State dirs persist by default.** `destroy` keeps
+  `/var/lib/hyperhive/agents/<name>/`; an operator wipe flag (`--purge`) is
+  planned, not built yet. Recreating an agent of the same name reuses prior creds.
+- **First spawn is approval-gated.** New agent names go through the same
+  approval queue as config edits. Dashboard shows a spinner during
+  `nixos-container create` + `update` + `start`.
+- **"needs login" partial-run state.** No valid session in `~/.claude/` →
+  harness binds the web UI but does NOT start the turn loop. Dashboard
+  surfaces this state per-agent.
+- **Login from the per-agent web UI.** Spawn `claude /login` with plain
+  stdio pipes (no PTY initially), surface the OAuth URL from stdout on the
+  page, accept the resulting code via a paste field, write it to the process
+  stdin. On success, harness transitions out of "needs login" and enters the
+  turn loop. If pipes turn out to be insufficient (claude refuses without a
+  TTY, raw-mode input, ANSI-only output) we redo the backend with a PTY.
+
+Implementation order: bind-mount/dir creation → approval-gated spawn +
+spinner → "needs login" partial run → login endpoint (stdio pipes first). The
+login UI has nowhere to live until the partial-run mode exists, so don't ship it earlier.
+
 ## Approval flow
 
 End-to-end: manager edits per-agent `proposed` repo → commits → submits commit
diff --git a/PLAN.md b/PLAN.md
index 67b5813..8aecbbb 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -99,7 +99,13 @@ A multi-Claude-Code-agent setup on a single host:
 
 **Manager concurrency = event loop.** `hive-m1nd` pulls from a heterogeneous `next_event` stream: inbound agent messages, replies to sync sends, lifecycle events from `hive-c0re` (crash, OOM, approval-resolved), and dashboard signals. One queue, claude turn per event.
 
-**Anthropic credentials.** Shared key on host, bind-mounted into every container. No per-agent keys in v1.
+**Anthropic credentials.** ~~Shared key on host~~ — revised in Phase 8.
+Per-agent persistent `~/.claude/` dir bind-mounted from
+`/var/lib/hyperhive/agents/<name>/claude/`. OAuth refresh tokens rotate, so
+sharing across agents is a non-starter (any sibling refresh invalidates all
+the others). One interactive login per agent, ever; creds survive
+`destroy`/recreate by default. Login flow runs from the per-agent web UI
+(see Phase 8).
 
 **Workdir bootstrap.** Each agent's `state/` starts empty. Initial-task message tells the agent what to clone/set up. Manager can drop big artefacts into `state/` directly (it has RW) and pass the path as a message reference.
 
@@ -230,6 +236,55 @@ The original open-decisions list, with what we picked:
   subcommand (`serve` / `spawn` / `kill` / `rebuild` / `list` / `pending` /
   `approve` / `deny`).
 
+### ⏳ Phase 8 — real claude in containers + login UX
+
+Until this lands the harness falls back to the echo path; we've never run an
+end-to-end turn with a real model in a real container.
+
+**Credential model.** Per-agent persistent dir at
+`/var/lib/hyperhive/agents/<name>/claude/` bind-mounted RW to `/root/.claude`
+inside the container. *Not* shared across agents: OAuth refresh tokens rotate,
+and sharing one dir means the first refresh by any sibling invalidates all the
+others. Each agent owns its own token lineage from first login onward.
+
+**State-dir persistence.** Agent state dirs (including the claude creds dir)
+persist across `destroy`/recreate by default. `destroy` will only purge state
+when given an explicit operator "wipe" flag (not implemented yet) — recreating
+an agent of the same name reuses prior creds with no re-login.
+
+**First-deploy approval.** Spawning a brand-new agent name goes through the
+existing approval queue (same path as config edits). The dashboard shows a
+spinner while `nixos-container create` + `update` + `start` run.
+
+**"needs login" agent state.** If the bound `~/.claude/` has no valid session,
+the harness boots in a partial mode: per-agent web UI is up, but the turn
+loop does NOT start. Dashboard surfaces the state per-agent so the operator
+knows where to click.
+
+**Login over the per-agent web UI.** No more `nixos-container root-login` for
+the common case. The agent's web UI exposes a "log in" action that:
+1. Spawns `claude /login` (or equivalent) inside the container with plain
+   stdio pipes — no PTY unless we discover we need one.
+2. Reads the OAuth URL from the process stdout and shows it on the page.
+3. Provides a paste field for the resulting code; writes it to the process
+   stdin.
+4. On success, transitions out of "needs login" and starts the turn loop.
+
+If `claude` turns out to require a TTY (refuses on `!isatty()`, uses raw-mode
+input, or only renders the URL with ANSI styling), redo the backend with a
+PTY (e.g. `portable-pty`). Don't pre-build for that — start simple.
+
+**Sequence.** Ship in this order — don't do (4) before (3) or there's nowhere
+for the login UI to live: (1) per-agent dir creation (`ensure_claude_dir`) +
+bind-mount in `lifecycle::set_nspawn_flags`, (2) approval-gated first spawn +
+dashboard spinner, (3) harness "needs login" partial-run mode, (4) login
+endpoint on the per-agent UI (stdio pipes first; PTY only if claude needs one).
+
+**Exit:** spawn a new agent from the dashboard → approve → wait for spinner
+→ click "log in" on the agent's page → complete OAuth in the browser →
+paste code → agent enters the turn loop and replies to a T4LK message via
+real `claude --print`.
+
 ## Polish backlog (not phased)
 
 See CLAUDE.md → "Polish backlog" for the live list. Highlights: operator
diff --git a/hive-c0re/src/actions.rs b/hive-c0re/src/actions.rs
index 1d5683f..4c5899b 100644
--- a/hive-c0re/src/actions.rs
+++ b/hive-c0re/src/actions.rs
@@ -21,6 +21,7 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> {
     let agent_dir = coord.register_agent(&approval.agent)?;
     let proposed_dir = Coordinator::agent_proposed_dir(&approval.agent);
     let applied_dir = Coordinator::agent_applied_dir(&approval.agent);
+    let claude_dir = Coordinator::agent_claude_dir(&approval.agent);
     let result: Result<()> = async {
         lifecycle::apply_commit(&applied_dir, &proposed_dir, &approval.commit_ref).await?;
         lifecycle::rebuild(
@@ -28,6 +29,7 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> {
             &coord.hyperhive_flake,
             &agent_dir,
             &applied_dir,
+            &claude_dir,
         )
         .await
     }
@@ -64,8 +66,14 @@ pub async fn approve(coord: &Coordinator, id: i64) -> Result<()> {
     }
 }
 
-/// Fully tear down a sub-agent. Refuses the manager (declarative; would fight
-/// with the host's nixos config).
+/// Tear down a sub-agent container. By default this is non-destructive to
+/// persistent state: the proposed/applied config repos and the Claude
+/// credentials dir under `/var/lib/hyperhive/{agents,applied}/<name>/` are
+/// kept, so recreating an agent of the same name reuses prior config + creds
+/// (no re-login). The ephemeral runtime dir under `/run/hyperhive/agents/`
+/// is cleared because its contents (the mcp socket) don't survive restarts
+/// anyway. A future `--purge` path can wipe state when the operator opts in.
+/// Refuses the manager (declarative; would fight with the host's nixos config).
 pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> {
     if name == MANAGER_NAME || name == MANAGER_AGENT {
         bail!("refusing to destroy the manager ({name})");
@@ -77,14 +85,6 @@ pub async fn destroy(coord: &Coordinator, name: &str) -> Result<()> {
     if runtime.exists() {
         let _ = std::fs::remove_dir_all(&runtime);
     }
-    let state = Coordinator::agent_state_root(name);
-    if state.exists() {
-        let _ = std::fs::remove_dir_all(&state);
-    }
-    let applied = Coordinator::agent_applied_dir(name);
-    if applied.exists() {
-        let _ = std::fs::remove_dir_all(&applied);
-    }
     let _ = coord
         .approvals
         .fail_pending_for_agent(name, "agent destroyed");
diff --git a/hive-c0re/src/coordinator.rs b/hive-c0re/src/coordinator.rs
index 12a5056..3430685 100644
--- a/hive-c0re/src/coordinator.rs
+++ b/hive-c0re/src/coordinator.rs
@@ -91,6 +91,14 @@ impl Coordinator {
         Self::agent_state_root(name).join("config")
     }
 
+    /// Per-agent Claude credentials dir: `claude/` under the agent's state
+    /// root. Bind-mounted RW into the agent container at `/root/.claude` so
+    /// OAuth state survives container destroy/recreate. Never shared between
+    /// agents — sharing would break on the first refresh-token rotation.
+    pub fn agent_claude_dir(name: &str) -> PathBuf {
+        Self::agent_state_root(name).join("claude")
+    }
+
     /// Authoritative applied config repo. Hive-c0re-only.
     pub fn agent_applied_dir(name: &str) -> PathBuf {
         PathBuf::from(format!("{APPLIED_STATE_ROOT}/{name}"))
diff --git a/hive-c0re/src/dashboard.rs b/hive-c0re/src/dashboard.rs
index beca307..8f8cb31 100644
--- a/hive-c0re/src/dashboard.rs
+++ b/hive-c0re/src/dashboard.rs
@@ -162,7 +162,7 @@ fn render_containers(containers: &[String], hostname: &str) -> String {
             let port = lifecycle::agent_web_port(name);
             let _ = writeln!(
                 out,
-                "<li><span class=\"glyph\">▒░▒░░</span> <a href=\"http://{hostname}:{port}/\">{name}</a> <span class=\"role role-ag3nt\">ag3nt</span> <span class=\"meta\">{container} :{port}</span>\n  <form method=\"POST\" action=\"/destroy/{name}\" class=\"inline\" onsubmit=\"return confirm('destroy {name}? this wipes the agent\\'s state.');\"><button class=\"btn btn-destroy\" type=\"submit\">DESTR0Y</button></form>\n</li>",
+                "<li><span class=\"glyph\">▒░▒░░</span> <a href=\"http://{hostname}:{port}/\">{name}</a> <span class=\"role role-ag3nt\">ag3nt</span> <span class=\"meta\">{container} :{port}</span>\n  <form method=\"POST\" action=\"/destroy/{name}\" class=\"inline\" onsubmit=\"return confirm('destroy {name}? container is removed; state + creds kept.');\"><button class=\"btn btn-destroy\" type=\"submit\">DESTR0Y</button></form>\n</li>",
             );
         }
     }
diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs
index eb4738c..cc80066 100644
--- a/hive-c0re/src/lifecycle.rs
+++ b/hive-c0re/src/lifecycle.rs
@@ -16,6 +16,10 @@ pub const MANAGER_NAME: &str = "hm1nd";
 /// Mount point of the per-agent runtime directory inside the container.
 pub const CONTAINER_RUNTIME_MOUNT: &str = "/run/hive";
 
+/// Mount point of the per-agent Claude credentials dir inside the container.
+/// The host-side dir persists across destroy/recreate, so OAuth login survives.
+pub const CONTAINER_CLAUDE_MOUNT: &str = "/root/.claude";
+
 const GIT_NAME: &str = "hive-c0re";
 const GIT_EMAIL: &str = "hive-c0re@hyperhive";
 
@@ -66,14 +70,16 @@ pub async fn spawn(
     agent_dir: &Path,
     proposed_dir: &Path,
     applied_dir: &Path,
+    claude_dir: &Path,
 ) -> Result<()> {
     validate(name)?;
     setup_proposed(proposed_dir, name).await?;
     setup_applied(applied_dir, name, hyperhive_flake).await?;
+    ensure_claude_dir(claude_dir)?;
     let container = container_name(name);
     let flake_ref = format!("{}#default", applied_dir.display());
     run(&["create", &container, "--flake", &flake_ref]).await?;
-    set_nspawn_flags(&container, agent_dir)?;
+    set_nspawn_flags(&container, agent_dir, claude_dir)?;
     set_resource_limits(&container)?;
     systemd_daemon_reload().await?;
     run(&["start", &container]).await
@@ -108,12 +114,14 @@ pub async fn rebuild(
     hyperhive_flake: &str,
     agent_dir: &Path,
     applied_dir: &Path,
+    claude_dir: &Path,
 ) -> Result<()> {
     validate(name)?;
     setup_applied(applied_dir, name, hyperhive_flake).await?;
+    ensure_claude_dir(claude_dir)?;
     let container = container_name(name);
     let flake_ref = format!("{}#default", applied_dir.display());
-    set_nspawn_flags(&container, agent_dir)?;
+    set_nspawn_flags(&container, agent_dir, claude_dir)?;
     set_resource_limits(&container)?;
     systemd_daemon_reload().await?;
     run(&["update", &container, "--flake", &flake_ref]).await?;
@@ -248,6 +256,23 @@ pub async fn apply_commit(applied_dir: &Path, proposed_dir: &Path, commit_ref: &
     Ok(())
 }
 
+/// Create the per-agent Claude credentials dir (owner-only, mode 0700 on unix)
+/// if missing. Existing dirs are left untouched, so an agent's OAuth tokens
+/// survive `destroy`/recreate; a non-directory at that path is an error.
+fn ensure_claude_dir(claude_dir: &Path) -> Result<()> {
+    if !claude_dir.is_dir() { // not `exists()`: a stray file must fail here, not at bind time
+        std::fs::create_dir_all(claude_dir)
+            .with_context(|| format!("create {}", claude_dir.display()))?;
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            std::fs::set_permissions(claude_dir, std::fs::Permissions::from_mode(0o700))
+                .with_context(|| format!("chmod {}", claude_dir.display()))?;
+        }
+    }
+    Ok(())
+}
+
 fn initial_agent_nix(name: &str) -> String {
     format!(
         "{{ ... }}:\n{{\n  # Per-agent overrides for {name}. The manager edits this\n  # file (and commits) to customise the agent's NixOS config.\n}}\n",
@@ -347,12 +372,13 @@ async fn systemd_daemon_reload() -> Result<()> {
 /// is reachable on the host) and `EXTRA_NSPAWN_FLAGS` (the runtime-dir bind).
 /// The start script expands `$EXTRA_NSPAWN_FLAGS` unquoted into the
 /// `systemd-nspawn` command.
-fn set_nspawn_flags(container: &str, agent_dir: &Path) -> Result<()> {
+fn set_nspawn_flags(container: &str, agent_dir: &Path, claude_dir: &Path) -> Result<()> {
     let path = format!("/etc/nixos-containers/{container}.conf");
     let original = std::fs::read_to_string(&path).with_context(|| format!("read {path}"))?;
     let bind_flag = format!(
-        "EXTRA_NSPAWN_FLAGS=\"--bind={}:{CONTAINER_RUNTIME_MOUNT}\"",
-        agent_dir.display()
+        "EXTRA_NSPAWN_FLAGS=\"--bind={runtime}:{CONTAINER_RUNTIME_MOUNT} --bind={claude}:{CONTAINER_CLAUDE_MOUNT}\"",
+        runtime = agent_dir.display(),
+        claude = claude_dir.display(),
     );
     let mut lines: Vec<String> = original
         .lines()
diff --git a/hive-c0re/src/manager_server.rs b/hive-c0re/src/manager_server.rs
index 408bc4a..674e59d 100644
--- a/hive-c0re/src/manager_server.rs
+++ b/hive-c0re/src/manager_server.rs
@@ -97,12 +97,14 @@ async fn dispatch(req: &ManagerRequest, coord: &Coordinator) -> ManagerResponse
                 let agent_dir = coord.register_agent(name)?;
                 let proposed_dir = Coordinator::agent_proposed_dir(name);
                 let applied_dir = Coordinator::agent_applied_dir(name);
+                let claude_dir = Coordinator::agent_claude_dir(name);
                 if let Err(e) = lifecycle::spawn(
                     name,
                     &coord.hyperhive_flake,
                     &agent_dir,
                     &proposed_dir,
                     &applied_dir,
+                    &claude_dir,
                 )
                 .await
                 {
diff --git a/hive-c0re/src/server.rs b/hive-c0re/src/server.rs
index 447f94c..bd89c6f 100644
--- a/hive-c0re/src/server.rs
+++ b/hive-c0re/src/server.rs
@@ -64,12 +64,14 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse {
                 let agent_dir = coord.register_agent(name)?;
                 let proposed_dir = Coordinator::agent_proposed_dir(name);
                 let applied_dir = Coordinator::agent_applied_dir(name);
+                let claude_dir = Coordinator::agent_claude_dir(name);
                 if let Err(e) = lifecycle::spawn(
                     name,
                     &coord.hyperhive_flake,
                     &agent_dir,
                     &proposed_dir,
                     &applied_dir,
+                    &claude_dir,
                 )
                 .await
                 {
@@ -93,7 +95,15 @@ async fn dispatch(req: &HostRequest, coord: &Coordinator) -> HostResponse {
                 tracing::info!(%name, "rebuild");
                 let agent_dir = coord.register_agent(name)?;
                 let applied_dir = Coordinator::agent_applied_dir(name);
-                lifecycle::rebuild(name, &coord.hyperhive_flake, &agent_dir, &applied_dir).await?;
+                let claude_dir = Coordinator::agent_claude_dir(name);
+                lifecycle::rebuild(
+                    name,
+                    &coord.hyperhive_flake,
+                    &agent_dir,
+                    &applied_dir,
+                    &claude_dir,
+                )
+                .await?;
                 HostResponse::success()
             }
             HostRequest::List => HostResponse::list(lifecycle::list().await?),
diff --git a/hive-sh4re/src/lib.rs b/hive-sh4re/src/lib.rs
index d4d5650..26bef10 100644
--- a/hive-sh4re/src/lib.rs
+++ b/hive-sh4re/src/lib.rs
@@ -16,8 +16,10 @@ pub enum HostRequest {
     Spawn { name: String },
     /// Stop a managed container (graceful).
     Kill { name: String },
-    /// Fully tear down a sub-agent: stop, wipe state + applied repo, drop the
-    /// systemd drop-in, purge pending approvals. Manager not destroyable.
+    /// Tear down a sub-agent container: stop + remove + drop the systemd
+    /// drop-in, fail pending approvals. Persistent state (proposed/applied
+    /// repos, Claude credentials) is KEPT by default — recreating the agent
+    /// with the same name reuses prior config + login. Manager not destroyable.
     Destroy { name: String },
     /// Apply pending config to a managed container.
     Rebuild { name: String },