From 94781ccd0818d5eb243791255669b3a8f9b262f0 Mon Sep 17 00:00:00 2001
From: damocles <damocles@hyperhive>
Date: Wed, 20 May 2026 11:25:08 +0200
Subject: [PATCH] lifecycle: append container journal tail to failed
 nixos-container update

---
 CLAUDE.md                  | 11 +++++++++++
 hive-c0re/src/lifecycle.rs | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index b306ca1..bf5c0d1 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -185,6 +185,17 @@ read them à la carte.
 In-flight or recent context that hasn't earned a section yet.
 Prune freely.
 
+- **Just landed:** failed `nixos-container update` self-documents.
+  `lifecycle::run` now appends the tail (40 lines) of the target
+  container's own journal to the bail message when an `update`
+  fails. `nixos-container`'s own stderr on a reload-phase failure
+  is terse ("failed to reload container"); the real cause —
+  which unit failed `switch-to-configuration` — lives in the
+  *container* journal. Scoped to `update` (container's still up
+  on the old generation, so `journalctl -M` works); best-effort,
+  appends nothing if the journal can't be read. The manager's
+  `update` tool / rebuild errors now carry the failing-unit
+  detail without a second `get_logs` call.
 - **Just landed:** `hyperhive.westonRdp.enable` option. New
   `nix/templates/weston-rdp.nix` declares a per-agent bool;
   enabling it runs weston with the RDP backend as a systemd
diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs
index c48300c..ad96f6f 100644
--- a/hive-c0re/src/lifecycle.rs
+++ b/hive-c0re/src/lifecycle.rs
@@ -959,7 +959,40 @@ async fn run(args: &[&str]) -> Result<()> {
             .cloned()
             .collect::<Vec<_>>()
             .join("\n");
-        bail!("nixos-container {cmdline} failed ({status}): {tail}");
+        let journal = container_journal_tail(args).await;
+        bail!("nixos-container {cmdline} failed ({status}): {tail}{journal}");
     }
     Ok(())
 }
+
+/// Best-effort fetch of the target container's recent journal, for
+/// appending to the error raised by a failed `nixos-container update`.
+///
+/// `nixos-container`'s own stderr is often terse ("failed to reload
+/// container"); which unit failed `switch-to-configuration` during the
+/// reload phase is recorded in the *container's* journal rather than
+/// the host's, so the tail is pulled in here to save a second lookup.
+///
+/// Only `update` is handled: the container is still up on the old
+/// generation then, so `journalctl -M` can reach it. Every other verb,
+/// a missing container argument, a failed spawn, or empty output
+/// yields "" — this never reports an error of its own.
+async fn container_journal_tail(args: &[&str]) -> String {
+    // Expected shape: `update <container> …`; anything else is skipped.
+    let container = match args {
+        [verb, name, ..] if *verb == "update" => *name,
+        _ => return String::new(),
+    };
+    let fetched = Command::new("journalctl")
+        .args(["-M", container, "-n", "40", "--no-pager", "--output=short"])
+        .output()
+        .await;
+    // Spawn failure or silence from journalctl: nothing to append.
+    let Some(raw) = fetched.ok().map(|o| o.stdout).filter(|b| !b.is_empty()) else {
+        return String::new();
+    };
+    format!(
+        "\n--- last 40 journal lines from container '{container}' ---\n{}",
+        String::from_utf8_lossy(&raw).trim_end()
+    )
+}