From 94781ccd0818d5eb243791255669b3a8f9b262f0 Mon Sep 17 00:00:00 2001 From: damocles Date: Wed, 20 May 2026 11:25:08 +0200 Subject: [PATCH] lifecycle: append container journal tail to failed nixos-container update --- CLAUDE.md | 11 +++++++++++ hive-c0re/src/lifecycle.rs | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index b306ca1..bf5c0d1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -185,6 +185,17 @@ read them à la carte. In-flight or recent context that hasn't earned a section yet. Prune freely. +- **Just landed:** failed `nixos-container update` self-documents. + `lifecycle::run` now appends the tail (40 lines) of the target + container's own journal to the bail message when an `update` + fails. `nixos-container`'s own stderr on a reload-phase failure + is terse ("failed to reload container"); the real cause — + which unit failed `switch-to-configuration` — lives in the + *container* journal. Scoped to `update` (container's still up + on the old generation, so `journalctl -M` works); best-effort, + appends nothing if the journal can't be read. The manager's + `update` tool / rebuild errors now carry the failing-unit + detail without a second `get_logs` call. - **Just landed:** `hyperhive.westonRdp.enable` option. 
New `nix/templates/weston-rdp.nix` declares a per-agent bool; enabling it runs weston with the RDP backend as a systemd
diff --git a/hive-c0re/src/lifecycle.rs b/hive-c0re/src/lifecycle.rs
index c48300c..ad96f6f 100644
--- a/hive-c0re/src/lifecycle.rs
+++ b/hive-c0re/src/lifecycle.rs
@@ -959,7 +959,40 @@ async fn run(args: &[&str]) -> Result<()> {
             .cloned()
             .collect::>()
             .join("\n");
-        bail!("nixos-container {cmdline} failed ({status}): {tail}");
+        let journal = container_journal_tail(args).await;
+        bail!("nixos-container {cmdline} failed ({status}): {tail}{journal}");
     }
     Ok(())
 }
+
+/// Best-effort tail of a container's own journal, formatted for appending
+/// to the error from a failed `nixos-container update`.
+///
+/// nixos-container's stderr on a reload-phase failure is terse ("failed to
+/// reload container"); the unit that failed `switch-to-configuration` is
+/// named in the *container's* journal, read here via `journalctl -M`.
+///
+/// `args` is the nixos-container argv: `["update", <container>, ..]`.
+/// Returns "" for any other verb, a missing container name, or when
+/// journalctl cannot run or prints nothing; never errors itself.
+async fn container_journal_tail(args: &[&str]) -> String {
+    // One constant feeds both `-n` and the header so they cannot drift.
+    const TAIL_LINES: &str = "40";
+    let (Some(&"update"), Some(container)) = (args.first(), args.get(1)) else {
+        return String::new();
+    };
+    let result = Command::new("journalctl")
+        .args(["-M", container, "-n", TAIL_LINES, "--no-pager", "--output=short"])
+        .output()
+        .await;
+    let Ok(output) = result else {
+        return String::new();
+    };
+    // Blank-only output would leave a dangling header; treat it as unreadable.
+    let text = String::from_utf8_lossy(&output.stdout);
+    let log = text.trim_end();
+    if log.is_empty() {
+        return String::new();
+    }
+    format!("\n--- last {TAIL_LINES} journal lines from container '{container}' ---\n{log}")
+}