agent badges: split into ctx (last-inference) + cost (cumulative)

the existing ctx badge was misnamed: it summed `result.usage`, which is the cumulative token count billed across every inference in the turn. for tool-heavy turns that easily exceeds the model's context window (a 600k cached prefix × 15 sub-calls = 9M cache_read), making it useless as a "should i compact?" signal.

now there are two separate badges:

- ctx · N — the last inference's prompt size, i.e. the actual context window in use right now. parsed from each `assistant` event's `.message.usage`; the harness tracks the most recent one across the stream and snapshots it when the `result` event lands.
- cost · M — cumulative tokens billed across the whole turn (the previous behaviour, now correctly labelled).

both update via a single `TokenUsageChanged { ctx, cost }` SSE event at turn-end.

turn_stats grows four columns (`last_input_tokens`, `last_output_tokens`, `last_cache_read_input_tokens`, `last_cache_creation_input_tokens`) so the cold-load seed can paint both badges on page load. migrations run try-and-ignore ALTERs so existing agent dbs catch up; pre-migration rows have all-zero last-inference columns, so they yield no `ctx` seed (the badge stays empty until the next turn) rather than a misleading 0.
2026-05-18 18:48:35 +02:00 · 2026-05-18 18:48:35 +02:00 · 5c6c607e25
commit 5c6c607e25
parent 14549dd8a9
9 changed files with 267 additions and 101 deletions
--- a/docs/web-ui.md
+++ b/docs/web-ui.md
@ -310,13 +310,22 @@ Layout, top to bottom:
    `turn_state_since`.
  - Model chip: `model · <name>` (e.g. `model · haiku`). Driven
    by `LiveEvent::ModelChanged`; emitted from `Bus::set_model`.
-  - Ctx badge: `ctx · 142k` — total prompt tokens in the
-    current context window (input + cache_read + cache_write),
-    mirroring claude code's bottom-right indicator. Hover for
-    the breakdown including output. Driven by
-    `LiveEvent::TokenUsageChanged`; emitted from
-    `Bus::record_usage` whenever the terminal `result` event
-    delivers a fresh usage block.
+  - Ctx badge: `ctx · 142k` — last inference's prompt size
+    (input + cache_read + cache_write of the most recent
+    model call in the just-ended turn). This is the **actual
+    context window utilisation** — the number to watch when
+    deciding whether to compact.
+  - Cost badge: `cost · 1.3M` — cumulative tokens billed
+    across **every inference** in the last turn (sum of all
+    per-call prompts). Tool-heavy turns rebill the cached
+    prefix per call, so this routinely exceeds the model's
+    window — it's a cost signal, not a size signal.
+  - Both badges driven by `LiveEvent::TokenUsageChanged {
+    ctx, cost }`, emitted once at turn-end from
+    `Bus::record_turn_usage`. The harness tracks per-inference
+    usage by walking `assistant` events in the stream-json
+    and updating `last_inference` on each one; the `result`
+    event supplies `cost` and triggers the emit.
  - Last-turn chip: `last turn 12.3s` appears after the first
    turn ends, computed from the state-since deltas.
  - `■ cancel turn` button: visible only while state=thinking,
@ -437,8 +446,11 @@ Bus events (new vocabulary on `/events/stream`):
  `needs_login_idle` / `needs_login_in_progress`. Drives the
  alive-badge.
 - `model_changed { model }` — drives the model chip.
- `token_usage_changed { usage: TokenUsage }` — drives the
-  ctx-badge. Emitted from `Bus::record_usage` whenever the
-  stream-json `result` event delivers a fresh usage block.
+- `token_usage_changed { ctx: TokenUsage, cost: TokenUsage }`
+  — drives the ctx + cost badges. Emitted from
+  `Bus::record_turn_usage` at turn-end; `ctx` is the last
+  inference's usage (current context size); `cost` is the
+  cumulative usage across every inference (the `result`
+  event's totals).
 - `turn_state_changed { state, since_unix }` — drives the
  state badge (`idle`/`thinking`/`compacting`).
--- a/hive-ag3nt/assets/app.js
+++ b/hive-ag3nt/assets/app.js
@ -525,30 +525,43 @@
    el_.textContent = 'model · ' + model;
    el_.title = `claude --model ${model}\nset via the operator's /model command; persists across turns until changed`;
  }
-  // Context badge — mirrors Claude Code's bottom-right "N tokens"
-  // indicator. Primary number is total prompt tokens used in the
-  // current context window (input + both cache axes); hover for the
-  // breakdown including output. Kept as chrome on the state row so
-  // the terminal stays the star.
-  function renderTokenUsage(u) {
-    const el_ = $('ctx-badge');
+  // Token badges — two separate chips:
+  //   ctx · N    last inference's prompt size = current context window
+  //              utilisation (what to watch for compaction decisions)
+  //   cost · M   cumulative billed tokens across the whole last turn
+  //              (sum across every inference; tool-heavy turns rebill
+  //              the cached prompt per call and blow past the model's
+  //              context window — this is a cost signal, not a size
+  //              signal)
+  // Both fed by the same `token_usage_changed` SSE event (`{ ctx, cost }`).
+  const fmtTokens = (n) => {
+    if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M';
+    if (n >= 1_000) return Math.round(n / 1000) + 'k';
+    return String(n);
+  };
+  function renderOneUsage(elId, label, u, blurb) {
+    const el_ = $(elId);
    if (!el_) return;
    if (!u) { el_.hidden = true; return; }
-    const ctx = u.input_tokens + u.cache_read_input_tokens + u.cache_creation_input_tokens;
-    const fmt = (n) => {
-      if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M';
-      if (n >= 1_000) return Math.round(n / 1000) + 'k';
-      return String(n);
-    };
+    const total = u.input_tokens + u.cache_read_input_tokens + u.cache_creation_input_tokens;
    el_.hidden = false;
    el_.title = [
-      'context window in use',
+      blurb,
      'input: ' + u.input_tokens,
      'cache_read: ' + u.cache_read_input_tokens,
      'cache_write: ' + u.cache_creation_input_tokens,
-      'output (last turn): ' + u.output_tokens,
+      'output: ' + u.output_tokens,
    ].join('\n');
-    el_.textContent = 'ctx · ' + fmt(ctx);
+    el_.textContent = label + ' · ' + fmtTokens(total);
+  }
+  function renderTokenUsage(ev) {
+    // `ev` is `{ ctx, cost }` either off /api/state cold-load (each may
+    // be null) or off a `token_usage_changed` SSE event (both present
+    // post-turn).
+    renderOneUsage('ctx-badge', 'ctx', ev && ev.ctx,
+      'last-inference prompt size — the actual context window in use right now');
+    renderOneUsage('cost-badge', 'cost', ev && ev.cost,
+      'cumulative tokens billed across the last turn (sum across every inference)');
  }
  function renderLastTurn(ms) {
    const el_ = $('last-turn');
@ -626,7 +639,7 @@
      }
      renderAliveBadge(s.status);
      renderModelChip(s.model);
-      renderTokenUsage(s.token_usage);
+      renderTokenUsage({ ctx: s.ctx_usage, cost: s.cost_usage });
      // Open-threads aren't part of /api/state (kept on the broker
      // db, fetched via the per-agent socket). Cold-load fetches
      // it here; turn_end refreshes it via the renderer below.
@ -1026,7 +1039,7 @@
        },
        model_changed(ev, api) { if (!api.fromHistory) renderModelChip(ev.model); },
        token_usage_changed(ev, api) {
-          if (!api.fromHistory) renderTokenUsage(ev.usage);
+          if (!api.fromHistory) renderTokenUsage({ ctx: ev.ctx, cost: ev.cost });
        },
        turn_state_changed(ev, api) {
          if (!api.fromHistory) setStateAbs(ev.state, ev.since_unix);
--- a/hive-ag3nt/assets/index.html
+++ b/hive-ag3nt/assets/index.html
@ -18,6 +18,7 @@
    <span id="state-badge" class="state-badge state-loading">… booting</span>
    <span id="model-chip" class="model-chip" hidden></span>
    <span id="ctx-badge" class="ctx-badge" hidden title="tokens used in the current context window"></span>
+    <span id="cost-badge" class="ctx-badge" hidden title="cumulative tokens billed across the last turn (sum across every inference; tool-heavy turns rebill the cached prompt per call)"></span>
    <span id="last-turn" class="last-turn" hidden></span>
    <button type="button" id="cancel-btn" class="btn-cancel-turn" hidden>■ cancel turn</button>
    <button type="button" id="new-session-btn" class="btn-new-session"
--- a/hive-ag3nt/src/bin/hive-ag3nt.rs
+++ b/hive-ag3nt/src/bin/hive-ag3nt.rs
@ -74,10 +74,11 @@ async fn main() -> Result<()> {
            let login_state = Arc::new(Mutex::new(initial));
            let bus = Bus::new();
            let stats = TurnStats::open_default();
-            if let Some(s) = &stats
-                && let Some(u) = s.last_usage()
-            {
-                bus.seed_usage(u);
+            if let Some(s) = &stats {
+                let (ctx, cost) = s.last_usage();
+                if ctx.is_some() || cost.is_some() {
+                    bus.seed_usage(ctx, cost);
+                }
            }
            let files = turn::TurnFiles::prepare(&cli.socket, &label, mcp::Flavor::Agent).await?;
            let turn_lock: TurnLock = Arc::new(tokio::sync::Mutex::new(()));
@ -354,7 +355,8 @@ fn build_row(
    open_threads_count: Option<u64>,
    open_reminders_count: Option<u64>,
 ) -> TurnStatRow {
-    let usage = bus.last_usage().unwrap_or_default();
+    let cost = bus.last_cost_usage().unwrap_or_default();
+    let ctx = bus.last_ctx_usage().unwrap_or(cost);
    let tool_calls = bus.take_tool_calls();
    let tool_call_count: u64 = tool_calls.values().copied().sum();
    let tool_call_breakdown_json = if tool_calls.is_empty() {
@ -373,10 +375,14 @@ fn build_row(
        duration_ms,
        model,
        wake_from,
-        input_tokens: usage.input_tokens,
-        output_tokens: usage.output_tokens,
-        cache_read_input_tokens: usage.cache_read_input_tokens,
-        cache_creation_input_tokens: usage.cache_creation_input_tokens,
+        input_tokens: cost.input_tokens,
+        output_tokens: cost.output_tokens,
+        cache_read_input_tokens: cost.cache_read_input_tokens,
+        cache_creation_input_tokens: cost.cache_creation_input_tokens,
+        last_input_tokens: ctx.input_tokens,
+        last_output_tokens: ctx.output_tokens,
+        last_cache_read_input_tokens: ctx.cache_read_input_tokens,
+        last_cache_creation_input_tokens: ctx.cache_creation_input_tokens,
        tool_call_count,
        tool_call_breakdown_json,
        open_threads_count,
--- a/hive-ag3nt/src/bin/hive-m1nd.rs
+++ b/hive-ag3nt/src/bin/hive-m1nd.rs
@ -64,10 +64,11 @@ async fn main() -> Result<()> {
            let login_state = Arc::new(Mutex::new(initial));
            let bus = Bus::new();
            let stats = TurnStats::open_default();
-            if let Some(s) = &stats
-                && let Some(u) = s.last_usage()
-            {
-                bus.seed_usage(u);
+            if let Some(s) = &stats {
+                let (ctx, cost) = s.last_usage();
+                if ctx.is_some() || cost.is_some() {
+                    bus.seed_usage(ctx, cost);
+                }
            }
            let files = turn::TurnFiles::prepare(&cli.socket, &label, mcp::Flavor::Manager).await?;
            let turn_lock: TurnLock = Arc::new(tokio::sync::Mutex::new(()));
@ -291,7 +292,8 @@ fn build_row(
    open_threads_count: Option<u64>,
    open_reminders_count: Option<u64>,
 ) -> TurnStatRow {
-    let usage = bus.last_usage().unwrap_or_default();
+    let cost = bus.last_cost_usage().unwrap_or_default();
+    let ctx = bus.last_ctx_usage().unwrap_or(cost);
    let tool_calls = bus.take_tool_calls();
    let tool_call_count: u64 = tool_calls.values().copied().sum();
    let tool_call_breakdown_json = if tool_calls.is_empty() {
@ -310,10 +312,14 @@ fn build_row(
        duration_ms,
        model,
        wake_from,
-        input_tokens: usage.input_tokens,
-        output_tokens: usage.output_tokens,
-        cache_read_input_tokens: usage.cache_read_input_tokens,
-        cache_creation_input_tokens: usage.cache_creation_input_tokens,
+        input_tokens: cost.input_tokens,
+        output_tokens: cost.output_tokens,
+        cache_read_input_tokens: cost.cache_read_input_tokens,
+        cache_creation_input_tokens: cost.cache_creation_input_tokens,
+        last_input_tokens: ctx.input_tokens,
+        last_output_tokens: ctx.output_tokens,
+        last_cache_read_input_tokens: ctx.cache_read_input_tokens,
+        last_cache_creation_input_tokens: ctx.cache_creation_input_tokens,
        tool_call_count,
        tool_call_breakdown_json,
        open_threads_count,
--- a/hive-ag3nt/src/events.rs
+++ b/hive-ag3nt/src/events.rs
@ -130,10 +130,15 @@ pub enum LiveEvent {
    /// updates the chip + the per-turn stats sink will key off this
    /// to mark the boundary in its log.
    ModelChanged { model: String },
-    /// Final-turn `usage` block landed (input + output + cache
-    /// counters). Powers the context-window badge + accumulates into
-    /// the per-turn stats sink.
-    TokenUsageChanged { usage: TokenUsage },
+    /// Token usage for the turn just ended. Carries two snapshots:
+    /// - `ctx` is the LAST inference's usage block (the actual context
+    ///   window in use right now — what the operator needs to decide
+    ///   whether to compact / reset).
+    /// - `cost` is the cumulative usage across every inference in the
+    ///   turn (sum of per-call billed tokens — the cost signal). For
+    ///   tool-heavy turns the cumulative blows past the model's window
+    ///   because each tool call's prompt is rebilled.
+    TokenUsageChanged { ctx: TokenUsage, cost: TokenUsage },
    /// Harness's `TurnState` transitioned (idle / thinking /
    /// compacting). `since_unix` matches `Bus::state_snapshot().1`
    /// so the client's elapsed-time ticker keeps progressing across
@ -221,15 +226,29 @@ impl TokenUsage {
        self.input_tokens + self.cache_read_input_tokens + self.cache_creation_input_tokens
    }

-    /// Parse usage from a stream-json event. Returns `Some` only for the
-    /// terminal `result` event (which is the only one that carries `usage`);
-    /// every other event maps to `None`. Missing numeric fields default to 0
-    /// so partial server payloads don't drop the whole snapshot.
+    /// Parse usage from the terminal `result` stream-json event. This is the
+    /// **cumulative** sum across every inference in the turn — useful as a
+    /// cost signal, but NOT the current context size (a tool-heavy turn
+    /// sums per-call cached prompts and easily exceeds the model window).
    pub fn from_stream_event(v: &serde_json::Value) -> Option<Self> {
        if v.get("type").and_then(|t| t.as_str()) != Some("result") {
            return None;
        }
-        let u = v.get("usage")?;
+        Self::from_usage_obj(v.get("usage")?)
+    }
+
+    /// Parse usage from a per-inference `assistant` event's
+    /// `.message.usage` block. Each turn fires one of these for every
+    /// model call; tracking the LAST one over the turn gives the actual
+    /// conversation context size — the number to watch for compaction.
+    pub fn from_assistant_event(v: &serde_json::Value) -> Option<Self> {
+        if v.get("type").and_then(|t| t.as_str()) != Some("assistant") {
+            return None;
+        }
+        Self::from_usage_obj(v.get("message")?.get("usage")?)
+    }
+
+    fn from_usage_obj(u: &serde_json::Value) -> Option<Self> {
        let field = |k: &str| u.get(k).and_then(serde_json::Value::as_u64).unwrap_or(0);
        Some(Self {
            input_tokens: field("input_tokens"),
@ -281,12 +300,16 @@ pub struct Bus {
    /// Model name passed to `claude --model`. Default `haiku`; the
    /// operator can override at runtime via `POST /api/model`.
    model: Arc<Mutex<String>>,
-    /// Last token usage reported by claude (from the `result` stream-json
-    /// event). `None` until the first turn with usage data completes.
-    /// Updated on every turn; survives across turns within one harness
-    /// process lifetime (resets on container restart, which is fine —
-    /// it's a live indicator, not a cumulative counter).
-    last_usage: Arc<Mutex<Option<TokenUsage>>>,
+    /// Last-inference token usage from the most recent turn's final
+    /// `assistant` event. Represents the actual context window size at
+    /// turn-end — the number the operator watches to decide whether to
+    /// compact. `None` until the first turn completes.
+    last_ctx_usage: Arc<Mutex<Option<TokenUsage>>>,
+    /// Cumulative token usage from the most recent turn's `result`
+    /// event (sum across every inference in the turn). This is the cost
+    /// signal — tool-heavy turns rebill the cached prompt per call and
+    /// blow past the model window. `None` until the first turn completes.
+    last_cost_usage: Arc<Mutex<Option<TokenUsage>>>,
    /// One-shot: next `run_claude` call drops `--continue`, starting
    /// a fresh claude session. Set by `POST /api/new-session` from
    /// the per-agent web UI; consumed (cleared back to false) by the
@ -323,7 +346,8 @@ impl Bus {
            store,
            state: Arc::new(Mutex::new((TurnState::Idle, now_unix()))),
            model: Arc::new(Mutex::new(initial_model)),
-            last_usage: Arc::new(Mutex::new(None)),
+            last_ctx_usage: Arc::new(Mutex::new(None)),
+            last_cost_usage: Arc::new(Mutex::new(None)),
            skip_continue_once: Arc::new(AtomicBool::new(false)),
            tool_calls: Arc::new(Mutex::new(std::collections::HashMap::new())),
        }
@ -378,19 +402,27 @@ impl Bus {
        self.emit(LiveEvent::ModelChanged { model: value });
    }

-    /// Seed `last_usage` at startup without emitting a SSE event.
-    /// Used by the bin entrypoints to backfill from the most recent
-    /// `turn_stats` row so the per-agent web UI's `ctx-badge` paints
-    /// real numbers on cold load instead of staying empty until the
-    /// next turn finishes.
-    pub fn seed_usage(&self, usage: TokenUsage) {
-        *self.last_usage.lock().unwrap() = Some(usage);
+    /// Seed `last_ctx_usage` + `last_cost_usage` at startup without
+    /// emitting a SSE event. Used by the bin entrypoints to backfill
+    /// from the most recent `turn_stats` row so the per-agent web UI's
+    /// ctx + cost badges paint real numbers on cold load.
+    pub fn seed_usage(&self, ctx: Option<TokenUsage>, cost: Option<TokenUsage>) {
+        if ctx.is_some() {
+            *self.last_ctx_usage.lock().unwrap() = ctx;
+        }
+        if cost.is_some() {
+            *self.last_cost_usage.lock().unwrap() = cost;
+        }
    }

-    /// Record the latest token usage from a completed turn.
-    pub fn record_usage(&self, usage: TokenUsage) {
-        *self.last_usage.lock().unwrap() = Some(usage);
-        self.emit(LiveEvent::TokenUsageChanged { usage });
+    /// Record the just-ended turn's usage. `ctx` is the last inference's
+    /// usage (current context size); `cost` is the cumulative across
+    /// every inference in the turn (cost signal). One SSE event fires
+    /// per turn carrying both.
+    pub fn record_turn_usage(&self, ctx: TokenUsage, cost: TokenUsage) {
+        *self.last_ctx_usage.lock().unwrap() = Some(ctx);
+        *self.last_cost_usage.lock().unwrap() = Some(cost);
+        self.emit(LiveEvent::TokenUsageChanged { ctx, cost });
    }

    /// Walk a stream-json value for `tool_use` blocks and bump the
@ -430,10 +462,18 @@ impl Bus {
        std::mem::take(&mut *self.tool_calls.lock().unwrap())
    }

-    /// Last known token usage, or `None` if no turn has completed yet.
+    /// Last context-size snapshot (last inference of the most recent
+    /// turn), or `None` if no turn has completed yet.
    #[must_use]
-    pub fn last_usage(&self) -> Option<TokenUsage> {
-        *self.last_usage.lock().unwrap()
+    pub fn last_ctx_usage(&self) -> Option<TokenUsage> {
+        *self.last_ctx_usage.lock().unwrap()
+    }
+
+    /// Last cumulative cost snapshot (sum across the most recent turn's
+    /// inferences), or `None` if no turn has completed yet.
+    #[must_use]
+    pub fn last_cost_usage(&self) -> Option<TokenUsage> {
+        *self.last_cost_usage.lock().unwrap()
    }

    /// Update the harness's authoritative turn-loop state. Records
--- a/hive-ag3nt/src/turn.rs
+++ b/hive-ag3nt/src/turn.rs
@ -279,14 +279,28 @@ async fn run_claude(prompt: &str, files: &TurnFiles, bus: &Bus) -> Result<bool>
    let bus_err = bus.clone();
    let pump_stdout = tokio::spawn(async move {
        let mut reader = BufReader::new(stdout).lines();
+        // Track usage as the turn unfolds. `last_inference` overwrites on
+        // every assistant event so at result-time it holds the most recent
+        // model call's usage — the actual context size. The `result` event
+        // carries the cumulative-across-the-turn usage (cost signal). Both
+        // get handed to `record_turn_usage` together so a single SSE
+        // event updates both badges.
+        let mut last_inference: Option<crate::events::TokenUsage> = None;
        while let Ok(Some(line)) = reader.next_line().await {
            if line.contains(PROMPT_TOO_LONG_MARKER) {
                flag_out.store(true, Ordering::Relaxed);
            }
            match serde_json::from_str::<serde_json::Value>(&line) {
                Ok(v) => {
-                    if let Some(usage) = crate::events::TokenUsage::from_stream_event(&v) {
-                        bus_out.record_usage(usage);
+                    if let Some(u) = crate::events::TokenUsage::from_assistant_event(&v) {
+                        last_inference = Some(u);
+                    }
+                    if let Some(cost) = crate::events::TokenUsage::from_stream_event(&v) {
+                        // Fallback to `cost` if the turn somehow produced
+                        // a result without any assistant event — keeps the
+                        // ctx badge from going stale on a degenerate turn.
+                        let ctx = last_inference.unwrap_or(cost);
+                        bus_out.record_turn_usage(ctx, cost);
                    }
                    bus_out.observe_stream(&v);
                    bus_out.emit(LiveEvent::Stream(v));
--- a/hive-ag3nt/src/turn_stats.rs
+++ b/hive-ag3nt/src/turn_stats.rs
@ -22,8 +22,9 @@ use anyhow::{Context, Result};
 use rusqlite::{Connection, params};

 /// SQL bootstrap. CREATE TABLE IF NOT EXISTS so first-boot agents
-/// and existing ones converge on the same shape; ALTER-style
-/// migrations land here as additional statements once we have any.
+/// and existing ones converge on the same shape. The base table is
+/// fresh-install only; additive migrations land via `MIGRATIONS`
+/// below as try-and-ignore ALTERs so existing dbs catch up.
 const SCHEMA: &str = "
 CREATE TABLE IF NOT EXISTS turn_stats (
    id                              INTEGER PRIMARY KEY AUTOINCREMENT,
@ -36,6 +37,10 @@ CREATE TABLE IF NOT EXISTS turn_stats (
    output_tokens                   INTEGER NOT NULL DEFAULT 0,
    cache_read_input_tokens         INTEGER NOT NULL DEFAULT 0,
    cache_creation_input_tokens     INTEGER NOT NULL DEFAULT 0,
+    last_input_tokens               INTEGER NOT NULL DEFAULT 0,
+    last_output_tokens              INTEGER NOT NULL DEFAULT 0,
+    last_cache_read_input_tokens    INTEGER NOT NULL DEFAULT 0,
+    last_cache_creation_input_tokens INTEGER NOT NULL DEFAULT 0,
    tool_call_count                 INTEGER NOT NULL DEFAULT 0,
    tool_call_breakdown_json        TEXT,
    open_threads_count              INTEGER,
@ -47,6 +52,17 @@ CREATE INDEX IF NOT EXISTS idx_turn_stats_started
    ON turn_stats (started_at DESC);
 ";

+/// Additive column migrations. Each runs unconditionally and ignores
+/// `duplicate column name` errors — sqlite's `ALTER TABLE` has no
+/// `ADD COLUMN IF NOT EXISTS` form, so try-and-ignore is the portable path.
+/// New columns MUST carry a default so existing rows decode.
+const MIGRATIONS: &[&str] = &[
+    "ALTER TABLE turn_stats ADD COLUMN last_input_tokens INTEGER NOT NULL DEFAULT 0",
+    "ALTER TABLE turn_stats ADD COLUMN last_output_tokens INTEGER NOT NULL DEFAULT 0",
+    "ALTER TABLE turn_stats ADD COLUMN last_cache_read_input_tokens INTEGER NOT NULL DEFAULT 0",
+    "ALTER TABLE turn_stats ADD COLUMN last_cache_creation_input_tokens INTEGER NOT NULL DEFAULT 0",
+];
+
 /// One row to be inserted. `Option`-wrapped fields default to NULL
 /// when the harness couldn't gather them (e.g. socket roundtrip for
 /// open_threads failed) so a partial row beats no row.
@ -57,10 +73,16 @@ pub struct TurnStatRow {
    pub duration_ms: i64,
    pub model: String,
    pub wake_from: String,
+    /// Cumulative across every inference in the turn (cost signal).
    pub input_tokens: u64,
    pub output_tokens: u64,
    pub cache_read_input_tokens: u64,
    pub cache_creation_input_tokens: u64,
+    /// Last inference's usage — the actual context size at turn end.
+    pub last_input_tokens: u64,
+    pub last_output_tokens: u64,
+    pub last_cache_read_input_tokens: u64,
+    pub last_cache_creation_input_tokens: u64,
    pub tool_call_count: u64,
    /// Per-tool breakdown as JSON: `{"Read":12,"Bash":3,...}`. None
    /// when no tools were called (saves a sqlite write of `"{}"`).
@ -107,6 +129,18 @@ impl TurnStats {
            .with_context(|| format!("open turn_stats db {}", path.display()))?;
        conn.execute_batch(SCHEMA)
            .context("apply turn_stats schema")?;
+        for stmt in MIGRATIONS {
+            // Ignore "duplicate column name" — the migration already ran.
+            // Any other error is logged but doesn't fail open() because the
+            // base schema works and we'd rather keep the harness alive than
+            // crash on an upgrade hiccup.
+            if let Err(e) = conn.execute(stmt, []) {
+                let msg = e.to_string();
+                if !msg.contains("duplicate column name") {
+                    tracing::warn!(error = %msg, stmt, "turn_stats migration failed");
+                }
+            }
+        }
        Ok(Self {
            inner: std::sync::Arc::new(Mutex::new(conn)),
        })
@ -121,6 +155,8 @@ impl TurnStats {
                started_at, ended_at, duration_ms, model, wake_from,
                input_tokens, output_tokens,
                cache_read_input_tokens, cache_creation_input_tokens,
+                last_input_tokens, last_output_tokens,
+                last_cache_read_input_tokens, last_cache_creation_input_tokens,
                tool_call_count, tool_call_breakdown_json,
                open_threads_count, open_reminders_count,
                result_kind, note
@ -130,7 +166,9 @@ impl TurnStats {
                ?8, ?9,
                ?10, ?11,
                ?12, ?13,
-                ?14, ?15
+                ?14, ?15,
+                ?16, ?17,
+                ?18, ?19
             )",
            params![
                row.started_at,
@ -142,6 +180,10 @@ impl TurnStats {
                i64::try_from(row.output_tokens).unwrap_or(i64::MAX),
                i64::try_from(row.cache_read_input_tokens).unwrap_or(i64::MAX),
                i64::try_from(row.cache_creation_input_tokens).unwrap_or(i64::MAX),
+                i64::try_from(row.last_input_tokens).unwrap_or(i64::MAX),
+                i64::try_from(row.last_output_tokens).unwrap_or(i64::MAX),
+                i64::try_from(row.last_cache_read_input_tokens).unwrap_or(i64::MAX),
+                i64::try_from(row.last_cache_creation_input_tokens).unwrap_or(i64::MAX),
                i64::try_from(row.tool_call_count).unwrap_or(i64::MAX),
                row.tool_call_breakdown_json,
                row.open_threads_count
@ -157,32 +199,58 @@ impl TurnStats {
        }
    }

-    /// Token counts from the most recently inserted row, if any. Lets
-    /// the harness seed `Bus::last_usage` on startup so the per-agent
-    /// web UI's `ctx-badge` paints with real numbers on cold load
-    /// instead of waiting for the next `TokenUsageChanged` SSE event.
-    /// Best-effort: any sqlite error returns `None` and the caller
-    /// falls back to the empty state.
+    /// Token counts from the most recently inserted row, if any.
+    /// Returns `(ctx, cost)` — both backfill `Bus` on startup so the
+    /// per-agent web UI's ctx + cost badges paint with real numbers on
+    /// cold load instead of waiting for the next `TokenUsageChanged`
+    /// SSE event. Best-effort: any sqlite error returns `(None, None)`.
+    ///
+    /// Pre-migration rows (before the `last_*_tokens` columns existed)
+    /// have last-inference zeros — those rows yield `ctx = None` so the
+    /// badge stays empty until the next real turn rather than showing a
+    /// misleading 0.
    #[must_use]
-    pub fn last_usage(&self) -> Option<crate::events::TokenUsage> {
+    pub fn last_usage(
+        &self,
+    ) -> (
+        Option<crate::events::TokenUsage>,
+        Option<crate::events::TokenUsage>,
+    ) {
        let conn = self.inner.lock().unwrap();
        conn.query_row(
            "SELECT input_tokens, output_tokens,
-                    cache_read_input_tokens, cache_creation_input_tokens
+                    cache_read_input_tokens, cache_creation_input_tokens,
+                    last_input_tokens, last_output_tokens,
+                    last_cache_read_input_tokens, last_cache_creation_input_tokens
             FROM turn_stats
             ORDER BY started_at DESC
             LIMIT 1",
            [],
            |row| {
-                Ok(crate::events::TokenUsage {
-                    input_tokens: u64::try_from(row.get::<_, i64>(0)?).unwrap_or(0),
-                    output_tokens: u64::try_from(row.get::<_, i64>(1)?).unwrap_or(0),
-                    cache_read_input_tokens: u64::try_from(row.get::<_, i64>(2)?).unwrap_or(0),
-                    cache_creation_input_tokens: u64::try_from(row.get::<_, i64>(3)?).unwrap_or(0),
-                })
+                let g = |i: usize| -> rusqlite::Result<u64> {
+                    Ok(u64::try_from(row.get::<_, i64>(i)?).unwrap_or(0))
+                };
+                let cost = crate::events::TokenUsage {
+                    input_tokens: g(0)?,
+                    output_tokens: g(1)?,
+                    cache_read_input_tokens: g(2)?,
+                    cache_creation_input_tokens: g(3)?,
+                };
+                let last = crate::events::TokenUsage {
+                    input_tokens: g(4)?,
+                    output_tokens: g(5)?,
+                    cache_read_input_tokens: g(6)?,
+                    cache_creation_input_tokens: g(7)?,
+                };
+                let ctx = if last == crate::events::TokenUsage::default() {
+                    None
+                } else {
+                    Some(last)
+                };
+                Ok((ctx, Some(cost)))
            },
        )
-        .ok()
+        .unwrap_or((None, None))
    }
 }

--- a/hive-ag3nt/src/web_ui.rs
+++ b/hive-ag3nt/src/web_ui.rs
@ -225,9 +225,13 @@ struct StateSnapshot {
    /// the operator can see what they just switched to (and what's
    /// in flight). Mutable at runtime via `POST /api/model`.
    model: String,
-    /// Token usage from the last completed turn. `null` until the
-    /// first turn with usage data finishes.
-    token_usage: Option<crate::events::TokenUsage>,
+    /// Last-inference token usage from the most recent completed
+    /// turn — represents the current context-window size at turn-end.
+    /// `null` until the first turn finishes.
+    ctx_usage: Option<crate::events::TokenUsage>,
+    /// Cumulative token usage across the most recent turn's inferences
+    /// (cost signal). `null` until the first turn finishes.
+    cost_usage: Option<crate::events::TokenUsage>,
 }

 #[derive(Serialize)]
@ -310,7 +314,8 @@ async fn api_state(State(state): State<AppState>) -> axum::Json<StateSnapshot> {
    let inbox = recent_inbox(&state.socket, state.flavor()).await;
    let (turn_state, turn_state_since) = state.bus.state_snapshot();
    let model = state.bus.model();
-    let token_usage = state.bus.last_usage();
+    let ctx_usage = state.bus.last_ctx_usage();
+    let cost_usage = state.bus.last_cost_usage();
    axum::Json(StateSnapshot {
        seq,
        label: state.label.clone(),
@ -321,7 +326,8 @@ async fn api_state(State(state): State<AppState>) -> axum::Json<StateSnapshot> {
        turn_state,
        turn_state_since,
        model,
-        token_usage,
+        ctx_usage,
+        cost_usage,
    })
 }