agent badges: split into ctx (last-inference) + cost (cumulative)

the existing ctx badge was misnamed: it summed `result.usage`, which is
the cumulative tokens billed across every inference in the turn. for
tool-heavy turns that easily exceeds the model's context window (a 600k
cached prefix × 15 sub-calls = 9M cache_read), making it useless as a
"should i compact?" signal.

now two separate badges:

  ctx · N    last inference's prompt size = actual context window in
             use right now. parsed from each `assistant` event's
             `.message.usage`; the harness tracks the most recent one
             across the stream and snapshots it when the `result`
             event lands.

  cost · M   cumulative tokens billed across the whole turn (the
             previous behaviour, now correctly labelled).

both update via a single `TokenUsageChanged { ctx, cost }` SSE event at
turn-end. turn_stats grows four columns (`last_input_tokens`,
`last_output_tokens`, `last_cache_read_input_tokens`,
`last_cache_creation_input_tokens`) so the cold-load seed can paint both
badges on page load. migrations run try-and-ignore ALTERs so existing
agent dbs catch up; pre-migration rows have last-inference zeros and
yield no `ctx` seed (badge stays empty until next turn) rather than a
misleading 0.
This commit is contained in:
müde 2026-05-18 18:48:35 +02:00
parent 14549dd8a9
commit 5c6c607e25
9 changed files with 267 additions and 101 deletions

View file

@ -525,30 +525,43 @@
el_.textContent = 'model · ' + model;
el_.title = `claude --model ${model}\nset via the operator's /model command; persists across turns until changed`;
}
// Context badge — mirrors Claude Code's bottom-right "N tokens"
// indicator. Primary number is total prompt tokens used in the
// current context window (input + both cache axes); hover for the
// breakdown including output. Kept as chrome on the state row so
// the terminal stays the star.
function renderTokenUsage(u) {
const el_ = $('ctx-badge');
// Token badges — two separate chips:
// ctx · N last inference's prompt size = current context window
// utilisation (what to watch for compaction decisions)
// cost · M cumulative billed tokens across the whole last turn
// (sum across every inference; tool-heavy turns rebill
// the cached prompt per call and blow past the model's
// context window — this is a cost signal, not a size
// signal)
// Both fed by the same `token_usage_changed` SSE event (`{ ctx, cost }`).
// Formats a token count compactly for badge text:
//   n < 1,000       → the plain number            ("999")
//   n < ~1,000,000  → thousands, rounded           ("600k")
//   otherwise       → millions with one decimal    ("9.0M")
// Counts in 999,500–999,999 would round to "1000k"; they are promoted to the
// millions form ("1.0M") so the badge never shows a four-digit "k" value.
const fmtTokens = (n) => {
  const asMillions = () => (n / 1_000_000).toFixed(1) + 'M';
  if (n >= 1_000_000) return asMillions();
  if (n >= 1_000) {
    const thousands = Math.round(n / 1000);
    // Rounding can push the value up to 1000k — roll over to the M form.
    return thousands >= 1000 ? asMillions() : thousands + 'k';
  }
  return String(n);
};
// Paints a single token-usage badge.
//   elId   id passed to `$()` to find the badge element; if nothing is
//          found the function returns without doing anything
//   label  text placed before the ` · ` separator in the badge
//   u      usage object read for `input_tokens`, `cache_read_input_tokens`,
//          `cache_creation_input_tokens` and `output_tokens`; a falsy value
//          hides the badge
//   blurb  explanatory line included in the hover tooltip
// The number shown on the badge is input + cache_read + cache_creation;
// output tokens appear in the tooltip only.
// NOTE(review): this span appears to interleave pre-change and post-change
// lines of the diff hunk it sits in (a local `fmt` next to the shared
// `fmtTokens`, two tooltip headers, two output rows, two `textContent`
// writes) — confirm which lines belong to the final file before relying on it.
function renderOneUsage(elId, label, u, blurb) {
const el_ = $(elId);
if (!el_) return;
if (!u) { el_.hidden = true; return; }
// NOTE(review): `ctx` is the same sum as `total` below and is only read by
// the `'ctx · '` assignment near the end of the function.
const ctx = u.input_tokens + u.cache_read_input_tokens + u.cache_creation_input_tokens;
// NOTE(review): local formatter with the same body as `fmtTokens`.
const fmt = (n) => {
if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M';
if (n >= 1_000) return Math.round(n / 1000) + 'k';
return String(n);
};
// Prompt-side total shown on the badge (output tokens are not included).
const total = u.input_tokens + u.cache_read_input_tokens + u.cache_creation_input_tokens;
el_.hidden = false;
// Tooltip: one entry per line, joined with newlines.
el_.title = [
'context window in use',
blurb,
'input: ' + u.input_tokens,
'cache_read: ' + u.cache_read_input_tokens,
'cache_write: ' + u.cache_creation_input_tokens,
'output (last turn): ' + u.output_tokens,
'output: ' + u.output_tokens,
].join('\n');
// This first write is overwritten immediately by the assignment after it.
el_.textContent = 'ctx · ' + fmt(ctx);
el_.textContent = label + ' · ' + fmtTokens(total);
}
// Refreshes both token badges from one `{ ctx, cost }` payload.
// The payload comes either from the /api/state cold-load (where either
// field may be null) or from a `token_usage_changed` SSE event (where both
// are present after a turn). A falsy `ev` is forwarded as-is for both
// badges, leaving `renderOneUsage` to decide what to do with it.
function renderTokenUsage(ev) {
  // [payload key / badge label, element id, tooltip blurb] — ctx first, then cost.
  const badges = [
    ['ctx', 'ctx-badge',
      'last-inference prompt size — the actual context window in use right now'],
    ['cost', 'cost-badge',
      'cumulative tokens billed across the last turn (sum across every inference)'],
  ];
  for (const [key, elementId, blurb] of badges) {
    renderOneUsage(elementId, key, ev && ev[key], blurb);
  }
}
function renderLastTurn(ms) {
const el_ = $('last-turn');
@ -626,7 +639,7 @@
}
renderAliveBadge(s.status);
renderModelChip(s.model);
renderTokenUsage(s.token_usage);
renderTokenUsage({ ctx: s.ctx_usage, cost: s.cost_usage });
// Open-threads aren't part of /api/state (kept on the broker
// db, fetched via the per-agent socket). Cold-load fetches
// it here; turn_end refreshes it via the renderer below.
@ -1026,7 +1039,7 @@
},
model_changed(ev, api) { if (!api.fromHistory) renderModelChip(ev.model); },
token_usage_changed(ev, api) {
if (!api.fromHistory) renderTokenUsage(ev.usage);
if (!api.fromHistory) renderTokenUsage({ ctx: ev.ctx, cost: ev.cost });
},
turn_state_changed(ev, api) {
if (!api.fromHistory) setStateAbs(ev.state, ev.since_unix);

View file

@ -18,6 +18,7 @@
<span id="state-badge" class="state-badge state-loading">… booting</span>
<span id="model-chip" class="model-chip" hidden></span>
<span id="ctx-badge" class="ctx-badge" hidden title="tokens used in the current context window"></span>
<span id="cost-badge" class="ctx-badge" hidden title="cumulative tokens billed across the last turn (sum across every inference; tool-heavy turns rebill the cached prompt per call)"></span>
<span id="last-turn" class="last-turn" hidden></span>
<button type="button" id="cancel-btn" class="btn-cancel-turn" hidden>■ cancel turn</button>
<button type="button" id="new-session-btn" class="btn-new-session"