fix: self-calibrate context window from API result event

The stream-json result event carries modelUsage.<model>.contextWindow, which is the actual per-inference active window the model enforces. For claude-sonnet-4-6 this is 200k, even though the full prompt cache can hold millions of tokens via accumulated cache reads. With the nix-configured sonnet = 1000000, the proactive compact watermark sat at 750k and was never reached: agents grew context until prompt_too_long at ~170k, which triggered a reactive compact with no checkpoint turn.

Changes:
- bus gains an api_context_window field seeded from modelUsage.*.contextWindow in each turn's result event. It is authoritative; when it is absent, the window falls back to the env var, then to 200k.
- new effective_context_window(bus) helper used by both watermark functions
- compact_watermark (75%) and auto_reset_watermark (50%) call effective_context_window
- context_tokens() docstring clarified: all three token fields (input + cache_read + cache_creation) count against the per-inference contextWindow limit. The large cache_read values seen in the result event are cumulative across all inferences in a turn, not per-inference.
- /api/state context_window_tokens now reflects the calibrated window

Closes #129
2026-05-20 22:55:34 +02:00 · 2026-05-20 22:55:34 +02:00 · b0f6bd8ece
commit b0f6bd8ece
parent 3e94914569
3 changed files with 131 additions and 43 deletions
--- a/hive-ag3nt/src/events.rs
+++ b/hive-ag3nt/src/events.rs
@ -222,6 +222,11 @@ pub struct TokenUsage {

 impl TokenUsage {
    /// Total context consumed this turn (input + cache reads + cache writes).
+    /// This is the per-inference context footprint that counts against the
+    /// model's `contextWindow` limit. Tracked from the last `assistant` event
+    /// in the stream-json (per-inference usage, not the cumulative `result`
+    /// event which sums across all inferences in a tool-heavy turn and can
+    /// far exceed the per-inference window).
    #[must_use]
    pub fn context_tokens(&self) -> u64 {
        self.input_tokens + self.cache_read_input_tokens + self.cache_creation_input_tokens
@ -260,6 +265,33 @@ impl TokenUsage {
            cache_creation_input_tokens: field("cache_creation_input_tokens"),
        }
    }
+
+    /// Read the per-inference context-window limit out of a stream-json
+    /// `result` event.
+    ///
+    /// The limit is looked up under `modelUsage.<model-name>.contextWindow`.
+    /// Every entry of the `modelUsage` object is inspected in the map's
+    /// iteration order and the first strictly positive `contextWindow`
+    /// wins; entries whose `contextWindow` is missing, not an unsigned
+    /// integer, or zero are skipped.
+    ///
+    /// Returns `None` when `v["type"]` is not the string `"result"`, when
+    /// `modelUsage` is absent or not a JSON object, or when no entry
+    /// carries a positive `contextWindow`.
+    ///
+    /// The value is treated as the authoritative active window for a
+    /// single inference (e.g. 200 000 for `claude-sonnet-4-6`), which may
+    /// be smaller than the total prompt-cache capacity reachable through
+    /// cache reads.
+    #[must_use]
+    pub fn context_window_from_result_event(v: &serde_json::Value) -> Option<u64> {
+        let is_result = v.get("type").and_then(serde_json::Value::as_str) == Some("result");
+        if !is_result {
+            return None;
+        }
+        v.get("modelUsage")?
+            .as_object()?
+            .values()
+            .filter_map(|stats| {
+                stats
+                    .get("contextWindow")
+                    .and_then(serde_json::Value::as_u64)
+            })
+            .find(|&window| window > 0)
+    }
 }

 /// Authoritative turn-loop state. The harness owns it; the web UI
@ -385,6 +417,14 @@ pub struct Bus {
    /// `turn.rs` to compute how long the session has been idle and
    /// whether the prompt cache has gone cold. `0` = no turn yet.
    last_turn_ended_unix: Arc<AtomicI64>,
+    /// Per-inference context-window size as reported by the Anthropic API
+    /// in the stream-json `result` event (`modelUsage.*.contextWindow`).
+    /// Set by the stdout pump on every completed turn. Takes precedence
+    /// over the Nix-configured `HIVE_CONTEXT_WINDOW_TOKENS_*` env vars
+    /// for compaction watermark calculations — it reflects the actual
+    /// limit the model enforces, which may differ from what the operator
+    /// configured (e.g. 200 k active window on a 1 M cache-enabled model).
+    api_context_window: Arc<Mutex<Option<u64>>>,
 }

 impl Bus {
@ -419,6 +459,7 @@ impl Bus {
            skip_continue_once: Arc::new(AtomicBool::new(false)),
            tool_calls: Arc::new(Mutex::new(std::collections::HashMap::new())),
            last_turn_ended_unix: Arc::new(AtomicI64::new(0)),
+            api_context_window: Arc::new(Mutex::new(None)),
        }
    }

@ -519,6 +560,24 @@ impl Bus {
        self.last_turn_ended_unix.load(Ordering::Relaxed)
    }

+    /// Record the context-window size reported by the API in the
+    /// stream-json `result` event (`modelUsage.*.contextWindow`).
+    ///
+    /// Intended to be called by the stdout pump once per completed turn.
+    /// A `window` of `0` means "not reported" and leaves any previously
+    /// stored value untouched; any other value overwrites it.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the internal mutex has been poisoned.
+    pub fn set_api_context_window(&self, window: u64) {
+        if window == 0 {
+            return;
+        }
+        let mut slot = self.api_context_window.lock().unwrap();
+        *slot = Some(window);
+    }
+
+    /// Current API-reported per-inference context-window size.
+    ///
+    /// Yields `None` until a non-zero window has been stored via
+    /// `set_api_context_window`, i.e. before the first `result` event
+    /// with a usable `contextWindow` has been processed for this session.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the internal mutex has been poisoned.
+    #[must_use]
+    pub fn api_context_window(&self) -> Option<u64> {
+        let stored = self.api_context_window.lock().unwrap();
+        *stored
+    }
+
    /// Walk a stream-json value for `tool_use` blocks and bump the
    /// per-turn counter for each one we find. Called by the stdout
    /// pump on every parsed line. Cheap when the line isn't an