I've harnessed the harness
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

use proper context stats

dawn eaa3582f 25ac1791

+78 -41
+18 -12
klbr-core/src/agent.rs
··· 8 8 config::Config, 9 9 context::Context, 10 10 interrupt::Interrupt, 11 - llm::{LlmClient, Message}, 11 + llm::{LlmClient, LlmEvent, Message}, 12 12 memory::MemoryStore, 13 13 }; 14 14 use klbr_ipc::ServerMsg; ··· 69 69 70 70 let mut response = String::new(); 71 71 let mut thinking = String::new(); 72 - while let Some((is_think, tok)) = tok_rx.recv().await { 73 - if is_think { 74 - thinking.push_str(&tok); 75 - let _ = output.send(ServerMsg::ThinkToken { content: tok }); 76 - } else { 77 - response.push_str(&tok); 78 - let _ = output.send(ServerMsg::Token { content: tok }); 72 + while let Some(ev) = tok_rx.recv().await { 73 + match ev { 74 + LlmEvent::ThinkToken(tok) => { 75 + thinking.push_str(&tok); 76 + let _ = output.send(ServerMsg::ThinkToken { content: tok }); 77 + } 78 + LlmEvent::Token(tok) => { 79 + response.push_str(&tok); 80 + let _ = output.send(ServerMsg::Token { content: tok }); 81 + } 82 + LlmEvent::Usage(usage) => { 83 + ctx.update_tokens(usage.total_tokens); 84 + } 79 85 } 80 86 } 81 87 ··· 87 93 let _ = output.send(ServerMsg::Done); 88 94 let metrics = ServerMsg::Metrics { 89 95 turn_count, 90 - context_chars: ctx.char_count(), 91 - watermark: config.watermark_chars, 96 + context_tokens: ctx.total_tokens, 97 + watermark: config.watermark_tokens, 92 98 }; 93 99 *snapshot.write().await = Some(metrics.clone()); 94 100 let _ = output.send(metrics); 95 101 96 - if ctx.char_count() > config.watermark_chars { 102 + if ctx.total_tokens > config.watermark_tokens { 97 103 let _ = output.send(ServerMsg::Status { 98 104 content: "compacting...".into(), 99 105 }); ··· 122 128 "summarize these conversation turns concisely, preserving key facts and topics:\n\n{turns_text}" 123 129 ))]; 124 130 125 - let Ok(summary) = llm.complete(&prompt).await else { 131 + let Ok((summary, _)) = llm.complete(&prompt).await else { 126 132 return; 127 133 }; 128 134 let Ok(emb) = llm.embed(&summary).await else {
+2 -3
klbr-core/src/config.rs
··· 6 6 pub embed_url: String, 7 7 pub llm_model: String, 8 8 pub embed_model: String, 9 - /// char count before context compaction fires 10 - pub watermark_chars: usize, 9 + pub watermark_tokens: usize, 11 10 /// how many recent turns to preserve during compaction 12 11 pub compaction_keep: usize, 13 12 /// memories to inject per turn ··· 32 31 embed_url: "http://localhost:1234".into(), 33 32 llm_model: "google/gemma-4-26b-a4b".into(), 34 33 embed_model: "nomic-embed-text-v1.5".into(), 35 - watermark_chars: 48_000, 34 + watermark_tokens: 32_000, 36 35 compaction_keep: 10, 37 36 memory_top_k: 3, 38 37 memory_sim_threshold: 0.3,
+9 -4
klbr-core/src/context.rs
··· 5 5 anchor: Vec<Message>, 6 6 /// rolling conversation turns 7 7 pub turns: Vec<Message>, 8 + pub total_tokens: usize, 8 9 } 9 10 10 11 impl Context { ··· 12 13 Self { 13 14 anchor: vec![Message::system(anchor)], 14 15 turns: vec![], 16 + total_tokens: 0, 15 17 } 16 18 } 17 19 ··· 46 48 self.anchor.iter().chain(&self.turns).cloned().collect() 47 49 } 48 50 49 - pub fn char_count(&self) -> usize { 50 - self.turns.iter().map(|m| m.content.len()).sum() 51 - } 52 - 53 51 pub fn turn_count(&self) -> usize { 54 52 self.turns.len() 55 53 } ··· 59 57 if self.turns.len() <= keep { 60 58 return vec![]; 61 59 } 60 + // we reset tokens because we lost track of the exact window count after draining 61 + // it will be updated on the next API call anyway 62 + self.total_tokens = 0; 62 63 self.turns.drain(..self.turns.len() - keep).collect() 64 + } 65 + 66 + pub fn update_tokens(&mut self, tokens: usize) { 67 + self.total_tokens = tokens; 63 68 } 64 69 }
+41 -13
klbr-core/src/llm.rs
··· 34 34 } 35 35 } 36 36 37 - /// (is_think, token) - is_think=true means reasoning_content (thinking trace) 38 - pub type StreamChunk = (bool, String); 37 + #[derive(Debug, Serialize, Deserialize, Clone, Default)] 38 + pub struct Usage { 39 + pub prompt_tokens: usize, 40 + pub completion_tokens: usize, 41 + pub total_tokens: usize, 42 + } 43 + 44 + #[derive(Debug, Clone)] 45 + pub enum LlmEvent { 46 + Token(String), 47 + ThinkToken(String), 48 + Usage(Usage), 49 + } 39 50 40 51 #[derive(Clone)] 41 52 pub struct LlmClient { ··· 51 62 } 52 63 } 53 64 54 - pub async fn stream( 55 - &self, 56 - messages: &[Message], 57 - tok_tx: mpsc::Sender<StreamChunk>, 58 - ) -> Result<()> { 65 + pub async fn stream(&self, messages: &[Message], tok_tx: mpsc::Sender<LlmEvent>) -> Result<()> { 59 66 let body = json!({ 60 67 "model": self.config.llm_model, 61 68 "messages": messages, 62 69 "stream": true, 70 + "stream_options": { "include_usage": true } 63 71 }); 64 72 65 73 let mut res = self ··· 88 96 let Ok(v) = serde_json::from_str::<Value>(data) else { 89 97 continue; 90 98 }; 91 - let delta = &v["choices"][0]["delta"]; 99 + 100 + if let Some(_usage) = v["usage"].as_object() { 101 + let u: Usage = serde_json::from_value(v["usage"].clone())?; 102 + let _ = tok_tx.send(LlmEvent::Usage(u)).await; 103 + continue; 104 + } 105 + 106 + let Some(choices) = v["choices"].as_array() else { 107 + continue; 108 + }; 109 + if choices.is_empty() { 110 + continue; 111 + } 112 + let delta = &choices[0]["delta"]; 92 113 93 114 // reasoning_content comes before content during thinking 94 115 if let Some(t) = delta["reasoning_content"].as_str() { 95 - if !t.is_empty() && tok_tx.send((true, t.to_string())).await.is_err() { 116 + if !t.is_empty() 117 + && tok_tx 118 + .send(LlmEvent::ThinkToken(t.to_string())) 119 + .await 120 + .is_err() 121 + { 96 122 return Ok(()); 97 123 } 98 124 } 99 125 if let Some(t) = delta["content"].as_str() { 100 - if !t.is_empty() && tok_tx.send((false, 
t.to_string())).await.is_err() { 126 + if !t.is_empty() && tok_tx.send(LlmEvent::Token(t.to_string())).await.is_err() { 101 127 return Ok(()); 102 128 } 103 129 } ··· 108 134 } 109 135 110 136 /// non-streaming, used for compaction summaries 111 - pub async fn complete(&self, messages: &[Message]) -> Result<String> { 137 + pub async fn complete(&self, messages: &[Message]) -> Result<(String, Usage)> { 112 138 let body = json!({ 113 139 "model": self.config.llm_model, 114 140 "messages": messages, ··· 122 148 .await? 123 149 .json::<Value>() 124 150 .await?; 125 - Ok(v["choices"][0]["message"]["content"] 151 + let content = v["choices"][0]["message"]["content"] 126 152 .as_str() 127 153 .unwrap_or("") 128 - .to_string()) 154 + .to_string(); 155 + let usage: Usage = serde_json::from_value(v["usage"].clone()).unwrap_or_default(); 156 + Ok((content, usage)) 129 157 } 130 158 131 159 pub async fn embed(&self, text: &str) -> Result<Vec<f32>> {
+1 -1
klbr-ipc/src/lib.rs
··· 26 26 /// sent after each Done with current agent stats 27 27 Metrics { 28 28 turn_count: usize, 29 - context_chars: usize, 29 + context_tokens: usize, 30 30 watermark: usize, 31 31 }, 32 32 /// pushed on connect: current context window turns
+7 -8
klbr-tui/src/main.rs
··· 111 111 112 112 // metrics 113 113 turn_count: usize, 114 - context_chars: usize, 114 + context_tokens: usize, 115 115 watermark: usize, 116 116 stream_start: Option<Instant>, 117 117 stream_tokens: usize, ··· 132 132 history_exhausted: false, 133 133 loading_history: false, 134 134 turn_count: 0, 135 - context_chars: 0, 135 + context_tokens: 0, 136 136 watermark: 0, 137 137 stream_start: None, 138 138 stream_tokens: 0, ··· 471 471 .unwrap_or_default(); 472 472 473 473 let ctx_pct = (app.watermark > 0) 474 - .then(|| (app.context_chars as f64 / app.watermark as f64 * 100.0) as usize) 474 + .then(|| (app.context_tokens as f64 / app.watermark as f64 * 100.0) as usize) 475 475 .unwrap_or(0); 476 476 477 477 let context_str = if app.watermark > 0 { 478 - let remaining = app.watermark.saturating_sub(app.context_chars); 479 - let tokens_left = remaining / 4; 480 - format!("ctx {ctx_pct}% (~{tokens_left} tok until compact)") 478 + let remaining = app.watermark.saturating_sub(app.context_tokens); 479 + format!("ctx {ctx_pct}% ({remaining} tok left)") 481 480 } else { 482 481 String::new() 483 482 }; ··· 638 637 } 639 638 ServerMsg::Metrics { 640 639 turn_count, 641 - context_chars, 640 + context_tokens, 642 641 watermark, 643 642 } => { 644 643 app.turn_count = turn_count; 645 - app.context_chars = context_chars; 644 + app.context_tokens = context_tokens; 646 645 app.watermark = watermark; 647 646 } 648 647 ServerMsg::History { turns } => {