{
2 "dataset_id": "internal-eval-starter-v2",
3 "description": "Expanded internal eval set for KLBR MVP retrieval benchmarking. Timeline-aware, category-complete, and large enough to stress no-hit, support-calibration, and multi-evidence behavior more realistically.",
4 "memories": [
5 {
6 "memory_id": 1,
7 "namespace": "default",
8 "layer": "L1",
9 "timeline_id": "t0-editor-before-switch",
10 "text": "before the switch, the primary editor for klbr work was neovim.",
11 "event_time": 1776153600,
12 "status": "active",
13 "source_ref": "session:t0:m1",
14 "tags": ["project:klbr", "topic:editor"]
15 },
16 {
17 "memory_id": 2,
18 "namespace": "default",
19 "layer": "L1",
20 "timeline_id": "t1-editor-and-embedding-split",
21 "text": "switched the day-to-day klbr editor from neovim to zed.",
22 "event_time": 1776157200,
23 "status": "active",
24 "source_ref": "session:t1:m1",
25 "tags": ["project:klbr", "topic:editor"]
26 },
27 {
28 "memory_id": 3,
29 "namespace": "default",
30 "layer": "L1",
31 "timeline_id": "t1-editor-and-embedding-split",
32 "text": "split embeddings onto a separate llama-server at localhost 8002 using bge-m3 with dimension 1024.",
33 "event_time": 1776159000,
34 "status": "active",
35 "source_ref": "session:t1:m2",
36 "tags": ["project:klbr", "topic:embeddings"]
37 },
38 {
39 "memory_id": 4,
40 "namespace": "default",
41 "layer": "L1",
42 "timeline_id": "t1-editor-and-embedding-split",
43 "text": "the first benchmark slice should stay exact retrieval first before indexed search.",
44 "event_time": 1776160800,
45 "status": "active",
46 "source_ref": "session:t1:m3",
47 "tags": ["project:klbr", "topic:benchmarking"]
48 },
49 {
50 "memory_id": 5,
51 "namespace": "default",
52 "layer": "L1",
53 "timeline_id": "t2-error-propagation",
54 "text": "fixed the daemon swallowing llm stream errors and forwarded the error to the tui.",
55 "event_time": 1776250800,
56 "status": "active",
57 "source_ref": "session:t2:m1",
58 "tags": ["project:klbr", "topic:daemon", "topic:tui"]
59 },
60 {
61 "memory_id": 6,
62 "namespace": "default",
63 "layer": "L1",
64 "timeline_id": "t2-error-propagation",
65 "text": "compaction summaries are stored with the compaction_summary tag.",
66 "event_time": 1776252600,
67 "status": "active",
68 "source_ref": "session:t2:m2",
69 "tags": ["project:klbr", "topic:memory"]
70 },
71 {
72 "memory_id": 7,
73 "namespace": "default",
74 "layer": "L1",
75 "timeline_id": "t2-error-propagation",
76 "text": "the chat llama-server did not expose embeddings, so embeddings had to stay on a separate process.",
77 "event_time": 1776254400,
78 "status": "active",
79 "source_ref": "session:t2:m3",
80 "tags": ["project:klbr", "topic:embeddings", "topic:llama-server"]
81 },
82 {
83 "memory_id": 8,
84 "namespace": "default",
85 "layer": "L1",
86 "timeline_id": "t3-benchmark-scaffolding",
87 "text": "the benchmark harness should log per-query candidate lists and stage latencies.",
88 "event_time": 1776351600,
89 "status": "active",
90 "source_ref": "session:t3:m1",
91 "tags": ["project:klbr", "topic:benchmarking"]
92 },
93 {
94 "memory_id": 9,
95 "namespace": "default",
96 "layer": "L1",
97 "timeline_id": "t3-benchmark-scaffolding",
98 "text": "internal eval should split by timeline or session, not by individual question only.",
99 "event_time": 1776353400,
100 "status": "active",
101 "source_ref": "session:t3:m2",
102 "tags": ["project:klbr", "topic:benchmarking", "topic:eval-data"]
103 },
104 {
105 "memory_id": 10,
106 "namespace": "default",
107 "layer": "L1",
108 "timeline_id": "t3-benchmark-scaffolding",
109 "text": "recent work has mostly been retrieval plumbing, instrumentation, and benchmark scaffolding.",
110 "event_time": 1776355200,
111 "status": "active",
112 "source_ref": "session:t3:m3",
113 "tags": ["project:klbr", "topic:benchmarking", "topic:retrieval"]
114 },
115 {
116 "memory_id": 11,
117 "namespace": "default",
118 "layer": "L1",
119 "timeline_id": "t4-reranker-plan",
120 "text": "planned a separate reranker endpoint on localhost 8003.",
121 "event_time": 1776421800,
122 "status": "active",
123 "source_ref": "session:t4:m1",
124 "tags": ["project:klbr", "topic:reranker"]
125 },
126 {
127 "memory_id": 12,
128 "namespace": "default",
129 "layer": "L1",
130 "timeline_id": "t4-reranker-plan",
131 "text": "the cpu-only reranker should use plain http, not https.",
132 "event_time": 1776423600,
133 "status": "active",
134 "source_ref": "session:t4:m2",
135 "tags": ["project:klbr", "topic:reranker", "topic:networking"]
136 },
137 {
138 "memory_id": 13,
139 "namespace": "default",
140 "layer": "L1",
141 "timeline_id": "t4-reranker-plan",
142 "text": "the reranker choice for the mvp is bge-reranker-v2-m3.",
143 "event_time": 1776425400,
144 "status": "active",
145 "source_ref": "session:t4:m3",
146 "tags": ["project:klbr", "topic:reranker"]
147 },
148 {
149 "memory_id": 14,
150 "namespace": "default",
151 "layer": "L1",
152 "timeline_id": "t5-llama-server-migration",
153 "text": "moved chat model serving from lm studio to llama-server.",
154 "event_time": 1776517200,
155 "status": "active",
156 "source_ref": "session:t5:m1",
157 "tags": ["project:klbr", "topic:llama-server"]
158 },
159 {
160 "memory_id": 15,
161 "namespace": "default",
162 "layer": "L1",
163 "timeline_id": "t5-llama-server-migration",
164 "text": "around the migration, auto model discovery was added for chat and embeddings.",
165 "event_time": 1776519000,
166 "status": "active",
167 "source_ref": "session:t5:m2",
168 "tags": ["project:klbr", "topic:llama-server", "topic:models"]
169 },
170 {
171 "memory_id": 16,
172 "namespace": "default",
173 "layer": "L1",
174 "timeline_id": "t5-llama-server-migration",
175 "text": "the separate embedding server still uses bge-m3 rather than whatever chat model is currently loaded.",
176 "event_time": 1776520800,
177 "status": "active",
178 "source_ref": "session:t5:m3",
179 "tags": ["project:klbr", "topic:embeddings"]
180 },
181 {
182 "memory_id": 17,
183 "namespace": "default",
184 "layer": "L1",
185 "timeline_id": "t6-harness-observability",
186 "text": "added progress logging to the benchmark harness because silent hangs were confusing.",
187 "event_time": 1776614400,
188 "status": "active",
189 "source_ref": "session:t6:m1",
190 "tags": ["project:klbr", "topic:benchmarking", "topic:logging"]
191 },
192 {
193 "memory_id": 18,
194 "namespace": "default",
195 "layer": "L1",
196 "timeline_id": "t6-harness-observability",
197 "text": "the first tiny benchmark only proved the plumbing and was too small for calibration.",
198 "event_time": 1776616200,
199 "status": "active",
200 "source_ref": "session:t6:m2",
201 "tags": ["project:klbr", "topic:benchmarking", "topic:calibration"]
202 },
203 {
204 "memory_id": 19,
205 "namespace": "default",
206 "layer": "L1",
207 "timeline_id": "t6-harness-observability",
208 "text": "a recurring infra pattern is to separate chat, embedding, and rerank services by role to avoid model mismatch.",
209 "event_time": 1776618000,
210 "status": "active",
211 "source_ref": "session:t6:m3",
212 "tags": ["project:klbr", "topic:infra", "topic:retrieval"]
213 },
214 {
215 "memory_id": 20,
216 "namespace": "default",
217 "layer": "L1",
218 "timeline_id": "t7-latency-and-abstain",
219 "text": "rerank latency dropped sharply after switching the reranker endpoint to plain http and using a warmed model.",
220 "event_time": 1776686400,
221 "status": "active",
222 "source_ref": "session:t7:m1",
223 "tags": ["project:klbr", "topic:reranker", "topic:latency"]
224 },
225 {
226 "memory_id": 21,
227 "namespace": "default",
228 "layer": "L1",
229 "timeline_id": "t7-latency-and-abstain",
230 "text": "the benchmark still abstained wrongly on one positive query because abstention used first-stage distance instead of rerank confidence.",
231 "event_time": 1776688200,
232 "status": "active",
233 "source_ref": "session:t7:m2",
234 "tags": ["project:klbr", "topic:abstention", "topic:reranker"]
235 },
236 {
237 "memory_id": 22,
238 "namespace": "default",
239 "layer": "L1",
240 "timeline_id": "t8-better-eval-data",
241 "text": "a better internal eval set should include no-hit, temporal ambiguity, and multi-evidence cases.",
242 "event_time": 1776780000,
243 "status": "active",
244 "source_ref": "session:t8:m1",
245 "tags": ["project:klbr", "topic:eval-data"]
246 },
247 {
248 "memory_id": 23,
249 "namespace": "default",
250 "layer": "L1",
251 "timeline_id": "t8-better-eval-data",
252 "text": "the live agent and the benchmark harness now share the same time-windowed exact retrieval path.",
253 "event_time": 1776781800,
254 "status": "active",
255 "source_ref": "session:t8:m2",
256 "tags": ["project:klbr", "topic:retrieval"]
257 },
258 {
259 "memory_id": 24,
260 "namespace": "default",
261 "layer": "L1",
262 "timeline_id": "t9-current-endpoints",
263 "text": "the current reranker endpoint is http://localhost:8003.",
264 "event_time": 1776852000,
265 "status": "active",
266 "source_ref": "session:t9:m1",
267 "tags": ["project:klbr", "topic:reranker", "topic:networking"]
268 },
269 {
270 "memory_id": 25,
271 "namespace": "default",
272 "layer": "L1",
273 "timeline_id": "t9-current-endpoints",
274 "text": "the current embedding endpoint is http://localhost:8002.",
275 "event_time": 1776853800,
276 "status": "active",
277 "source_ref": "session:t9:m2",
278 "tags": ["project:klbr", "topic:embeddings", "topic:networking"]
279 },
280 {
281 "memory_id": 26,
282 "namespace": "default",
283 "layer": "L1",
284 "timeline_id": "t10-support-calibration",
285 "text": "added a generic lexical support score based on weighted overlap between salient query terms and the top memory text.",
286 "event_time": 1776861000,
287 "status": "active",
288 "source_ref": "session:t10:m1",
289 "tags": ["project:klbr", "topic:calibration", "topic:support"]
290 },
291 {
292 "memory_id": 27,
293 "namespace": "default",
294 "layer": "L1",
295 "timeline_id": "t10-support-calibration",
296 "text": "a focused support sweep found a useful operating point at rerank score -6.0, margin 0.0, and support 0.2.",
297 "event_time": 1776862800,
298 "status": "active",
299 "source_ref": "session:t10:m2",
300 "tags": ["project:klbr", "topic:calibration", "topic:support"]
301 },
302 {
303 "memory_id": 28,
304 "namespace": "default",
305 "layer": "L1",
306 "timeline_id": "t10-support-calibration",
307 "text": "the support-assisted operating point removed no-hit false answers on the dev slice, but answerable coverage settled at 0.8333 instead of the old 0.9167 balanced point.",
308 "event_time": 1776864600,
309 "status": "active",
310 "source_ref": "session:t10:m3",
311 "tags": ["project:klbr", "topic:calibration", "topic:support"]
312 },
313 {
314 "memory_id": 29,
315 "namespace": "default",
316 "layer": "L1",
317 "timeline_id": "t11-support-vs-taxonomy",
318 "text": "decided against adding many brittle query-type guards for calibration.",
319 "event_time": 1776866400,
320 "status": "active",
321 "source_ref": "session:t11:m1",
322 "tags": ["project:klbr", "topic:calibration", "topic:query-types"]
323 },
324 {
325 "memory_id": 30,
326 "namespace": "default",
327 "layer": "L1",
328 "timeline_id": "t11-support-vs-taxonomy",
329 "text": "the better direction is generic evidence support that works for both direct questions and non-question inputs.",
330 "event_time": 1776868200,
331 "status": "active",
332 "source_ref": "session:t11:m2",
333 "tags": ["project:klbr", "topic:support", "topic:passive-recall"]
334 },
335 {
336 "memory_id": 31,
337 "namespace": "default",
338 "layer": "L1",
339 "timeline_id": "t11-support-vs-taxonomy",
340 "text": "passive recall should reuse the same support features instead of introducing a separate query taxonomy.",
341 "event_time": 1776870000,
342 "status": "active",
343 "source_ref": "session:t11:m3",
344 "tags": ["project:klbr", "topic:support", "topic:passive-recall"]
345 },
346 {
347 "memory_id": 32,
348 "namespace": "default",
349 "layer": "L1",
350 "timeline_id": "t12-expand-dataset-first",
351 "text": "before passive recall work, the next step was to expand the eval dataset and rerun the retrieval benchmarks.",
352 "event_time": 1776871800,
353 "status": "active",
354 "source_ref": "session:t12:m1",
355 "tags": ["project:klbr", "topic:eval-data", "topic:benchmarking"]
356 },
357 {
358 "memory_id": 33,
359 "namespace": "default",
360 "layer": "L1",
361 "timeline_id": "t12-expand-dataset-first",
362 "text": "the starter eval set needed stronger coverage in no-hit, conflict/update, and multi-evidence cases.",
363 "event_time": 1776873600,
364 "status": "active",
365 "source_ref": "session:t12:m2",
366 "tags": ["project:klbr", "topic:eval-data", "topic:calibration"]
367 },
368 {
369 "memory_id": 34,
370 "namespace": "default",
371 "layer": "L1",
372 "timeline_id": "t12-expand-dataset-first",
373 "text": "the current thresholds should stay provisional until the larger dataset is benchmarked again.",
374 "event_time": 1776875400,
375 "status": "active",
376 "source_ref": "session:t12:m3",
377 "tags": ["project:klbr", "topic:calibration", "topic:eval-data"]
378 }
379 ],
380 "queries": [
381 {
382 "query_id": "dev_q1",
383 "split": "test",
384 "category": "exact recent event",
385 "namespace": "default",
386 "timeline_id": "t9-current-endpoints",
387 "text": "what is the reranker endpoint right now?",
388 "gold_memory_ids": [24],
389 "no_hit": false,
390 "gold_answer": "http://localhost:8003",
391 "reference_time": 1776852000
392 },
393 {
394 "query_id": "dev_q2",
395 "split": "test",
396 "category": "exact dated event",
397 "namespace": "default",
398 "timeline_id": "t5-llama-server-migration",
399 "text": "what did i switch to on april 18 when i moved off lm studio?",
400 "gold_memory_ids": [14],
401 "no_hit": false,
402 "gold_answer": "llama-server",
403 "reference_time": 1776517200
404 },
405 {
406 "query_id": "dev_q3",
407 "split": "dev",
408 "category": "vague recent lookup",
409 "namespace": "default",
410 "timeline_id": "t8-better-eval-data",
411 "text": "what have i been working on lately?",
412 "gold_memory_ids": [10, 22, 23],
413 "no_hit": false,
414 "gold_answer": "better eval data and the shared retrieval path",
415 "reference_time": 1776852000
416 },
417 {
418 "query_id": "dev_q4",
419 "split": "dev",
420 "category": "recurring theme / pattern-like question",
421 "namespace": "default",
422 "timeline_id": "t6-harness-observability",
423 "text": "what kind of infra pattern do i keep using in this project?",
424 "gold_memory_ids": [19],
425 "no_hit": false,
426 "gold_answer": "splitting chat, embedding, and rerank services by role",
427 "reference_time": 1776780000
428 },
429 {
430 "query_id": "dev_q5",
431 "split": "dev",
432 "category": "conflict / update question",
433 "namespace": "default",
434 "timeline_id": "t1-editor-and-embedding-split",
435 "text": "what editor am i using now for klbr?",
436 "gold_memory_ids": [2],
437 "no_hit": false,
438 "gold_answer": "zed",
439 "reference_time": 1776852000
440 },
441 {
442 "query_id": "dev_q6",
443 "split": "test",
444 "category": "conflict / update question",
445 "namespace": "default",
446 "timeline_id": "t7-latency-and-abstain",
447 "text": "is the reranker endpoint using https now?",
448 "gold_memory_ids": [12, 24],
449 "no_hit": false,
450 "gold_answer": "no, it should use plain http and the current endpoint is http://localhost:8003",
451 "reference_time": 1776852000
452 },
453 {
454 "query_id": "dev_q7",
455 "split": "test",
456 "category": "temporally ambiguous query",
457 "namespace": "default",
458 "timeline_id": "t5-llama-server-migration",
459 "text": "what happened around the migration?",
460 "gold_memory_ids": [14, 15, 16],
461 "no_hit": false,
462 "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings stayed separate on bge-m3",
463 "reference_time": 1776519000
464 },
465 {
466 "query_id": "dev_q8",
467 "split": "test",
468 "category": "multi-evidence query",
469 "namespace": "default",
470 "timeline_id": "t7-latency-and-abstain",
471 "text": "why did the benchmark latency improve?",
472 "gold_memory_ids": [20, 24],
473 "no_hit": false,
474 "gold_answer": "because the reranker endpoint switched to plain http and the model was warm",
475 "reference_time": 1776852000
476 },
477 {
478 "query_id": "dev_q9",
479 "split": "test",
480 "category": "no-hit query",
481 "namespace": "default",
482 "timeline_id": "t9-current-endpoints",
483 "text": "what postgres version am i running for klbr?",
484 "gold_memory_ids": [],
485 "no_hit": true,
486 "reference_time": 1776852000
487 },
488 {
489 "query_id": "dev_q10",
490 "split": "dev",
491 "category": "exact dated event",
492 "namespace": "default",
493 "timeline_id": "t2-error-propagation",
494 "text": "what was fixed on april 15 when the stream failed?",
495 "gold_memory_ids": [5],
496 "no_hit": false,
497 "gold_answer": "the daemon stopped swallowing llm stream errors and forwarded them to the tui",
498 "reference_time": 1776250800
499 },
500 {
501 "query_id": "dev_q11",
502 "split": "dev",
503 "category": "vague recent lookup",
504 "namespace": "default",
505 "timeline_id": "t3-benchmark-scaffolding",
506 "text": "what benchmark-related work has been happening?",
507 "gold_memory_ids": [8, 9, 10, 17, 18, 22],
508 "no_hit": false,
509 "gold_answer": "benchmark logging, timeline-aware eval data, retrieval plumbing, and replacing the toy eval set",
510 "reference_time": 1776852000
511 },
512 {
513 "query_id": "dev_q12",
514 "split": "dev",
515 "category": "multi-evidence query",
516 "namespace": "default",
517 "timeline_id": "t1-editor-and-embedding-split",
518 "text": "what setup is protecting us from embedding/model mismatch?",
519 "gold_memory_ids": [3, 7, 19],
520 "no_hit": false,
521 "gold_answer": "keeping embeddings separate on bge-m3 and separating services by role",
522 "reference_time": 1776852000
523 },
524 {
525 "query_id": "dev_q13",
526 "split": "dev",
527 "category": "no-hit query",
528 "namespace": "default",
529 "timeline_id": "t8-better-eval-data",
530 "route_label": "tools",
      "text": "what container runtime is the klbr daemon deployed with?",
532 "gold_memory_ids": [],
533 "no_hit": true,
534 "reference_time": 1776852000
535 },
536 {
537 "query_id": "dev_q14",
538 "split": "dev",
539 "category": "no-hit query",
540 "namespace": "default",
541 "timeline_id": "t6-harness-observability",
542 "route_label": "tools",
      "text": "which terminal multiplexer do i run the benchmark harness inside?",
544 "gold_memory_ids": [],
545 "no_hit": true,
546 "reference_time": 1776852000
547 },
548 {
549 "query_id": "dev_q15",
550 "split": "dev",
551 "category": "no-hit query",
552 "namespace": "default",
553 "timeline_id": "t1-editor-and-embedding-split",
554 "route_label": "tools",
      "text": "how much system ram does the benchmark machine have?",
556 "gold_memory_ids": [],
557 "no_hit": true,
558 "reference_time": 1776852000
559 },
560 {
561 "query_id": "dev_q16",
562 "split": "dev",
563 "category": "no-hit query",
564 "namespace": "default",
565 "timeline_id": "t3-benchmark-scaffolding",
566 "route_label": "tools",
      "text": "which code formatter runs in the klbr ci pipeline?",
568 "gold_memory_ids": [],
569 "no_hit": true,
570 "reference_time": 1776852000
571 },
572 {
573 "query_id": "test_q1",
574 "split": "dev",
575 "category": "exact recent event",
576 "namespace": "default",
577 "timeline_id": "t8-better-eval-data",
578 "text": "what changed between the live agent and the benchmark harness?",
579 "gold_memory_ids": [23],
580 "no_hit": false,
581 "gold_answer": "they now share the same time-windowed exact retrieval path",
582 "reference_time": 1776781800
583 },
584 {
585 "query_id": "test_q2",
586 "split": "dev",
587 "category": "conflict / update question",
588 "namespace": "default",
589 "timeline_id": "t1-editor-and-embedding-split",
590 "text": "was i still using neovim after the editor switch?",
591 "gold_memory_ids": [2],
592 "no_hit": false,
593 "gold_answer": "no, the newer fact says the editor switched to zed",
594 "reference_time": 1776852000
595 },
596 {
597 "query_id": "test_q3",
598 "split": "test",
599 "category": "temporally ambiguous query",
600 "namespace": "default",
601 "timeline_id": "t7-latency-and-abstain",
602 "text": "what happened around the latency fix?",
603 "gold_memory_ids": [20, 21],
604 "no_hit": false,
605 "gold_answer": "latency improved after moving to http and a remaining bug was the wrong abstention rule",
606 "reference_time": 1776688200
607 },
608 {
609 "query_id": "test_q4",
610 "split": "test",
611 "category": "no-hit query",
612 "namespace": "default",
613 "timeline_id": "t9-current-endpoints",
614 "text": "which browser am i using to inspect the benchmark output?",
615 "gold_memory_ids": [],
616 "no_hit": true,
617 "reference_time": 1776852000
618 },
619 {
620 "query_id": "test_q5",
621 "split": "test",
622 "category": "exact dated event",
623 "namespace": "default",
624 "timeline_id": "t4-reranker-plan",
625 "text": "what reranker did i decide to use on april 17?",
626 "gold_memory_ids": [13],
627 "no_hit": false,
628 "gold_answer": "bge-reranker-v2-m3",
629 "reference_time": 1776425400
630 },
631 {
632 "query_id": "test_q6",
633 "split": "test",
634 "category": "vague recent lookup",
635 "namespace": "default",
636 "timeline_id": "t5-llama-server-migration",
637 "text": "what changed in model serving around llama-server?",
638 "gold_memory_ids": [14, 15, 16],
639 "no_hit": false,
640 "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings remained separate",
641 "reference_time": 1776852000
642 },
643 {
644 "query_id": "test_q7",
645 "split": "dev",
646 "category": "multi-evidence query",
647 "namespace": "default",
648 "timeline_id": "t8-better-eval-data",
649 "text": "why do we need better benchmark data now?",
650 "gold_memory_ids": [18, 22],
651 "no_hit": false,
652 "gold_answer": "because the toy benchmark only proved plumbing and the eval set needs harder no-hit and ambiguity cases",
653 "reference_time": 1776852000
654 },
655 {
656 "query_id": "test_q8",
657 "split": "dev",
658 "category": "recurring theme / pattern-like question",
659 "namespace": "default",
660 "timeline_id": "t6-harness-observability",
661 "text": "what sort of retrieval work keeps coming up?",
662 "gold_memory_ids": [10, 19, 23],
663 "no_hit": false,
664 "gold_answer": "retrieval plumbing, service separation, and aligning the live and benchmark retrieval path",
665 "reference_time": 1776852000
666 },
667 {
668 "query_id": "test_q9",
669 "split": "test",
670 "category": "no-hit query",
671 "namespace": "default",
672 "timeline_id": "t9-current-endpoints",
673 "route_label": "tools",
674 "text": "what gpu am i using for the reranker?",
675 "required_tools": ["shell"],
676 "gold_memory_ids": [],
677 "no_hit": true,
678 "reference_time": 1776852000
679 },
680 {
681 "query_id": "test_q10",
682 "split": "dev",
683 "category": "exact recent event",
684 "namespace": "default",
685 "timeline_id": "t6-harness-observability",
686 "text": "why did i add progress logging to the benchmark harness?",
687 "gold_memory_ids": [17],
688 "no_hit": false,
689 "gold_answer": "because silent hangs were confusing",
690 "reference_time": 1776614400
691 },
692 {
693 "query_id": "test_q11",
694 "split": "test",
695 "category": "conflict / update question",
696 "namespace": "default",
697 "timeline_id": "t7-latency-and-abstain",
698 "text": "what was wrong with the benchmark after the reranker got fast?",
699 "gold_memory_ids": [21],
700 "no_hit": false,
701 "gold_answer": "it still abstained on a positive query because it used first-stage distance instead of rerank confidence",
702 "reference_time": 1776688200
703 },
704 {
705 "query_id": "test_q12",
706 "split": "dev",
707 "category": "temporally ambiguous query",
708 "namespace": "default",
709 "timeline_id": "t1-editor-and-embedding-split",
710 "text": "what happened around the embedding split?",
711 "gold_memory_ids": [2, 3, 4],
712 "no_hit": false,
713 "gold_answer": "the editor switched to zed, embeddings moved to a separate bge-m3 server, and exact retrieval stayed the first benchmark step",
714 "reference_time": 1776160800
715 },
716 {
717 "query_id": "dev_q17",
718 "split": "dev",
719 "category": "exact recent event",
720 "namespace": "default",
721 "timeline_id": "t10-support-calibration",
722 "text": "what support threshold looked best in the focused support sweep?",
723 "gold_memory_ids": [27],
724 "no_hit": false,
725 "gold_answer": "0.2",
726 "reference_time": 1776862800
727 },
728 {
729 "query_id": "dev_q18",
730 "split": "dev",
731 "category": "multi-evidence query",
732 "namespace": "default",
733 "timeline_id": "t10-support-calibration",
734 "text": "what changed after adding the support score to the policy?",
735 "gold_memory_ids": [26, 27, 28],
736 "no_hit": false,
737 "gold_answer": "the policy added weighted lexical support, settled on score -6.0 / margin 0.0 / support 0.2, and removed no-hit false answers with some coverage loss",
738 "reference_time": 1776864600
739 },
740 {
741 "query_id": "dev_q19",
742 "split": "dev",
743 "category": "recurring theme / pattern-like question",
744 "namespace": "default",
745 "timeline_id": "t11-support-vs-taxonomy",
746 "text": "what kind of calibration fix are we avoiding because it would be brittle?",
747 "gold_memory_ids": [29],
748 "no_hit": false,
749 "gold_answer": "many query-type guards",
750 "reference_time": 1776870000
751 },
752 {
753 "query_id": "dev_q20",
754 "split": "dev",
755 "category": "multi-evidence query",
756 "namespace": "default",
757 "timeline_id": "t11-support-vs-taxonomy",
758 "text": "how should passive recall connect to the current calibration work?",
759 "gold_memory_ids": [30, 31],
760 "no_hit": false,
761 "gold_answer": "it should reuse the same generic support features instead of a separate taxonomy",
762 "reference_time": 1776870000
763 },
764 {
765 "query_id": "dev_q21",
766 "split": "dev",
767 "category": "no-hit query",
768 "namespace": "default",
769 "timeline_id": "t11-support-vs-taxonomy",
770 "route_label": "tools",
771 "text": "which classifier model are we using to assign query types?",
772 "gold_memory_ids": [],
773 "no_hit": true,
774 "reference_time": 1776870000
775 },
776 {
777 "query_id": "dev_q22",
778 "split": "dev",
779 "category": "conflict / update question",
780 "namespace": "default",
781 "timeline_id": "t10-support-calibration",
782 "text": "did the support-assisted sweep get coverage all the way back to the old 0.9167 balanced point?",
783 "gold_memory_ids": [28],
784 "no_hit": false,
785 "gold_answer": "no, coverage settled at 0.8333",
786 "reference_time": 1776864600
787 },
788 {
789 "query_id": "dev_q23",
790 "split": "dev",
791 "category": "no-hit query",
792 "namespace": "default",
793 "timeline_id": "t8-better-eval-data",
794 "text": "what editor am i using for my dotfiles repo these days?",
795 "gold_memory_ids": [],
796 "no_hit": true,
797 "reference_time": 1776870000
798 },
799 {
800 "query_id": "dev_q24",
801 "split": "dev",
802 "category": "no-hit query",
803 "namespace": "default",
804 "timeline_id": "t3-benchmark-scaffolding",
805 "text": "what's the p99 rerank latency on the current machine?",
806 "gold_memory_ids": [],
807 "no_hit": true,
808 "reference_time": 1776870000
809 },
810 {
811 "query_id": "dev_q25",
812 "split": "dev",
813 "category": "no-hit query",
814 "namespace": "default",
815 "timeline_id": "t11-support-vs-taxonomy",
816 "text": "what's the commit hash for the router model we're using right now?",
817 "gold_memory_ids": [],
818 "no_hit": true,
819 "reference_time": 1776870000
820 },
821 {
822 "query_id": "dev_q26",
823 "split": "dev",
824 "category": "no-hit query",
825 "namespace": "default",
826 "timeline_id": "t6-harness-observability",
827 "text": "what rust edition is klbr using?",
828 "gold_memory_ids": [],
829 "no_hit": true,
830 "reference_time": 1776870000
831 },
832 {
833 "query_id": "dev_q27",
834 "split": "dev",
835 "category": "no-hit query",
836 "namespace": "default",
837 "timeline_id": "t10-support-calibration",
838 "text": "what's the websocket port the daemon is listening on?",
839 "gold_memory_ids": [],
840 "no_hit": true,
841 "reference_time": 1776870000
842 },
843 {
844 "query_id": "dev_q28",
845 "split": "dev",
846 "category": "no-hit query",
847 "namespace": "default",
848 "timeline_id": "t8-better-eval-data",
849 "text": "what's the default db filename the agent writes to?",
850 "gold_memory_ids": [],
851 "no_hit": true,
852 "reference_time": 1776870000
853 },
854 {
855 "query_id": "dev_q29",
856 "split": "dev",
857 "category": "no-hit query",
858 "namespace": "default",
859 "timeline_id": "t3-benchmark-scaffolding",
860 "text": "what's the current p95 embedding latency on this machine?",
861 "gold_memory_ids": [],
862 "no_hit": true,
863 "reference_time": 1776870000
864 },
865 {
866 "query_id": "dev_q30",
867 "split": "dev",
868 "category": "no-hit query",
869 "namespace": "default",
870 "timeline_id": "t6-harness-observability",
871 "text": "what's the sqlite-vec extension version we're using?",
872 "gold_memory_ids": [],
873 "no_hit": true,
874 "reference_time": 1776870000
875 },
876 {
877 "query_id": "dev_q31",
878 "split": "dev",
879 "category": "no-hit query",
880 "namespace": "default",
881 "timeline_id": "t11-support-vs-taxonomy",
882 "text": "what's the name of the nearest-neighbor index backend we're using in production?",
883 "gold_memory_ids": [],
884 "no_hit": true,
885 "reference_time": 1776870000
886 },
887 {
888 "query_id": "dev_q32",
889 "split": "dev",
890 "category": "no-hit query",
891 "namespace": "default",
892 "timeline_id": "t2-error-propagation",
893 "text": "what's the default watermark token budget for compaction?",
894 "gold_memory_ids": [],
895 "no_hit": true,
896 "reference_time": 1776870000
897 },
898 {
899 "query_id": "test_q13",
900 "split": "test",
901 "category": "exact recent event",
902 "namespace": "default",
903 "timeline_id": "t12-expand-dataset-first",
904 "text": "what are we doing before passive recall work?",
905 "gold_memory_ids": [32],
906 "no_hit": false,
907 "gold_answer": "expanding the eval dataset and rerunning the retrieval benchmarks",
908 "reference_time": 1776871800
909 },
910 {
911 "query_id": "test_q14",
912 "split": "test",
913 "category": "multi-evidence query",
914 "namespace": "default",
915 "timeline_id": "t12-expand-dataset-first",
916 "text": "why are we expanding the dataset before treating thresholds as final?",
917 "gold_memory_ids": [32, 33, 34],
918 "no_hit": false,
919 "gold_answer": "because the eval set needed stronger coverage and the thresholds should stay provisional until the larger dataset is benchmarked again",
920 "reference_time": 1776875400
921 },
922 {
923 "query_id": "test_q15",
924 "split": "test",
925 "category": "no-hit query",
926 "namespace": "default",
927 "timeline_id": "t12-expand-dataset-first",
928 "route_label": "tools",
929 "text": "which postgres extension are we enabling before passive recall?",
930 "required_tools": ["read_file"],
931 "gold_memory_ids": [],
932 "no_hit": true,
933 "reference_time": 1776875400
934 },
935 {
936 "query_id": "test_q16",
937 "split": "test",
938 "category": "recurring theme / pattern-like question",
939 "namespace": "default",
940 "timeline_id": "t12-expand-dataset-first",
941 "text": "what kinds of eval cases did the starter set need more of?",
942 "gold_memory_ids": [33],
943 "no_hit": false,
944 "gold_answer": "no-hit, conflict/update, and multi-evidence cases",
945 "reference_time": 1776873600
946 },
947 {
948 "query_id": "test_q17",
949 "split": "test",
950 "category": "conflict / update question",
951 "namespace": "default",
952 "timeline_id": "t12-expand-dataset-first",
953 "text": "are the current thresholds final now that we have one sweep?",
954 "gold_memory_ids": [34],
955 "no_hit": false,
956 "gold_answer": "no, they should remain provisional until the larger dataset is benchmarked again",
957 "reference_time": 1776875400
958 },
959 {
960 "query_id": "test_q18",
961 "split": "test",
962 "category": "temporally ambiguous query",
963 "namespace": "default",
964 "timeline_id": "t12-expand-dataset-first",
965 "text": "what was the sequence around the dataset expansion decision?",
966 "gold_memory_ids": [32, 33, 34],
967 "no_hit": false,
968 "gold_answer": "the next step was to expand the dataset, it needed more hard cases, and thresholds stayed provisional until rerunning the larger benchmark",
969 "reference_time": 1776875400
970 }
971 ]
972}