{
2 "dataset_id": "internal-eval-starter-v2",
3 "description": "Expanded internal eval set for KLBR MVP retrieval benchmarking. Timeline-aware, category-complete, and large enough to stress no-hit, support-calibration, and multi-evidence behavior more realistically.",
4 "memories": [
5 {
6 "memory_id": 1,
7 "namespace": "default",
8 "layer": "L1",
9 "timeline_id": "t0-editor-before-switch",
10 "text": "before the switch, the primary editor for klbr work was neovim.",
11 "event_time": 1776153600,
12 "status": "active",
13 "source_ref": "session:t0:m1",
14 "tags": ["project:klbr", "topic:editor"]
15 },
16 {
17 "memory_id": 2,
18 "namespace": "default",
19 "layer": "L1",
20 "timeline_id": "t1-editor-and-embedding-split",
21 "text": "switched the day-to-day klbr editor from neovim to zed.",
22 "event_time": 1776157200,
23 "status": "active",
24 "source_ref": "session:t1:m1",
25 "tags": ["project:klbr", "topic:editor"]
26 },
27 {
28 "memory_id": 3,
29 "namespace": "default",
30 "layer": "L1",
31 "timeline_id": "t1-editor-and-embedding-split",
32 "text": "split embeddings onto a separate llama-server at localhost 8002 using bge-m3 with dimension 1024.",
33 "event_time": 1776159000,
34 "status": "active",
35 "source_ref": "session:t1:m2",
36 "tags": ["project:klbr", "topic:embeddings"]
37 },
38 {
39 "memory_id": 4,
40 "namespace": "default",
41 "layer": "L1",
42 "timeline_id": "t1-editor-and-embedding-split",
43 "text": "the first benchmark slice should stay exact retrieval first before indexed search.",
44 "event_time": 1776160800,
45 "status": "active",
46 "source_ref": "session:t1:m3",
47 "tags": ["project:klbr", "topic:benchmarking"]
48 },
49 {
50 "memory_id": 5,
51 "namespace": "default",
52 "layer": "L1",
53 "timeline_id": "t2-error-propagation",
54 "text": "fixed the daemon swallowing llm stream errors and forwarded the error to the tui.",
55 "event_time": 1776250800,
56 "status": "active",
57 "source_ref": "session:t2:m1",
58 "tags": ["project:klbr", "topic:daemon", "topic:tui"]
59 },
60 {
61 "memory_id": 6,
62 "namespace": "default",
63 "layer": "L1",
64 "timeline_id": "t2-error-propagation",
65 "text": "compaction summaries are stored with the compaction_summary tag.",
66 "event_time": 1776252600,
67 "status": "active",
68 "source_ref": "session:t2:m2",
69 "tags": ["project:klbr", "topic:memory"]
70 },
71 {
72 "memory_id": 7,
73 "namespace": "default",
74 "layer": "L1",
75 "timeline_id": "t2-error-propagation",
76 "text": "the chat llama-server did not expose embeddings, so embeddings had to stay on a separate process.",
77 "event_time": 1776254400,
78 "status": "active",
79 "source_ref": "session:t2:m3",
80 "tags": ["project:klbr", "topic:embeddings", "topic:llama-server"]
81 },
82 {
83 "memory_id": 8,
84 "namespace": "default",
85 "layer": "L1",
86 "timeline_id": "t3-benchmark-scaffolding",
87 "text": "the benchmark harness should log per-query candidate lists and stage latencies.",
88 "event_time": 1776351600,
89 "status": "active",
90 "source_ref": "session:t3:m1",
91 "tags": ["project:klbr", "topic:benchmarking"]
92 },
93 {
94 "memory_id": 9,
95 "namespace": "default",
96 "layer": "L1",
97 "timeline_id": "t3-benchmark-scaffolding",
98 "text": "internal eval should split by timeline or session, not by individual question only.",
99 "event_time": 1776353400,
100 "status": "active",
101 "source_ref": "session:t3:m2",
102 "tags": ["project:klbr", "topic:benchmarking", "topic:eval-data"]
103 },
104 {
105 "memory_id": 10,
106 "namespace": "default",
107 "layer": "L1",
108 "timeline_id": "t3-benchmark-scaffolding",
109 "text": "recent work has mostly been retrieval plumbing, instrumentation, and benchmark scaffolding.",
110 "event_time": 1776355200,
111 "status": "active",
112 "source_ref": "session:t3:m3",
113 "tags": ["project:klbr", "topic:benchmarking", "topic:retrieval"]
114 },
115 {
116 "memory_id": 11,
117 "namespace": "default",
118 "layer": "L1",
119 "timeline_id": "t4-reranker-plan",
120 "text": "planned a separate reranker endpoint on localhost 8003.",
121 "event_time": 1776421800,
122 "status": "active",
123 "source_ref": "session:t4:m1",
124 "tags": ["project:klbr", "topic:reranker"]
125 },
126 {
127 "memory_id": 12,
128 "namespace": "default",
129 "layer": "L1",
130 "timeline_id": "t4-reranker-plan",
131 "text": "the cpu-only reranker should use plain http, not https.",
132 "event_time": 1776423600,
133 "status": "active",
134 "source_ref": "session:t4:m2",
135 "tags": ["project:klbr", "topic:reranker", "topic:networking"]
136 },
137 {
138 "memory_id": 13,
139 "namespace": "default",
140 "layer": "L1",
141 "timeline_id": "t4-reranker-plan",
142 "text": "the reranker choice for the mvp is bge-reranker-v2-m3.",
143 "event_time": 1776425400,
144 "status": "active",
145 "source_ref": "session:t4:m3",
146 "tags": ["project:klbr", "topic:reranker"]
147 },
148 {
149 "memory_id": 14,
150 "namespace": "default",
151 "layer": "L1",
152 "timeline_id": "t5-llama-server-migration",
153 "text": "moved chat model serving from lm studio to llama-server.",
154 "event_time": 1776517200,
155 "status": "active",
156 "source_ref": "session:t5:m1",
157 "tags": ["project:klbr", "topic:llama-server"]
158 },
159 {
160 "memory_id": 15,
161 "namespace": "default",
162 "layer": "L1",
163 "timeline_id": "t5-llama-server-migration",
164 "text": "around the migration, auto model discovery was added for chat and embeddings.",
165 "event_time": 1776519000,
166 "status": "active",
167 "source_ref": "session:t5:m2",
168 "tags": ["project:klbr", "topic:llama-server", "topic:models"]
169 },
170 {
171 "memory_id": 16,
172 "namespace": "default",
173 "layer": "L1",
174 "timeline_id": "t5-llama-server-migration",
175 "text": "the separate embedding server still uses bge-m3 rather than whatever chat model is currently loaded.",
176 "event_time": 1776520800,
177 "status": "active",
178 "source_ref": "session:t5:m3",
179 "tags": ["project:klbr", "topic:embeddings"]
180 },
181 {
182 "memory_id": 17,
183 "namespace": "default",
184 "layer": "L1",
185 "timeline_id": "t6-harness-observability",
186 "text": "added progress logging to the benchmark harness because silent hangs were confusing.",
187 "event_time": 1776614400,
188 "status": "active",
189 "source_ref": "session:t6:m1",
190 "tags": ["project:klbr", "topic:benchmarking", "topic:logging"]
191 },
192 {
193 "memory_id": 18,
194 "namespace": "default",
195 "layer": "L1",
196 "timeline_id": "t6-harness-observability",
197 "text": "the first tiny benchmark only proved the plumbing and was too small for calibration.",
198 "event_time": 1776616200,
199 "status": "active",
200 "source_ref": "session:t6:m2",
201 "tags": ["project:klbr", "topic:benchmarking", "topic:calibration"]
202 },
203 {
204 "memory_id": 19,
205 "namespace": "default",
206 "layer": "L1",
207 "timeline_id": "t6-harness-observability",
208 "text": "a recurring infra pattern is to separate chat, embedding, and rerank services by role to avoid model mismatch.",
209 "event_time": 1776618000,
210 "status": "active",
211 "source_ref": "session:t6:m3",
212 "tags": ["project:klbr", "topic:infra", "topic:retrieval"]
213 },
214 {
215 "memory_id": 20,
216 "namespace": "default",
217 "layer": "L1",
218 "timeline_id": "t7-latency-and-abstain",
219 "text": "rerank latency dropped sharply after switching the reranker endpoint to plain http and using a warmed model.",
220 "event_time": 1776686400,
221 "status": "active",
222 "source_ref": "session:t7:m1",
223 "tags": ["project:klbr", "topic:reranker", "topic:latency"]
224 },
225 {
226 "memory_id": 21,
227 "namespace": "default",
228 "layer": "L1",
229 "timeline_id": "t7-latency-and-abstain",
230 "text": "the benchmark still abstained wrongly on one positive query because abstention used first-stage distance instead of rerank confidence.",
231 "event_time": 1776688200,
232 "status": "active",
233 "source_ref": "session:t7:m2",
234 "tags": ["project:klbr", "topic:abstention", "topic:reranker"]
235 },
236 {
237 "memory_id": 22,
238 "namespace": "default",
239 "layer": "L1",
240 "timeline_id": "t8-better-eval-data",
241 "text": "a better internal eval set should include no-hit, temporal ambiguity, and multi-evidence cases.",
242 "event_time": 1776780000,
243 "status": "active",
244 "source_ref": "session:t8:m1",
245 "tags": ["project:klbr", "topic:eval-data"]
246 },
247 {
248 "memory_id": 23,
249 "namespace": "default",
250 "layer": "L1",
251 "timeline_id": "t8-better-eval-data",
252 "text": "the live agent and the benchmark harness now share the same time-windowed exact retrieval path.",
253 "event_time": 1776781800,
254 "status": "active",
255 "source_ref": "session:t8:m2",
256 "tags": ["project:klbr", "topic:retrieval"]
257 },
258 {
259 "memory_id": 24,
260 "namespace": "default",
261 "layer": "L1",
262 "timeline_id": "t9-current-endpoints",
263 "text": "the current reranker endpoint is http://localhost:8003.",
264 "event_time": 1776852000,
265 "status": "active",
266 "source_ref": "session:t9:m1",
267 "tags": ["project:klbr", "topic:reranker", "topic:networking"]
268 },
269 {
270 "memory_id": 25,
271 "namespace": "default",
272 "layer": "L1",
273 "timeline_id": "t9-current-endpoints",
274 "text": "the current embedding endpoint is http://localhost:8002.",
275 "event_time": 1776853800,
276 "status": "active",
277 "source_ref": "session:t9:m2",
278 "tags": ["project:klbr", "topic:embeddings", "topic:networking"]
279 },
280 {
281 "memory_id": 26,
282 "namespace": "default",
283 "layer": "L1",
284 "timeline_id": "t10-support-calibration",
285 "text": "added a generic lexical support score based on weighted overlap between salient query terms and the top memory text.",
286 "event_time": 1776861000,
287 "status": "active",
288 "source_ref": "session:t10:m1",
289 "tags": ["project:klbr", "topic:calibration", "topic:support"]
290 },
291 {
292 "memory_id": 27,
293 "namespace": "default",
294 "layer": "L1",
295 "timeline_id": "t10-support-calibration",
296 "text": "a focused support sweep found a useful operating point at rerank score -6.0, margin 0.0, and support 0.2.",
297 "event_time": 1776862800,
298 "status": "active",
299 "source_ref": "session:t10:m2",
300 "tags": ["project:klbr", "topic:calibration", "topic:support"]
301 },
302 {
303 "memory_id": 28,
304 "namespace": "default",
305 "layer": "L1",
306 "timeline_id": "t10-support-calibration",
307 "text": "the support-assisted operating point removed no-hit false answers on the dev slice, but answerable coverage settled at 0.8333 instead of the old 0.9167 balanced point.",
308 "event_time": 1776864600,
309 "status": "active",
310 "source_ref": "session:t10:m3",
311 "tags": ["project:klbr", "topic:calibration", "topic:support"]
312 },
313 {
314 "memory_id": 29,
315 "namespace": "default",
316 "layer": "L1",
317 "timeline_id": "t11-support-vs-taxonomy",
318 "text": "decided against adding many brittle query-type guards for calibration.",
319 "event_time": 1776866400,
320 "status": "active",
321 "source_ref": "session:t11:m1",
322 "tags": ["project:klbr", "topic:calibration", "topic:query-types"]
323 },
324 {
325 "memory_id": 30,
326 "namespace": "default",
327 "layer": "L1",
328 "timeline_id": "t11-support-vs-taxonomy",
329 "text": "the better direction is generic evidence support that works for both direct questions and non-question inputs.",
330 "event_time": 1776868200,
331 "status": "active",
332 "source_ref": "session:t11:m2",
333 "tags": ["project:klbr", "topic:support", "topic:passive-recall"]
334 },
335 {
336 "memory_id": 31,
337 "namespace": "default",
338 "layer": "L1",
339 "timeline_id": "t11-support-vs-taxonomy",
340 "text": "passive recall should reuse the same support features instead of introducing a separate query taxonomy.",
341 "event_time": 1776870000,
342 "status": "active",
343 "source_ref": "session:t11:m3",
344 "tags": ["project:klbr", "topic:support", "topic:passive-recall"]
345 },
346 {
347 "memory_id": 32,
348 "namespace": "default",
349 "layer": "L1",
350 "timeline_id": "t12-expand-dataset-first",
351 "text": "before passive recall work, the next step was to expand the eval dataset and rerun the retrieval benchmarks.",
352 "event_time": 1776871800,
353 "status": "active",
354 "source_ref": "session:t12:m1",
355 "tags": ["project:klbr", "topic:eval-data", "topic:benchmarking"]
356 },
357 {
358 "memory_id": 33,
359 "namespace": "default",
360 "layer": "L1",
361 "timeline_id": "t12-expand-dataset-first",
362 "text": "the starter eval set needed stronger coverage in no-hit, conflict/update, and multi-evidence cases.",
363 "event_time": 1776873600,
364 "status": "active",
365 "source_ref": "session:t12:m2",
366 "tags": ["project:klbr", "topic:eval-data", "topic:calibration"]
367 },
368 {
369 "memory_id": 34,
370 "namespace": "default",
371 "layer": "L1",
372 "timeline_id": "t12-expand-dataset-first",
373 "text": "the current thresholds should stay provisional until the larger dataset is benchmarked again.",
374 "event_time": 1776875400,
375 "status": "active",
376 "source_ref": "session:t12:m3",
377 "tags": ["project:klbr", "topic:calibration", "topic:eval-data"]
378 }
379 ],
380 "queries": [
381 {
382 "query_id": "dev_q1",
383 "split": "test",
384 "category": "exact recent event",
385 "namespace": "default",
386 "timeline_id": "t9-current-endpoints",
387 "text": "what is the reranker endpoint right now?",
388 "gold_memory_ids": [24],
389 "no_hit": false,
390 "gold_answer": "http://localhost:8003",
391 "reference_time": 1776852000
392 },
393 {
394 "query_id": "dev_q2",
395 "split": "test",
396 "category": "exact dated event",
397 "namespace": "default",
398 "timeline_id": "t5-llama-server-migration",
399 "text": "what did i switch to on april 18 when i moved off lm studio?",
400 "gold_memory_ids": [14],
401 "no_hit": false,
402 "gold_answer": "llama-server",
403 "reference_time": 1776517200
404 },
405 {
406 "query_id": "dev_q3",
407 "split": "dev",
408 "category": "vague recent lookup",
409 "namespace": "default",
410 "timeline_id": "t8-better-eval-data",
411 "text": "what have i been working on lately?",
412 "gold_memory_ids": [10, 22, 23],
413 "no_hit": false,
414 "gold_answer": "better eval data and the shared retrieval path",
415 "reference_time": 1776852000
416 },
417 {
418 "query_id": "dev_q4",
419 "split": "dev",
420 "category": "recurring theme / pattern-like question",
421 "namespace": "default",
422 "timeline_id": "t6-harness-observability",
423 "text": "what kind of infra pattern do i keep using in this project?",
424 "gold_memory_ids": [19],
425 "no_hit": false,
426 "gold_answer": "splitting chat, embedding, and rerank services by role",
427 "reference_time": 1776780000
428 },
429 {
430 "query_id": "dev_q5",
431 "split": "dev",
432 "category": "conflict / update question",
433 "namespace": "default",
434 "timeline_id": "t1-editor-and-embedding-split",
435 "text": "what editor am i using now for klbr?",
436 "gold_memory_ids": [2],
437 "no_hit": false,
438 "gold_answer": "zed",
439 "reference_time": 1776852000
440 },
441 {
442 "query_id": "dev_q6",
443 "split": "test",
444 "category": "conflict / update question",
445 "namespace": "default",
446 "timeline_id": "t7-latency-and-abstain",
447 "text": "is the reranker endpoint using https now?",
448 "gold_memory_ids": [12, 24],
449 "no_hit": false,
450 "gold_answer": "no, it should use plain http and the current endpoint is http://localhost:8003",
451 "reference_time": 1776852000
452 },
453 {
454 "query_id": "dev_q7",
455 "split": "test",
456 "category": "temporally ambiguous query",
457 "namespace": "default",
458 "timeline_id": "t5-llama-server-migration",
459 "text": "what happened around the migration?",
460 "gold_memory_ids": [14, 15, 16],
461 "no_hit": false,
462 "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings stayed separate on bge-m3",
463 "reference_time": 1776519000
464 },
465 {
466 "query_id": "dev_q8",
467 "split": "test",
468 "category": "multi-evidence query",
469 "namespace": "default",
470 "timeline_id": "t7-latency-and-abstain",
471 "text": "why did the benchmark latency improve?",
472 "gold_memory_ids": [20, 24],
473 "no_hit": false,
474 "gold_answer": "because the reranker endpoint switched to plain http and the model was warm",
475 "reference_time": 1776852000
476 },
477 {
478 "query_id": "dev_q9",
479 "split": "test",
480 "category": "no-hit query",
481 "namespace": "default",
482 "timeline_id": "t9-current-endpoints",
483 "text": "what postgres version am i running for klbr?",
484 "gold_memory_ids": [],
485 "no_hit": true,
486 "reference_time": 1776852000
487 },
488 {
489 "query_id": "dev_q10",
490 "split": "dev",
491 "category": "exact dated event",
492 "namespace": "default",
493 "timeline_id": "t2-error-propagation",
494 "text": "what was fixed on april 15 when the stream failed?",
495 "gold_memory_ids": [5],
496 "no_hit": false,
497 "gold_answer": "the daemon stopped swallowing llm stream errors and forwarded them to the tui",
498 "reference_time": 1776250800
499 },
500 {
501 "query_id": "dev_q11",
502 "split": "dev",
503 "category": "vague recent lookup",
504 "namespace": "default",
505 "timeline_id": "t3-benchmark-scaffolding",
506 "text": "what benchmark-related work has been happening?",
507 "gold_memory_ids": [8, 9, 10, 17, 18, 22],
508 "no_hit": false,
509 "gold_answer": "benchmark logging, timeline-aware eval data, retrieval plumbing, and replacing the toy eval set",
510 "reference_time": 1776852000
511 },
512 {
513 "query_id": "dev_q12",
514 "split": "dev",
515 "category": "multi-evidence query",
516 "namespace": "default",
517 "timeline_id": "t1-editor-and-embedding-split",
518 "text": "what setup is protecting us from embedding/model mismatch?",
519 "gold_memory_ids": [3, 7, 19],
520 "no_hit": false,
521 "gold_answer": "keeping embeddings separate on bge-m3 and separating services by role",
522 "reference_time": 1776852000
523 },
524 {
525 "query_id": "dev_q13",
526 "split": "dev",
527 "category": "no-hit query",
528 "namespace": "default",
529 "timeline_id": "t8-better-eval-data",
530 "route_label": "tools",
      "text": "what container runtime is the klbr daemon deployed with?",
532 "gold_memory_ids": [],
533 "no_hit": true,
534 "reference_time": 1776852000
535 },
536 {
537 "query_id": "dev_q14",
538 "split": "dev",
539 "category": "no-hit query",
540 "namespace": "default",
541 "timeline_id": "t6-harness-observability",
542 "route_label": "tools",
      "text": "which terminal multiplexer do i run the benchmark harness inside?",
544 "gold_memory_ids": [],
545 "no_hit": true,
546 "reference_time": 1776852000
547 },
548 {
549 "query_id": "dev_q15",
550 "split": "dev",
551 "category": "no-hit query",
552 "namespace": "default",
553 "timeline_id": "t1-editor-and-embedding-split",
554 "route_label": "tools",
      "text": "how much system ram does the benchmark machine have?",
556 "gold_memory_ids": [],
557 "no_hit": true,
558 "reference_time": 1776852000
559 },
560 {
561 "query_id": "dev_q16",
562 "split": "dev",
563 "category": "no-hit query",
564 "namespace": "default",
565 "timeline_id": "t3-benchmark-scaffolding",
566 "route_label": "tools",
      "text": "which code formatter runs in the klbr ci pipeline?",
568 "gold_memory_ids": [],
569 "no_hit": true,
570 "reference_time": 1776852000
571 },
572 {
573 "query_id": "test_q1",
574 "split": "dev",
575 "category": "exact recent event",
576 "namespace": "default",
577 "timeline_id": "t8-better-eval-data",
578 "text": "what changed between the live agent and the benchmark harness?",
579 "gold_memory_ids": [23],
580 "no_hit": false,
581 "gold_answer": "they now share the same time-windowed exact retrieval path",
582 "reference_time": 1776781800
583 },
584 {
585 "query_id": "test_q2",
586 "split": "dev",
587 "category": "conflict / update question",
588 "namespace": "default",
589 "timeline_id": "t1-editor-and-embedding-split",
590 "text": "was i still using neovim after the editor switch?",
591 "gold_memory_ids": [2],
592 "no_hit": false,
593 "gold_answer": "no, the newer fact says the editor switched to zed",
594 "reference_time": 1776852000
595 },
596 {
597 "query_id": "test_q3",
598 "split": "test",
599 "category": "temporally ambiguous query",
600 "namespace": "default",
601 "timeline_id": "t7-latency-and-abstain",
602 "text": "what happened around the latency fix?",
603 "gold_memory_ids": [20, 21],
604 "no_hit": false,
605 "gold_answer": "latency improved after moving to http and a remaining bug was the wrong abstention rule",
606 "reference_time": 1776688200
607 },
608 {
609 "query_id": "test_q4",
610 "split": "test",
611 "category": "no-hit query",
612 "namespace": "default",
613 "timeline_id": "t9-current-endpoints",
614 "text": "which browser am i using to inspect the benchmark output?",
615 "gold_memory_ids": [],
616 "no_hit": true,
617 "reference_time": 1776852000
618 },
619 {
620 "query_id": "test_q5",
621 "split": "test",
622 "category": "exact dated event",
623 "namespace": "default",
624 "timeline_id": "t4-reranker-plan",
625 "text": "what reranker did i decide to use on april 17?",
626 "gold_memory_ids": [13],
627 "no_hit": false,
628 "gold_answer": "bge-reranker-v2-m3",
629 "reference_time": 1776425400
630 },
631 {
632 "query_id": "test_q6",
633 "split": "test",
634 "category": "vague recent lookup",
635 "namespace": "default",
636 "timeline_id": "t5-llama-server-migration",
637 "text": "what changed in model serving around llama-server?",
638 "gold_memory_ids": [14, 15, 16],
639 "no_hit": false,
640 "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings remained separate",
641 "reference_time": 1776852000
642 },
643 {
644 "query_id": "test_q7",
645 "split": "dev",
646 "category": "multi-evidence query",
647 "namespace": "default",
648 "timeline_id": "t8-better-eval-data",
649 "text": "why do we need better benchmark data now?",
650 "gold_memory_ids": [18, 22],
651 "no_hit": false,
652 "gold_answer": "because the toy benchmark only proved plumbing and the eval set needs harder no-hit and ambiguity cases",
653 "reference_time": 1776852000
654 },
655 {
656 "query_id": "test_q8",
657 "split": "dev",
658 "category": "recurring theme / pattern-like question",
659 "namespace": "default",
660 "timeline_id": "t6-harness-observability",
661 "text": "what sort of retrieval work keeps coming up?",
662 "gold_memory_ids": [10, 19, 23],
663 "no_hit": false,
664 "gold_answer": "retrieval plumbing, service separation, and aligning the live and benchmark retrieval path",
665 "reference_time": 1776852000
666 },
667 {
668 "query_id": "test_q9",
669 "split": "test",
670 "category": "no-hit query",
671 "namespace": "default",
672 "timeline_id": "t9-current-endpoints",
673 "route_label": "tools",
674 "text": "what gpu am i using for the reranker?",
675 "required_tools": ["shell"],
676 "gold_memory_ids": [],
677 "no_hit": true,
678 "reference_time": 1776852000
679 },
680 {
681 "query_id": "test_q10",
682 "split": "dev",
683 "category": "exact recent event",
684 "namespace": "default",
685 "timeline_id": "t6-harness-observability",
686 "text": "why did i add progress logging to the benchmark harness?",
687 "gold_memory_ids": [17],
688 "no_hit": false,
689 "gold_answer": "because silent hangs were confusing",
690 "reference_time": 1776614400
691 },
692 {
693 "query_id": "test_q11",
694 "split": "test",
695 "category": "conflict / update question",
696 "namespace": "default",
697 "timeline_id": "t7-latency-and-abstain",
698 "text": "what was wrong with the benchmark after the reranker got fast?",
699 "gold_memory_ids": [21],
700 "no_hit": false,
701 "gold_answer": "it still abstained on a positive query because it used first-stage distance instead of rerank confidence",
702 "reference_time": 1776688200
703 },
704 {
705 "query_id": "test_q12",
706 "split": "dev",
707 "category": "temporally ambiguous query",
708 "namespace": "default",
709 "timeline_id": "t1-editor-and-embedding-split",
710 "text": "what happened around the embedding split?",
711 "gold_memory_ids": [2, 3, 4],
712 "no_hit": false,
713 "gold_answer": "the editor switched to zed, embeddings moved to a separate bge-m3 server, and exact retrieval stayed the first benchmark step",
714 "reference_time": 1776160800
715 },
716 {
717 "query_id": "dev_q17",
718 "split": "dev",
719 "category": "exact recent event",
720 "namespace": "default",
721 "timeline_id": "t10-support-calibration",
722 "text": "what support threshold looked best in the focused support sweep?",
723 "gold_memory_ids": [27],
724 "no_hit": false,
725 "gold_answer": "0.2",
726 "reference_time": 1776862800
727 },
728 {
729 "query_id": "dev_q18",
730 "split": "dev",
731 "category": "multi-evidence query",
732 "namespace": "default",
733 "timeline_id": "t10-support-calibration",
734 "text": "what changed after adding the support score to the policy?",
735 "gold_memory_ids": [26, 27, 28],
736 "no_hit": false,
737 "gold_answer": "the policy added weighted lexical support, settled on score -6.0 / margin 0.0 / support 0.2, and removed no-hit false answers with some coverage loss",
738 "reference_time": 1776864600
739 },
740 {
741 "query_id": "dev_q19",
742 "split": "dev",
743 "category": "recurring theme / pattern-like question",
744 "namespace": "default",
745 "timeline_id": "t11-support-vs-taxonomy",
746 "text": "what kind of calibration fix are we avoiding because it would be brittle?",
747 "gold_memory_ids": [29],
748 "no_hit": false,
749 "gold_answer": "many query-type guards",
750 "reference_time": 1776870000
751 },
752 {
753 "query_id": "dev_q20",
754 "split": "dev",
755 "category": "multi-evidence query",
756 "namespace": "default",
757 "timeline_id": "t11-support-vs-taxonomy",
758 "text": "how should passive recall connect to the current calibration work?",
759 "gold_memory_ids": [30, 31],
760 "no_hit": false,
761 "gold_answer": "it should reuse the same generic support features instead of a separate taxonomy",
762 "reference_time": 1776870000
763 },
764 {
765 "query_id": "dev_q21",
766 "split": "dev",
767 "category": "no-hit query",
768 "namespace": "default",
769 "timeline_id": "t11-support-vs-taxonomy",
770 "route_label": "tools",
771 "text": "which classifier model are we using to assign query types?",
772 "gold_memory_ids": [],
773 "no_hit": true,
774 "reference_time": 1776870000
775 },
776 {
777 "query_id": "dev_q22",
778 "split": "dev",
779 "category": "conflict / update question",
780 "namespace": "default",
781 "timeline_id": "t10-support-calibration",
782 "text": "did the support-assisted sweep get coverage all the way back to the old 0.9167 balanced point?",
783 "gold_memory_ids": [28],
784 "no_hit": false,
785 "gold_answer": "no, coverage settled at 0.8333",
786 "reference_time": 1776864600
787 },
788 {
789 "query_id": "dev_q23",
790 "split": "dev",
791 "category": "no-hit query",
792 "namespace": "default",
793 "timeline_id": "t8-better-eval-data",
794 "text": "what editor am i using for my dotfiles repo these days?",
795 "gold_memory_ids": [],
796 "no_hit": true,
797 "reference_time": 1776870000
798 },
799 {
800 "query_id": "dev_q24",
801 "split": "dev",
802 "category": "no-hit query",
803 "namespace": "default",
804 "timeline_id": "t3-benchmark-scaffolding",
805 "text": "what's the p99 rerank latency on the current machine?",
806 "gold_memory_ids": [],
807 "no_hit": true,
808 "reference_time": 1776870000
809 },
810 {
811 "query_id": "dev_q25",
812 "split": "dev",
813 "category": "no-hit query",
814 "namespace": "default",
815 "timeline_id": "t11-support-vs-taxonomy",
816 "text": "what's the commit hash for the router model we're using right now?",
817 "gold_memory_ids": [],
818 "no_hit": true,
819 "reference_time": 1776870000
820 },
821 {
822 "query_id": "dev_q26",
823 "split": "dev",
824 "category": "no-hit query",
825 "namespace": "default",
826 "timeline_id": "t6-harness-observability",
827 "text": "what rust edition is klbr using?",
828 "gold_memory_ids": [],
829 "no_hit": true,
830 "reference_time": 1776870000
831 },
832 {
833 "query_id": "dev_q27",
834 "split": "dev",
835 "category": "no-hit query",
836 "namespace": "default",
837 "timeline_id": "t10-support-calibration",
838 "text": "what's the websocket port the daemon is listening on?",
839 "gold_memory_ids": [],
840 "no_hit": true,
841 "reference_time": 1776870000
842 },
843 {
844 "query_id": "dev_q28",
845 "split": "dev",
846 "category": "no-hit query",
847 "namespace": "default",
848 "timeline_id": "t8-better-eval-data",
849 "text": "what's the default db filename the agent writes to?",
850 "gold_memory_ids": [],
851 "no_hit": true,
852 "reference_time": 1776870000
853 },
854 {
855 "query_id": "dev_q29",
856 "split": "dev",
857 "category": "no-hit query",
858 "namespace": "default",
859 "timeline_id": "t3-benchmark-scaffolding",
860 "text": "what's the current p95 embedding latency on this machine?",
861 "gold_memory_ids": [],
862 "no_hit": true,
863 "reference_time": 1776870000
864 },
865 {
866 "query_id": "dev_q30",
867 "split": "dev",
868 "category": "no-hit query",
869 "namespace": "default",
870 "timeline_id": "t6-harness-observability",
871 "text": "what's the sqlite-vec extension version we're using?",
872 "gold_memory_ids": [],
873 "no_hit": true,
874 "reference_time": 1776870000
875 },
876 {
877 "query_id": "dev_q31",
878 "split": "dev",
879 "category": "no-hit query",
880 "namespace": "default",
881 "timeline_id": "t11-support-vs-taxonomy",
882 "text": "what's the name of the nearest-neighbor index backend we're using in production?",
883 "gold_memory_ids": [],
884 "no_hit": true,
885 "reference_time": 1776870000
886 },
887 {
888 "query_id": "dev_q32",
889 "split": "dev",
890 "category": "no-hit query",
891 "namespace": "default",
892 "timeline_id": "t2-error-propagation",
893 "text": "what's the default watermark token budget for compaction?",
894 "gold_memory_ids": [],
895 "no_hit": true,
896 "reference_time": 1776870000
897 },
898 {
899 "query_id": "test_q13",
900 "split": "test",
901 "category": "exact recent event",
902 "namespace": "default",
903 "timeline_id": "t12-expand-dataset-first",
904 "text": "what are we doing before passive recall work?",
905 "gold_memory_ids": [32],
906 "no_hit": false,
907 "gold_answer": "expanding the eval dataset and rerunning the retrieval benchmarks",
908 "reference_time": 1776871800
909 },
910 {
911 "query_id": "test_q14",
912 "split": "test",
913 "category": "multi-evidence query",
914 "namespace": "default",
915 "timeline_id": "t12-expand-dataset-first",
916 "text": "why are we expanding the dataset before treating thresholds as final?",
917 "gold_memory_ids": [32, 33, 34],
918 "no_hit": false,
919 "gold_answer": "because the eval set needed stronger coverage and the thresholds should stay provisional until the larger dataset is benchmarked again",
920 "reference_time": 1776875400
921 },
922 {
923 "query_id": "test_q15",
924 "split": "test",
925 "category": "no-hit query",
926 "namespace": "default",
927 "timeline_id": "t12-expand-dataset-first",
928 "route_label": "tools",
929 "text": "which postgres extension are we enabling before passive recall?",
930 "required_tools": ["read_file"],
931 "gold_memory_ids": [],
932 "no_hit": true,
933 "reference_time": 1776875400
934 },
935 {
936 "query_id": "test_q16",
937 "split": "test",
938 "category": "recurring theme / pattern-like question",
939 "namespace": "default",
940 "timeline_id": "t12-expand-dataset-first",
941 "text": "what kinds of eval cases did the starter set need more of?",
942 "gold_memory_ids": [33],
943 "no_hit": false,
944 "gold_answer": "no-hit, conflict/update, and multi-evidence cases",
945 "reference_time": 1776873600
946 },
947 {
948 "query_id": "test_q17",
949 "split": "test",
950 "category": "conflict / update question",
951 "namespace": "default",
952 "timeline_id": "t12-expand-dataset-first",
953 "text": "are the current thresholds final now that we have one sweep?",
954 "gold_memory_ids": [34],
955 "no_hit": false,
956 "gold_answer": "no, they should remain provisional until the larger dataset is benchmarked again",
957 "reference_time": 1776875400
958 },
959 {
960 "query_id": "test_q18",
961 "split": "test",
962 "category": "temporally ambiguous query",
963 "namespace": "default",
964 "timeline_id": "t12-expand-dataset-first",
965 "text": "what was the sequence around the dataset expansion decision?",
966 "gold_memory_ids": [32, 33, 34],
967 "no_hit": false,
968 "gold_answer": "the next step was to expand the dataset, it needed more hard cases, and thresholds stayed provisional until rerunning the larger benchmark",
969 "reference_time": 1776875400
970 }
971 ]
972}