benchmarks/inputs/datasets/internal_eval_passive_recall_starter.json at main

ptr.pet / klbr
fork
ive harnessed the harness
fork
klbr / benchmarks / inputs / datasets / internal_eval_passive_recall_starter.json
at main 600 lines 21 kB view raw
wrap content
dawn memory and stuff 9d ago
c130f9e6
  1{
  2  "dataset_id": "internal-eval-passive-recall-starter-v2",
  3  "description": "Expanded passive-recall eval slice for KLBR. Covers activation, no-recall behavior, multi-evidence support, and update-style cues across dev and test splits.",
  4  "memories": [
  5    {
  6      "memory_id": 101,
  7      "namespace": "default",
  8      "layer": "L1",
  9      "timeline_id": "pr1-editor-and-embeddings-dev",
 10      "text": "switched the day-to-day klbr editor from neovim to zed.",
 11      "event_time": 1776157200,
 12      "status": "active",
 13      "source_ref": "session:pr1:m1",
 14      "tags": ["project:klbr", "topic:editor"]
 15    },
 16    {
 17      "memory_id": 102,
 18      "namespace": "default",
 19      "layer": "L1",
 20      "timeline_id": "pr1-editor-and-embeddings-dev",
 21      "text": "split embeddings onto a separate llama-server at localhost 8002 using bge-m3 with dimension 1024.",
 22      "event_time": 1776159000,
 23      "status": "active",
 24      "source_ref": "session:pr1:m2",
 25      "tags": ["project:klbr", "topic:embeddings"]
 26    },
 27    {
 28      "memory_id": 103,
 29      "namespace": "default",
 30      "layer": "L1",
 31      "timeline_id": "pr2-reranker-dev",
 32      "text": "the reranker endpoint is http://localhost:8003.",
 33      "event_time": 1776686400,
 34      "status": "active",
 35      "source_ref": "session:pr2:m1",
 36      "tags": ["project:klbr", "topic:reranker", "topic:networking"]
 37    },
 38    {
 39      "memory_id": 104,
 40      "namespace": "default",
 41      "layer": "L1",
 42      "timeline_id": "pr2-reranker-dev",
 43      "text": "the reranker choice for the mvp is bge-reranker-v2-m3.",
 44      "event_time": 1776425400,
 45      "status": "active",
 46      "source_ref": "session:pr2:m2",
 47      "tags": ["project:klbr", "topic:reranker"]
 48    },
 49    {
 50      "memory_id": 105,
 51      "namespace": "default",
 52      "layer": "L1",
 53      "timeline_id": "pr3-recent-work-dev",
 54      "text": "recent work has mostly been retrieval plumbing, instrumentation, and benchmark scaffolding.",
 55      "event_time": 1776355200,
 56      "status": "active",
 57      "source_ref": "session:pr3:m1",
 58      "tags": ["project:klbr", "topic:benchmarking", "topic:retrieval"]
 59    },
 60    {
 61      "memory_id": 106,
 62      "namespace": "default",
 63      "layer": "L1",
 64      "timeline_id": "pr3-recent-work-dev",
 65      "text": "added progress logging to the benchmark harness because silent hangs were confusing.",
 66      "event_time": 1776614400,
 67      "status": "active",
 68      "source_ref": "session:pr3:m2",
 69      "tags": ["project:klbr", "topic:benchmarking", "topic:logging"]
 70    },
 71    {
 72      "memory_id": 107,
 73      "namespace": "default",
 74      "layer": "L1",
 75      "timeline_id": "pr4-service-separation-dev",
 76      "text": "a recurring infra pattern is to separate chat, embedding, and rerank services by role to avoid model mismatch.",
 77      "event_time": 1776618000,
 78      "status": "active",
 79      "source_ref": "session:pr4:m1",
 80      "tags": ["project:klbr", "topic:infra", "topic:retrieval"]
 81    },
 82    {
 83      "memory_id": 108,
 84      "namespace": "default",
 85      "layer": "L1",
 86      "timeline_id": "pr5-errors-dev",
 87      "text": "fixed the daemon swallowing llm stream errors and forwarded the error to the tui.",
 88      "event_time": 1776250800,
 89      "status": "active",
 90      "source_ref": "session:pr5:m1",
 91      "tags": ["project:klbr", "topic:daemon", "topic:tui"]
 92    },
 93    {
 94      "memory_id": 109,
 95      "namespace": "default",
 96      "layer": "L1",
 97      "timeline_id": "pr6-support-calibration-dev",
 98      "text": "added a generic lexical support score based on weighted overlap between salient query terms and the top memory text.",
 99      "event_time": 1776861000,
100      "status": "active",
101      "source_ref": "session:pr6:m1",
102      "tags": ["project:klbr", "topic:calibration", "topic:support"]
103    },
104    {
105      "memory_id": 110,
106      "namespace": "default",
107      "layer": "L1",
108      "timeline_id": "pr6-support-calibration-dev",
109      "text": "a focused support sweep found the best operating point near rerank score -6.0, margin 0.0, and support 0.2.",
110      "event_time": 1776862800,
111      "status": "active",
112      "source_ref": "session:pr6:m2",
113      "tags": ["project:klbr", "topic:calibration", "topic:support"]
114    },
115    {
116      "memory_id": 111,
117      "namespace": "default",
118      "layer": "L1",
119      "timeline_id": "pr6-support-calibration-dev",
120      "text": "the support-assisted operating point removed no-hit false answers, but answerable coverage dropped compared with the looser balanced mode.",
121      "event_time": 1776864600,
122      "status": "active",
123      "source_ref": "session:pr6:m3",
124      "tags": ["project:klbr", "topic:calibration", "topic:support"]
125    },
126    {
127      "memory_id": 112,
128      "namespace": "default",
129      "layer": "L1",
130      "timeline_id": "pr7-passive-policy-dev",
131      "text": "passive recall should reuse generic evidence support rather than brittle query-type guards.",
132      "event_time": 1776866400,
133      "status": "active",
134      "source_ref": "session:pr7:m1",
135      "tags": ["project:klbr", "topic:passive-recall", "topic:support"]
136    },
137    {
138      "memory_id": 113,
139      "namespace": "default",
140      "layer": "L1",
141      "timeline_id": "pr7-passive-policy-dev",
142      "text": "memory use should be treated as two linked decisions: activation first, then trust in the recalled support.",
143      "event_time": 1776868200,
144      "status": "active",
145      "source_ref": "session:pr7:m2",
146      "tags": ["project:klbr", "topic:passive-recall", "topic:calibration"]
147    },
148    {
149      "memory_id": 114,
150      "namespace": "default",
151      "layer": "L1",
152      "timeline_id": "pr7-passive-policy-dev",
153      "text": "before doing more passive recall tuning, the eval dataset was expanded and the retrieval benchmarks were rerun.",
154      "event_time": 1776870000,
155      "status": "active",
156      "source_ref": "session:pr7:m3",
157      "tags": ["project:klbr", "topic:passive-recall", "topic:eval-data"]
158    },
159    {
160      "memory_id": 115,
161      "namespace": "default",
162      "layer": "L1",
163      "timeline_id": "pr7-passive-policy-dev",
164      "text": "the first passive-recall starter run had perfect activation, but support precision was much weaker than support recall.",
165      "event_time": 1776871800,
166      "status": "active",
167      "source_ref": "session:pr7:m4",
168      "tags": ["project:klbr", "topic:passive-recall", "topic:support"]
169    },
170    {
171      "memory_id": 116,
172      "namespace": "default",
173      "layer": "L1",
174      "timeline_id": "pr8-expanded-eval-test",
175      "text": "the main internal eval starter set was expanded from 25 memories and 28 queries to 34 memories and 40 queries.",
176      "event_time": 1776873600,
177      "status": "active",
178      "source_ref": "session:pr8:m1",
179      "tags": ["project:klbr", "topic:eval-data"]
180    },
181    {
182      "memory_id": 117,
183      "namespace": "default",
184      "layer": "L1",
185      "timeline_id": "pr8-expanded-eval-test",
186      "text": "after expansion, the dev slice contained 22 queries and 5 real no-hit cases.",
187      "event_time": 1776875400,
188      "status": "active",
189      "source_ref": "session:pr8:m2",
190      "tags": ["project:klbr", "topic:eval-data", "topic:no-hit"]
191    },
192    {
193      "memory_id": 118,
194      "namespace": "default",
195      "layer": "L1",
196      "timeline_id": "pr8-expanded-eval-test",
197      "text": "the support-calibrated operating point stayed on the frontier after the larger dataset rerun.",
198      "event_time": 1776877200,
199      "status": "active",
200      "source_ref": "session:pr8:m3",
201      "tags": ["project:klbr", "topic:calibration", "topic:support"]
202    },
203    {
204      "memory_id": 119,
205      "namespace": "default",
206      "layer": "L1",
207      "timeline_id": "pr9-passive-followup-test",
208      "text": "the next passive-recall task should focus on support-set quality rather than activation thresholds.",
209      "event_time": 1776879000,
210      "status": "active",
211      "source_ref": "session:pr9:m1",
212      "tags": ["project:klbr", "topic:passive-recall", "topic:support"]
213    },
214    {
215      "memory_id": 120,
216      "namespace": "default",
217      "layer": "L1",
218      "timeline_id": "pr9-passive-followup-test",
219      "text": "the passive dataset still needs more should-not-recall cases and more update-style cues.",
220      "event_time": 1776880800,
221      "status": "active",
222      "source_ref": "session:pr9:m2",
223      "tags": ["project:klbr", "topic:passive-recall", "topic:eval-data"]
224    },
225    {
226      "memory_id": 121,
227      "namespace": "default",
228      "layer": "L1",
229      "timeline_id": "pr9-passive-followup-test",
230      "text": "the passive-recall benchmark should have separate dev and test slices instead of staying dev-only.",
231      "event_time": 1776882600,
232      "status": "active",
233      "source_ref": "session:pr9:m3",
234      "tags": ["project:klbr", "topic:passive-recall", "topic:eval-data"]
235    },
236    {
237      "memory_id": 122,
238      "namespace": "default",
239      "layer": "L1",
240      "timeline_id": "pr9-passive-followup-test",
241      "text": "on the starter passive run, support precision was about 0.39 while support recall was about 0.94.",
242      "event_time": 1776884400,
243      "status": "active",
244      "source_ref": "session:pr9:m4",
245      "tags": ["project:klbr", "topic:passive-recall", "topic:metrics"]
246    }
247  ],
248  "queries": [
249    {
250      "query_id": "pr_dev_1",
251      "split": "dev",
252      "category": "passive recall / progress update",
253      "interaction_mode": "statement",
254      "objective": "passive_recall",
255      "namespace": "default",
256      "timeline_id": "pr3-recent-work-dev",
257      "text": "still working on the benchmark scaffolding and retrieval plumbing stuff today.",
258      "gold_memory_ids": [105],
259      "no_hit": false,
260      "expected_memory_action": "recall",
261      "reference_time": 1776860000
262    },
263    {
264      "query_id": "pr_dev_2",
265      "split": "dev",
266      "category": "passive recall / elliptical request",
267      "interaction_mode": "request",
268      "objective": "passive_recall",
269      "namespace": "default",
270      "timeline_id": "pr2-reranker-dev",
271      "text": "remind me which reranker model we settled on",
272      "gold_memory_ids": [104],
273      "no_hit": false,
274      "expected_memory_action": "recall",
275      "reference_time": 1776860000
276    },
277    {
278      "query_id": "pr_dev_3",
279      "split": "dev",
280      "category": "passive recall / declarative state cue",
281      "interaction_mode": "statement",
282      "objective": "passive_recall",
283      "namespace": "default",
284      "timeline_id": "pr1-editor-and-embeddings-dev",
285      "text": "i'm back in zed again for klbr work.",
286      "gold_memory_ids": [101],
287      "no_hit": false,
288      "expected_memory_action": "recall",
289      "reference_time": 1776860000
290    },
291    {
292      "query_id": "pr_dev_4",
293      "split": "dev",
294      "category": "passive recall / service endpoint cue",
295      "interaction_mode": "fragment",
296      "objective": "passive_recall",
297      "namespace": "default",
298      "timeline_id": "pr2-reranker-dev",
299      "text": "reranker on 8003",
300      "gold_memory_ids": [103],
301      "no_hit": false,
302      "expected_memory_action": "recall",
303      "reference_time": 1776860000
304    },
305    {
306      "query_id": "pr_dev_5",
307      "split": "dev",
308      "category": "passive recall / multi-evidence state",
309      "interaction_mode": "statement",
310      "objective": "passive_recall",
311      "namespace": "default",
312      "timeline_id": "pr4-service-separation-dev",
313      "text": "keeping chat, embeddings, and rerank split out seems like the cleanest setup.",
314      "gold_memory_ids": [107, 102, 103],
315      "no_hit": false,
316      "expected_memory_action": "recall",
317      "reference_time": 1776860000
318    },
319    {
320      "query_id": "pr_dev_6",
321      "split": "dev",
322      "category": "passive recall / no recall social chatter",
323      "interaction_mode": "statement",
324      "objective": "passive_recall",
325      "namespace": "default",
326      "timeline_id": "pr5-errors-dev",
327      "text": "lol yeah that makes sense",
328      "gold_memory_ids": [],
329      "no_hit": true,
330      "expected_memory_action": "no_recall",
331      "reference_time": 1776860000
332    },
333    {
334      "query_id": "pr_dev_7",
335      "split": "dev",
336      "category": "passive recall / no recall generic status",
337      "interaction_mode": "statement",
338      "objective": "passive_recall",
339      "namespace": "default",
340      "timeline_id": "pr5-errors-dev",
341      "text": "i'm kind of tired today",
342      "gold_memory_ids": [],
343      "no_hit": true,
344      "expected_memory_action": "no_recall",
345      "reference_time": 1776860000
346    },
347    {
348      "query_id": "pr_dev_8",
349      "split": "dev",
350      "category": "passive recall / error context cue",
351      "interaction_mode": "statement",
352      "objective": "passive_recall",
353      "namespace": "default",
354      "timeline_id": "pr5-errors-dev",
355      "text": "the tui finally shows the stream errors now.",
356      "gold_memory_ids": [108],
357      "no_hit": false,
358      "expected_memory_action": "recall",
359      "reference_time": 1776860000
360    },
361    {
362      "query_id": "pr_dev_9",
363      "split": "dev",
364      "category": "passive recall / calibration result statement",
365      "interaction_mode": "statement",
366      "objective": "passive_recall",
367      "namespace": "default",
368      "timeline_id": "pr6-support-calibration-dev",
369      "text": "the support feature ended up mattering more than i expected.",
370      "gold_memory_ids": [109, 110],
371      "no_hit": false,
372      "expected_memory_action": "recall",
373      "reference_time": 1776864600
374    },
375    {
376      "query_id": "pr_dev_10",
377      "split": "dev",
378      "category": "passive recall / anti taxonomy cue",
379      "interaction_mode": "statement",
380      "objective": "passive_recall",
381      "namespace": "default",
382      "timeline_id": "pr7-passive-policy-dev",
383      "text": "i still don't want to solve this with query-type guards.",
384      "gold_memory_ids": [112],
385      "no_hit": false,
386      "expected_memory_action": "recall",
387      "reference_time": 1776871800
388    },
389    {
390      "query_id": "pr_dev_11",
391      "split": "dev",
392      "category": "passive recall / planning cue",
393      "interaction_mode": "statement",
394      "objective": "passive_recall",
395      "namespace": "default",
396      "timeline_id": "pr7-passive-policy-dev",
397      "text": "before tuning passive recall further we needed a bigger eval slice.",
398      "gold_memory_ids": [114],
399      "no_hit": false,
400      "expected_memory_action": "recall",
401      "reference_time": 1776871800
402    },
403    {
404      "query_id": "pr_dev_12",
405      "split": "dev",
406      "category": "passive recall / support quality cue",
407      "interaction_mode": "statement",
408      "objective": "passive_recall",
409      "namespace": "default",
410      "timeline_id": "pr7-passive-policy-dev",
411      "text": "activation seems fine but the recalled set is still kind of noisy.",
412      "gold_memory_ids": [115],
413      "no_hit": false,
414      "expected_memory_action": "recall",
415      "reference_time": 1776871800
416    },
417    {
418      "query_id": "pr_dev_13",
419      "split": "dev",
420      "category": "passive recall / policy framing cue",
421      "interaction_mode": "statement",
422      "objective": "passive_recall",
423      "namespace": "default",
424      "timeline_id": "pr7-passive-policy-dev",
425      "text": "memory use really is activation first and then trust.",
426      "gold_memory_ids": [113],
427      "no_hit": false,
428      "expected_memory_action": "recall",
429      "reference_time": 1776871800
430    },
431    {
432      "query_id": "pr_dev_14",
433      "split": "dev",
434      "category": "passive recall / no recall off topic opinion",
435      "interaction_mode": "statement",
436      "objective": "passive_recall",
437      "namespace": "default",
438      "timeline_id": "pr7-passive-policy-dev",
439      "text": "that color palette is nice tbh",
440      "gold_memory_ids": [],
441      "no_hit": true,
442      "expected_memory_action": "no_recall",
443      "reference_time": 1776871800
444    },
445    {
446      "query_id": "pr_dev_15",
447      "split": "dev",
448      "category": "passive recall / no recall short acknowledgement",
449      "interaction_mode": "fragment",
450      "objective": "passive_recall",
451      "namespace": "default",
452      "timeline_id": "pr7-passive-policy-dev",
453      "text": "lol fair enough",
454      "gold_memory_ids": [],
455      "no_hit": true,
456      "expected_memory_action": "no_recall",
457      "reference_time": 1776871800
458    },
459    {
460      "query_id": "pr_test_1",
461      "split": "test",
462      "category": "passive recall / rerun result cue",
463      "interaction_mode": "statement",
464      "objective": "passive_recall",
465      "namespace": "default",
466      "timeline_id": "pr8-expanded-eval-test",
467      "text": "the bigger eval rerun still liked the support-calibrated point.",
468      "gold_memory_ids": [118],
469      "no_hit": false,
470      "expected_memory_action": "recall",
471      "reference_time": 1776884400
472    },
473    {
474      "query_id": "pr_test_2",
475      "split": "test",
476      "category": "passive recall / expanded dataset cue",
477      "interaction_mode": "statement",
478      "objective": "passive_recall",
479      "namespace": "default",
480      "timeline_id": "pr8-expanded-eval-test",
481      "text": "the expanded main eval set is way less toy now.",
482      "gold_memory_ids": [116, 117],
483      "no_hit": false,
484      "expected_memory_action": "recall",
485      "reference_time": 1776884400
486    },
487    {
488      "query_id": "pr_test_3",
489      "split": "test",
490      "category": "passive recall / planning request",
491      "interaction_mode": "request",
492      "objective": "passive_recall",
493      "namespace": "default",
494      "timeline_id": "pr9-passive-followup-test",
495      "text": "what should the next passive recall pass focus on again",
496      "gold_memory_ids": [119],
497      "no_hit": false,
498      "expected_memory_action": "recall",
499      "reference_time": 1776884400
500    },
501    {
502      "query_id": "pr_test_4",
503      "split": "test",
504      "category": "passive recall / dataset curation cue",
505      "interaction_mode": "statement",
506      "objective": "passive_recall",
507      "namespace": "default",
508      "timeline_id": "pr9-passive-followup-test",
509      "text": "we still need more no-recall and update-style passive items.",
510      "gold_memory_ids": [120],
511      "no_hit": false,
512      "expected_memory_action": "recall",
513      "reference_time": 1776884400
514    },
515    {
516      "query_id": "pr_test_5",
517      "split": "test",
518      "category": "passive recall / split planning cue",
519      "interaction_mode": "statement",
520      "objective": "passive_recall",
521      "namespace": "default",
522      "timeline_id": "pr9-passive-followup-test",
523      "text": "the passive benchmark should stop being dev-only.",
524      "gold_memory_ids": [121],
525      "no_hit": false,
526      "expected_memory_action": "recall",
527      "reference_time": 1776884400
528    },
529    {
530      "query_id": "pr_test_6",
531      "split": "test",
532      "category": "passive recall / metric fragment",
533      "interaction_mode": "fragment",
534      "objective": "passive_recall",
535      "namespace": "default",
536      "timeline_id": "pr9-passive-followup-test",
537      "text": "support precision was only around point three nine",
538      "gold_memory_ids": [122],
539      "no_hit": false,
540      "expected_memory_action": "recall",
541      "reference_time": 1776884400
542    },
543    {
544      "query_id": "pr_test_7",
545      "split": "test",
546      "category": "passive recall / no recall ambient need",
547      "interaction_mode": "statement",
548      "objective": "passive_recall",
549      "namespace": "default",
550      "timeline_id": "pr9-passive-followup-test",
551      "text": "i need coffee lol",
552      "gold_memory_ids": [],
553      "no_hit": true,
554      "expected_memory_action": "no_recall",
555      "reference_time": 1776884400
556    },
557    {
558      "query_id": "pr_test_8",
559      "split": "test",
560      "category": "passive recall / no recall empathy",
561      "interaction_mode": "statement",
562      "objective": "passive_recall",
563      "namespace": "default",
564      "timeline_id": "pr9-passive-followup-test",
565      "text": "that sounds annoying",
566      "gold_memory_ids": [],
567      "no_hit": true,
568      "expected_memory_action": "no_recall",
569      "reference_time": 1776884400
570    },
571    {
572      "query_id": "pr_test_9",
573      "split": "test",
574      "category": "passive recall / multi-evidence calibration cue",
575      "interaction_mode": "statement",
576      "objective": "passive_recall",
577      "namespace": "default",
578      "timeline_id": "pr8-expanded-eval-test",
579      "text": "the rerun made the calibration feel more real because the dataset got bigger and the same point stayed best.",
580      "gold_memory_ids": [116, 118],
581      "no_hit": false,
582      "expected_memory_action": "recall",
583      "reference_time": 1776884400
584    },
585    {
586      "query_id": "pr_test_10",
587      "split": "test",
588      "category": "passive recall / no recall short absence",
589      "interaction_mode": "fragment",
590      "objective": "passive_recall",
591      "namespace": "default",
592      "timeline_id": "pr9-passive-followup-test",
593      "text": "brb for a sec",
594      "gold_memory_ids": [],
595      "no_hit": true,
596      "expected_memory_action": "no_recall",
597      "reference_time": 1776884400
598    }
599  ]
600}
Configure Feed

Configure Feed