ive harnessed the harness
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 972 lines 34 kB view raw
1{ 2 "dataset_id": "internal-eval-starter-v2", 3 "description": "Expanded internal eval set for KLBR MVP retrieval benchmarking. Timeline-aware, category-complete, and large enough to stress no-hit, support-calibration, and multi-evidence behavior more realistically.", 4 "memories": [ 5 { 6 "memory_id": 1, 7 "namespace": "default", 8 "layer": "L1", 9 "timeline_id": "t0-editor-before-switch", 10 "text": "before the switch, the primary editor for klbr work was neovim.", 11 "event_time": 1776153600, 12 "status": "active", 13 "source_ref": "session:t0:m1", 14 "tags": ["project:klbr", "topic:editor"] 15 }, 16 { 17 "memory_id": 2, 18 "namespace": "default", 19 "layer": "L1", 20 "timeline_id": "t1-editor-and-embedding-split", 21 "text": "switched the day-to-day klbr editor from neovim to zed.", 22 "event_time": 1776157200, 23 "status": "active", 24 "source_ref": "session:t1:m1", 25 "tags": ["project:klbr", "topic:editor"] 26 }, 27 { 28 "memory_id": 3, 29 "namespace": "default", 30 "layer": "L1", 31 "timeline_id": "t1-editor-and-embedding-split", 32 "text": "split embeddings onto a separate llama-server at localhost 8002 using bge-m3 with dimension 1024.", 33 "event_time": 1776159000, 34 "status": "active", 35 "source_ref": "session:t1:m2", 36 "tags": ["project:klbr", "topic:embeddings"] 37 }, 38 { 39 "memory_id": 4, 40 "namespace": "default", 41 "layer": "L1", 42 "timeline_id": "t1-editor-and-embedding-split", 43 "text": "the first benchmark slice should stay exact retrieval first before indexed search.", 44 "event_time": 1776160800, 45 "status": "active", 46 "source_ref": "session:t1:m3", 47 "tags": ["project:klbr", "topic:benchmarking"] 48 }, 49 { 50 "memory_id": 5, 51 "namespace": "default", 52 "layer": "L1", 53 "timeline_id": "t2-error-propagation", 54 "text": "fixed the daemon swallowing llm stream errors and forwarded the error to the tui.", 55 "event_time": 1776250800, 56 "status": "active", 57 "source_ref": "session:t2:m1", 58 "tags": ["project:klbr", "topic:daemon", "topic:tui"] 59 }, 60 { 61 "memory_id": 6, 62 "namespace": "default", 63 "layer": "L1", 64 "timeline_id": "t2-error-propagation", 65 "text": "compaction summaries are stored with the compaction_summary tag.", 66 "event_time": 1776252600, 67 "status": "active", 68 "source_ref": "session:t2:m2", 69 "tags": ["project:klbr", "topic:memory"] 70 }, 71 { 72 "memory_id": 7, 73 "namespace": "default", 74 "layer": "L1", 75 "timeline_id": "t2-error-propagation", 76 "text": "the chat llama-server did not expose embeddings, so embeddings had to stay on a separate process.", 77 "event_time": 1776254400, 78 "status": "active", 79 "source_ref": "session:t2:m3", 80 "tags": ["project:klbr", "topic:embeddings", "topic:llama-server"] 81 }, 82 { 83 "memory_id": 8, 84 "namespace": "default", 85 "layer": "L1", 86 "timeline_id": "t3-benchmark-scaffolding", 87 "text": "the benchmark harness should log per-query candidate lists and stage latencies.", 88 "event_time": 1776351600, 89 "status": "active", 90 "source_ref": "session:t3:m1", 91 "tags": ["project:klbr", "topic:benchmarking"] 92 }, 93 { 94 "memory_id": 9, 95 "namespace": "default", 96 "layer": "L1", 97 "timeline_id": "t3-benchmark-scaffolding", 98 "text": "internal eval should split by timeline or session, not by individual question only.", 99 "event_time": 1776353400, 100 "status": "active", 101 "source_ref": "session:t3:m2", 102 "tags": ["project:klbr", "topic:benchmarking", "topic:eval-data"] 103 }, 104 { 105 "memory_id": 10, 106 "namespace": "default", 107 "layer": "L1", 108 "timeline_id": "t3-benchmark-scaffolding", 109 "text": "recent work has mostly been retrieval plumbing, instrumentation, and benchmark scaffolding.", 110 "event_time": 1776355200, 111 "status": "active", 112 "source_ref": "session:t3:m3", 113 "tags": ["project:klbr", "topic:benchmarking", "topic:retrieval"] 114 }, 115 { 116 "memory_id": 11, 117 "namespace": "default", 118 "layer": "L1", 119 "timeline_id": "t4-reranker-plan", 120 "text": "planned a separate reranker endpoint on localhost 8003.", 121 "event_time": 1776421800, 122 "status": "active", 123 "source_ref": "session:t4:m1", 124 "tags": ["project:klbr", "topic:reranker"] 125 }, 126 { 127 "memory_id": 12, 128 "namespace": "default", 129 "layer": "L1", 130 "timeline_id": "t4-reranker-plan", 131 "text": "the cpu-only reranker should use plain http, not https.", 132 "event_time": 1776423600, 133 "status": "active", 134 "source_ref": "session:t4:m2", 135 "tags": ["project:klbr", "topic:reranker", "topic:networking"] 136 }, 137 { 138 "memory_id": 13, 139 "namespace": "default", 140 "layer": "L1", 141 "timeline_id": "t4-reranker-plan", 142 "text": "the reranker choice for the mvp is bge-reranker-v2-m3.", 143 "event_time": 1776425400, 144 "status": "active", 145 "source_ref": "session:t4:m3", 146 "tags": ["project:klbr", "topic:reranker"] 147 }, 148 { 149 "memory_id": 14, 150 "namespace": "default", 151 "layer": "L1", 152 "timeline_id": "t5-llama-server-migration", 153 "text": "moved chat model serving from lm studio to llama-server.", 154 "event_time": 1776517200, 155 "status": "active", 156 "source_ref": "session:t5:m1", 157 "tags": ["project:klbr", "topic:llama-server"] 158 }, 159 { 160 "memory_id": 15, 161 "namespace": "default", 162 "layer": "L1", 163 "timeline_id": "t5-llama-server-migration", 164 "text": "around the migration, auto model discovery was added for chat and embeddings.", 165 "event_time": 1776519000, 166 "status": "active", 167 "source_ref": "session:t5:m2", 168 "tags": ["project:klbr", "topic:llama-server", "topic:models"] 169 }, 170 { 171 "memory_id": 16, 172 "namespace": "default", 173 "layer": "L1", 174 "timeline_id": "t5-llama-server-migration", 175 "text": "the separate embedding server still uses bge-m3 rather than whatever chat model is currently loaded.", 176 "event_time": 1776520800, 177 "status": "active", 178 "source_ref": "session:t5:m3", 179 "tags": ["project:klbr", "topic:embeddings"] 180 }, 181 { 182 "memory_id": 17, 183 "namespace": "default", 184 "layer": "L1", 185 "timeline_id": "t6-harness-observability", 186 "text": "added progress logging to the benchmark harness because silent hangs were confusing.", 187 "event_time": 1776614400, 188 "status": "active", 189 "source_ref": "session:t6:m1", 190 "tags": ["project:klbr", "topic:benchmarking", "topic:logging"] 191 }, 192 { 193 "memory_id": 18, 194 "namespace": "default", 195 "layer": "L1", 196 "timeline_id": "t6-harness-observability", 197 "text": "the first tiny benchmark only proved the plumbing and was too small for calibration.", 198 "event_time": 1776616200, 199 "status": "active", 200 "source_ref": "session:t6:m2", 201 "tags": ["project:klbr", "topic:benchmarking", "topic:calibration"] 202 }, 203 { 204 "memory_id": 19, 205 "namespace": "default", 206 "layer": "L1", 207 "timeline_id": "t6-harness-observability", 208 "text": "a recurring infra pattern is to separate chat, embedding, and rerank services by role to avoid model mismatch.", 209 "event_time": 1776618000, 210 "status": "active", 211 "source_ref": "session:t6:m3", 212 "tags": ["project:klbr", "topic:infra", "topic:retrieval"] 213 }, 214 { 215 "memory_id": 20, 216 "namespace": "default", 217 "layer": "L1", 218 "timeline_id": "t7-latency-and-abstain", 219 "text": "rerank latency dropped sharply after switching the reranker endpoint to plain http and using a warmed model.", 220 "event_time": 1776686400, 221 "status": "active", 222 "source_ref": "session:t7:m1", 223 "tags": ["project:klbr", "topic:reranker", "topic:latency"] 224 }, 225 { 226 "memory_id": 21, 227 "namespace": "default", 228 "layer": "L1", 229 "timeline_id": "t7-latency-and-abstain", 230 "text": "the benchmark still abstained wrongly on one positive query because abstention used first-stage distance instead of rerank confidence.", 231 "event_time": 1776688200, 232 "status": "active", 233 "source_ref": "session:t7:m2", 234 "tags": ["project:klbr", "topic:abstention", "topic:reranker"] 235 }, 236 { 237 "memory_id": 22, 238 "namespace": "default", 239 "layer": "L1", 240 "timeline_id": "t8-better-eval-data", 241 "text": "a better internal eval set should include no-hit, temporal ambiguity, and multi-evidence cases.", 242 "event_time": 1776780000, 243 "status": "active", 244 "source_ref": "session:t8:m1", 245 "tags": ["project:klbr", "topic:eval-data"] 246 }, 247 { 248 "memory_id": 23, 249 "namespace": "default", 250 "layer": "L1", 251 "timeline_id": "t8-better-eval-data", 252 "text": "the live agent and the benchmark harness now share the same time-windowed exact retrieval path.", 253 "event_time": 1776781800, 254 "status": "active", 255 "source_ref": "session:t8:m2", 256 "tags": ["project:klbr", "topic:retrieval"] 257 }, 258 { 259 "memory_id": 24, 260 "namespace": "default", 261 "layer": "L1", 262 "timeline_id": "t9-current-endpoints", 263 "text": "the current reranker endpoint is http://localhost:8003.", 264 "event_time": 1776852000, 265 "status": "active", 266 "source_ref": "session:t9:m1", 267 "tags": ["project:klbr", "topic:reranker", "topic:networking"] 268 }, 269 { 270 "memory_id": 25, 271 "namespace": "default", 272 "layer": "L1", 273 "timeline_id": "t9-current-endpoints", 274 "text": "the current embedding endpoint is http://localhost:8002.", 275 "event_time": 1776853800, 276 "status": "active", 277 "source_ref": "session:t9:m2", 278 "tags": ["project:klbr", "topic:embeddings", "topic:networking"] 279 }, 280 { 281 "memory_id": 26, 282 "namespace": "default", 283 "layer": "L1", 284 "timeline_id": "t10-support-calibration", 285 "text": "added a generic lexical support score based on weighted overlap between salient query terms and the top memory text.", 286 "event_time": 1776861000, 287 "status": "active", 288 "source_ref": "session:t10:m1", 289 "tags": ["project:klbr", "topic:calibration", "topic:support"] 290 }, 291 { 292 "memory_id": 27, 293 "namespace": "default", 294 "layer": "L1", 295 "timeline_id": "t10-support-calibration", 296 "text": "a focused support sweep found a useful operating point at rerank score -6.0, margin 0.0, and support 0.2.", 297 "event_time": 1776862800, 298 "status": "active", 299 "source_ref": "session:t10:m2", 300 "tags": ["project:klbr", "topic:calibration", "topic:support"] 301 }, 302 { 303 "memory_id": 28, 304 "namespace": "default", 305 "layer": "L1", 306 "timeline_id": "t10-support-calibration", 307 "text": "the support-assisted operating point removed no-hit false answers on the dev slice, but answerable coverage settled at 0.8333 instead of the old 0.9167 balanced point.", 308 "event_time": 1776864600, 309 "status": "active", 310 "source_ref": "session:t10:m3", 311 "tags": ["project:klbr", "topic:calibration", "topic:support"] 312 }, 313 { 314 "memory_id": 29, 315 "namespace": "default", 316 "layer": "L1", 317 "timeline_id": "t11-support-vs-taxonomy", 318 "text": "decided against adding many brittle query-type guards for calibration.", 319 "event_time": 1776866400, 320 "status": "active", 321 "source_ref": "session:t11:m1", 322 "tags": ["project:klbr", "topic:calibration", "topic:query-types"] 323 }, 324 { 325 "memory_id": 30, 326 "namespace": "default", 327 "layer": "L1", 328 "timeline_id": "t11-support-vs-taxonomy", 329 "text": "the better direction is generic evidence support that works for both direct questions and non-question inputs.", 330 "event_time": 1776868200, 331 "status": "active", 332 "source_ref": "session:t11:m2", 333 "tags": ["project:klbr", "topic:support", "topic:passive-recall"] 334 }, 335 { 336 "memory_id": 31, 337 "namespace": "default", 338 "layer": "L1", 339 "timeline_id": "t11-support-vs-taxonomy", 340 "text": "passive recall should reuse the same support features instead of introducing a separate query taxonomy.", 341 "event_time": 1776870000, 342 "status": "active", 343 "source_ref": "session:t11:m3", 344 "tags": ["project:klbr", "topic:support", "topic:passive-recall"] 345 }, 346 { 347 "memory_id": 32, 348 "namespace": "default", 349 "layer": "L1", 350 "timeline_id": "t12-expand-dataset-first", 351 "text": "before passive recall work, the next step was to expand the eval dataset and rerun the retrieval benchmarks.", 352 "event_time": 1776871800, 353 "status": "active", 354 "source_ref": "session:t12:m1", 355 "tags": ["project:klbr", "topic:eval-data", "topic:benchmarking"] 356 }, 357 { 358 "memory_id": 33, 359 "namespace": "default", 360 "layer": "L1", 361 "timeline_id": "t12-expand-dataset-first", 362 "text": "the starter eval set needed stronger coverage in no-hit, conflict/update, and multi-evidence cases.", 363 "event_time": 1776873600, 364 "status": "active", 365 "source_ref": "session:t12:m2", 366 "tags": ["project:klbr", "topic:eval-data", "topic:calibration"] 367 }, 368 { 369 "memory_id": 34, 370 "namespace": "default", 371 "layer": "L1", 372 "timeline_id": "t12-expand-dataset-first", 373 "text": "the current thresholds should stay provisional until the larger dataset is benchmarked again.", 374 "event_time": 1776875400, 375 "status": "active", 376 "source_ref": "session:t12:m3", 377 "tags": ["project:klbr", "topic:calibration", "topic:eval-data"] 378 } 379 ], 380 "queries": [ 381 { 382 "query_id": "dev_q1", 383 "split": "test", 384 "category": "exact recent event", 385 "namespace": "default", 386 "timeline_id": "t9-current-endpoints", 387 "text": "what is the reranker endpoint right now?", 388 "gold_memory_ids": [24], 389 "no_hit": false, 390 "gold_answer": "http://localhost:8003", 391 "reference_time": 1776852000 392 }, 393 { 394 "query_id": "dev_q2", 395 "split": "test", 396 "category": "exact dated event", 397 "namespace": "default", 398 "timeline_id": "t5-llama-server-migration", 399 "text": "what did i switch to on april 18 when i moved off lm studio?", 400 "gold_memory_ids": [14], 401 "no_hit": false, 402 "gold_answer": "llama-server", 403 "reference_time": 1776517200 404 }, 405 { 406 "query_id": "dev_q3", 407 "split": "dev", 408 "category": "vague recent lookup", 409 "namespace": "default", 410 "timeline_id": "t8-better-eval-data", 411 "text": "what have i been working on lately?", 412 "gold_memory_ids": [10, 22, 23], 413 "no_hit": false, 414 "gold_answer": "better eval data and the shared retrieval path", 415 "reference_time": 1776852000 416 }, 417 { 418 "query_id": "dev_q4", 419 "split": "dev", 420 "category": "recurring theme / pattern-like question", 421 "namespace": "default", 422 "timeline_id": "t6-harness-observability", 423 "text": "what kind of infra pattern do i keep using in this project?", 424 "gold_memory_ids": [19], 425 "no_hit": false, 426 "gold_answer": "splitting chat, embedding, and rerank services by role", 427 "reference_time": 1776780000 428 }, 429 { 430 "query_id": "dev_q5", 431 "split": "dev", 432 "category": "conflict / update question", 433 "namespace": "default", 434 "timeline_id": "t1-editor-and-embedding-split", 435 "text": "what editor am i using now for klbr?", 436 "gold_memory_ids": [2], 437 "no_hit": false, 438 "gold_answer": "zed", 439 "reference_time": 1776852000 440 }, 441 { 442 "query_id": "dev_q6", 443 "split": "test", 444 "category": "conflict / update question", 445 "namespace": "default", 446 "timeline_id": "t7-latency-and-abstain", 447 "text": "is the reranker endpoint using https now?", 448 "gold_memory_ids": [12, 24], 449 "no_hit": false, 450 "gold_answer": "no, it should use plain http and the current endpoint is http://localhost:8003", 451 "reference_time": 1776852000 452 }, 453 { 454 "query_id": "dev_q7", 455 "split": "test", 456 "category": "temporally ambiguous query", 457 "namespace": "default", 458 "timeline_id": "t5-llama-server-migration", 459 "text": "what happened around the migration?", 460 "gold_memory_ids": [14, 15, 16], 461 "no_hit": false, 462 "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings stayed separate on bge-m3", 463 "reference_time": 1776519000 464 }, 465 { 466 "query_id": "dev_q8", 467 "split": "test", 468 "category": "multi-evidence query", 469 "namespace": "default", 470 "timeline_id": "t7-latency-and-abstain", 471 "text": "why did the benchmark latency improve?", 472 "gold_memory_ids": [20, 24], 473 "no_hit": false, 474 "gold_answer": "because the reranker endpoint switched to plain http and the model was warm", 475 "reference_time": 1776852000 476 }, 477 { 478 "query_id": "dev_q9", 479 "split": "test", 480 "category": "no-hit query", 481 "namespace": "default", 482 "timeline_id": "t9-current-endpoints", 483 "text": "what postgres version am i running for klbr?", 484 "gold_memory_ids": [], 485 "no_hit": true, 486 "reference_time": 1776852000 487 }, 488 { 489 "query_id": "dev_q10", 490 "split": "dev", 491 "category": "exact dated event", 492 "namespace": "default", 493 "timeline_id": "t2-error-propagation", 494 "text": "what was fixed on april 15 when the stream failed?", 495 "gold_memory_ids": [5], 496 "no_hit": false, 497 "gold_answer": "the daemon stopped swallowing llm stream errors and forwarded them to the tui", 498 "reference_time": 1776250800 499 }, 500 { 501 "query_id": "dev_q11", 502 "split": "dev", 503 "category": "vague recent lookup", 504 "namespace": "default", 505 "timeline_id": "t3-benchmark-scaffolding", 506 "text": "what benchmark-related work has been happening?", 507 "gold_memory_ids": [8, 9, 10, 17, 18, 22], 508 "no_hit": false, 509 "gold_answer": "benchmark logging, timeline-aware eval data, retrieval plumbing, and replacing the toy eval set", 510 "reference_time": 1776852000 511 }, 512 { 513 "query_id": "dev_q12", 514 "split": "dev", 515 "category": "multi-evidence query", 516 "namespace": "default", 517 "timeline_id": "t1-editor-and-embedding-split", 518 "text": "what setup is protecting us from embedding/model mismatch?", 519 "gold_memory_ids": [3, 7, 19], 520 "no_hit": false, 521 "gold_answer": "keeping embeddings separate on bge-m3 and separating services by role", 522 "reference_time": 1776852000 523 }, 524 { 525 "query_id": "dev_q13", 526 "split": "dev", 527 "category": "no-hit query", 528 "namespace": "default", 529 "timeline_id": "t8-better-eval-data", 530 "route_label": "tools", 531 "text": "what postgres version is klbr using right now?", 532 "gold_memory_ids": [], 533 "no_hit": true, 534 "reference_time": 1776852000 535 }, 536 { 537 "query_id": "dev_q14", 538 "split": "dev", 539 "category": "no-hit query", 540 "namespace": "default", 541 "timeline_id": "t6-harness-observability", 542 "route_label": "tools", 543 "text": "which browser am i using to inspect the benchmark output?", 544 "gold_memory_ids": [], 545 "no_hit": true, 546 "reference_time": 1776852000 547 }, 548 { 549 "query_id": "dev_q15", 550 "split": "dev", 551 "category": "no-hit query", 552 "namespace": "default", 553 "timeline_id": "t1-editor-and-embedding-split", 554 "route_label": "tools", 555 "text": "what gpu is running the reranker?", 556 "gold_memory_ids": [], 557 "no_hit": true, 558 "reference_time": 1776852000 559 }, 560 { 561 "query_id": "dev_q16", 562 "split": "dev", 563 "category": "no-hit query", 564 "namespace": "default", 565 "timeline_id": "t3-benchmark-scaffolding", 566 "route_label": "tools", 567 "text": "which postgres extension did i enable for the benchmark database?", 568 "gold_memory_ids": [], 569 "no_hit": true, 570 "reference_time": 1776852000 571 }, 572 { 573 "query_id": "test_q1", 574 "split": "dev", 575 "category": "exact recent event", 576 "namespace": "default", 577 "timeline_id": "t8-better-eval-data", 578 "text": "what changed between the live agent and the benchmark harness?", 579 "gold_memory_ids": [23], 580 "no_hit": false, 581 "gold_answer": "they now share the same time-windowed exact retrieval path", 582 "reference_time": 1776781800 583 }, 584 { 585 "query_id": "test_q2", 586 "split": "dev", 587 "category": "conflict / update question", 588 "namespace": "default", 589 "timeline_id": "t1-editor-and-embedding-split", 590 "text": "was i still using neovim after the editor switch?", 591 "gold_memory_ids": [2], 592 "no_hit": false, 593 "gold_answer": "no, the newer fact says the editor switched to zed", 594 "reference_time": 1776852000 595 }, 596 { 597 "query_id": "test_q3", 598 "split": "test", 599 "category": "temporally ambiguous query", 600 "namespace": "default", 601 "timeline_id": "t7-latency-and-abstain", 602 "text": "what happened around the latency fix?", 603 "gold_memory_ids": [20, 21], 604 "no_hit": false, 605 "gold_answer": "latency improved after moving to http and a remaining bug was the wrong abstention rule", 606 "reference_time": 1776688200 607 }, 608 { 609 "query_id": "test_q4", 610 "split": "test", 611 "category": "no-hit query", 612 "namespace": "default", 613 "timeline_id": "t9-current-endpoints", 614 "text": "which browser am i using to inspect the benchmark output?", 615 "gold_memory_ids": [], 616 "no_hit": true, 617 "reference_time": 1776852000 618 }, 619 { 620 "query_id": "test_q5", 621 "split": "test", 622 "category": "exact dated event", 623 "namespace": "default", 624 "timeline_id": "t4-reranker-plan", 625 "text": "what reranker did i decide to use on april 17?", 626 "gold_memory_ids": [13], 627 "no_hit": false, 628 "gold_answer": "bge-reranker-v2-m3", 629 "reference_time": 1776425400 630 }, 631 { 632 "query_id": "test_q6", 633 "split": "test", 634 "category": "vague recent lookup", 635 "namespace": "default", 636 "timeline_id": "t5-llama-server-migration", 637 "text": "what changed in model serving around llama-server?", 638 "gold_memory_ids": [14, 15, 16], 639 "no_hit": false, 640 "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings remained separate", 641 "reference_time": 1776852000 642 }, 643 { 644 "query_id": "test_q7", 645 "split": "dev", 646 "category": "multi-evidence query", 647 "namespace": "default", 648 "timeline_id": "t8-better-eval-data", 649 "text": "why do we need better benchmark data now?", 650 "gold_memory_ids": [18, 22], 651 "no_hit": false, 652 "gold_answer": "because the toy benchmark only proved plumbing and the eval set needs harder no-hit and ambiguity cases", 653 "reference_time": 1776852000 654 }, 655 { 656 "query_id": "test_q8", 657 "split": "dev", 658 "category": "recurring theme / pattern-like question", 659 "namespace": "default", 660 "timeline_id": "t6-harness-observability", 661 "text": "what sort of retrieval work keeps coming up?", 662 "gold_memory_ids": [10, 19, 23], 663 "no_hit": false, 664 "gold_answer": "retrieval plumbing, service separation, and aligning the live and benchmark retrieval path", 665 "reference_time": 1776852000 666 }, 667 { 668 "query_id": "test_q9", 669 "split": "test", 670 "category": "no-hit query", 671 "namespace": "default", 672 "timeline_id": "t9-current-endpoints", 673 "route_label": "tools", 674 "text": "what gpu am i using for the reranker?", 675 "required_tools": ["shell"], 676 "gold_memory_ids": [], 677 "no_hit": true, 678 "reference_time": 1776852000 679 }, 680 { 681 "query_id": "test_q10", 682 "split": "dev", 683 "category": "exact recent event", 684 "namespace": "default", 685 "timeline_id": "t6-harness-observability", 686 "text": "why did i add progress logging to the benchmark harness?", 687 "gold_memory_ids": [17], 688 "no_hit": false, 689 "gold_answer": "because silent hangs were confusing", 690 "reference_time": 1776614400 691 }, 692 { 693 "query_id": "test_q11", 694 "split": "test", 695 "category": "conflict / update question", 696 "namespace": "default", 697 "timeline_id": "t7-latency-and-abstain", 698 "text": "what was wrong with the benchmark after the reranker got fast?", 699 "gold_memory_ids": [21], 700 "no_hit": false, 701 "gold_answer": "it still abstained on a positive query because it used first-stage distance instead of rerank confidence", 702 "reference_time": 1776688200 703 }, 704 { 705 "query_id": "test_q12", 706 "split": "dev", 707 "category": "temporally ambiguous query", 708 "namespace": "default", 709 "timeline_id": "t1-editor-and-embedding-split", 710 "text": "what happened around the embedding split?", 711 "gold_memory_ids": [2, 3, 4], 712 "no_hit": false, 713 "gold_answer": "the editor switched to zed, embeddings moved to a separate bge-m3 server, and exact retrieval stayed the first benchmark step", 714 "reference_time": 1776160800 715 }, 716 { 717 "query_id": "dev_q17", 718 "split": "dev", 719 "category": "exact recent event", 720 "namespace": "default", 721 "timeline_id": "t10-support-calibration", 722 "text": "what support threshold looked best in the focused support sweep?", 723 "gold_memory_ids": [27], 724 "no_hit": false, 725 "gold_answer": "0.2", 726 "reference_time": 1776862800 727 }, 728 { 729 "query_id": "dev_q18", 730 "split": "dev", 731 "category": "multi-evidence query", 732 "namespace": "default", 733 "timeline_id": "t10-support-calibration", 734 "text": "what changed after adding the support score to the policy?", 735 "gold_memory_ids": [26, 27, 28], 736 "no_hit": false, 737 "gold_answer": "the policy added weighted lexical support, settled on score -6.0 / margin 0.0 / support 0.2, and removed no-hit false answers with some coverage loss", 738 "reference_time": 1776864600 739 }, 740 { 741 "query_id": "dev_q19", 742 "split": "dev", 743 "category": "recurring theme / pattern-like question", 744 "namespace": "default", 745 "timeline_id": "t11-support-vs-taxonomy", 746 "text": "what kind of calibration fix are we avoiding because it would be brittle?", 747 "gold_memory_ids": [29], 748 "no_hit": false, 749 "gold_answer": "many query-type guards", 750 "reference_time": 1776870000 751 }, 752 { 753 "query_id": "dev_q20", 754 "split": "dev", 755 "category": "multi-evidence query", 756 "namespace": "default", 757 "timeline_id": "t11-support-vs-taxonomy", 758 "text": "how should passive recall connect to the current calibration work?", 759 "gold_memory_ids": [30, 31], 760 "no_hit": false, 761 "gold_answer": "it should reuse the same generic support features instead of a separate taxonomy", 762 "reference_time": 1776870000 763 }, 764 { 765 "query_id": "dev_q21", 766 "split": "dev", 767 "category": "no-hit query", 768 "namespace": "default", 769 "timeline_id": "t11-support-vs-taxonomy", 770 "route_label": "tools", 771 "text": "which classifier model are we using to assign query types?", 772 "gold_memory_ids": [], 773 "no_hit": true, 774 "reference_time": 1776870000 775 }, 776 { 777 "query_id": "dev_q22", 778 "split": "dev", 779 "category": "conflict / update question", 780 "namespace": "default", 781 "timeline_id": "t10-support-calibration", 782 "text": "did the support-assisted sweep get coverage all the way back to the old 0.9167 balanced point?", 783 "gold_memory_ids": [28], 784 "no_hit": false, 785 "gold_answer": "no, coverage settled at 0.8333", 786 "reference_time": 1776864600 787 }, 788 { 789 "query_id": "dev_q23", 790 "split": "dev", 791 "category": "no-hit query", 792 "namespace": "default", 793 "timeline_id": "t8-better-eval-data", 794 "text": "what editor am i using for my dotfiles repo these days?", 795 "gold_memory_ids": [], 796 "no_hit": true, 797 "reference_time": 1776870000 798 }, 799 { 800 "query_id": "dev_q24", 801 "split": "dev", 802 "category": "no-hit query", 803 "namespace": "default", 804 "timeline_id": "t3-benchmark-scaffolding", 805 "text": "what's the p99 rerank latency on the current machine?", 806 "gold_memory_ids": [], 807 "no_hit": true, 808 "reference_time": 1776870000 809 }, 810 { 811 "query_id": "dev_q25", 812 "split": "dev", 813 "category": "no-hit query", 814 "namespace": "default", 815 "timeline_id": "t11-support-vs-taxonomy", 816 "text": "what's the commit hash for the router model we're using right now?", 817 "gold_memory_ids": [], 818 "no_hit": true, 819 "reference_time": 1776870000 820 }, 821 { 822 "query_id": "dev_q26", 823 "split": "dev", 824 "category": "no-hit query", 825 "namespace": "default", 826 "timeline_id": "t6-harness-observability", 827 "text": "what rust edition is klbr using?", 828 "gold_memory_ids": [], 829 "no_hit": true, 830 "reference_time": 1776870000 831 }, 832 { 833 "query_id": "dev_q27", 834 "split": "dev", 835 "category": "no-hit query", 836 "namespace": "default", 837 "timeline_id": "t10-support-calibration", 838 "text": "what's the websocket port the daemon is listening on?", 839 "gold_memory_ids": [], 840 "no_hit": true, 841 "reference_time": 1776870000 842 }, 843 { 844 "query_id": "dev_q28", 845 "split": "dev", 846 "category": "no-hit query", 847 "namespace": "default", 848 "timeline_id": "t8-better-eval-data", 849 "text": "what's the default db filename the agent writes to?", 850 "gold_memory_ids": [], 851 "no_hit": true, 852 "reference_time": 1776870000 853 }, 854 { 855 "query_id": "dev_q29", 856 "split": "dev", 857 "category": "no-hit query", 858 "namespace": "default", 859 "timeline_id": "t3-benchmark-scaffolding", 860 "text": "what's the current p95 embedding latency on this machine?", 861 "gold_memory_ids": [], 862 "no_hit": true, 863 "reference_time": 1776870000 864 }, 865 { 866 "query_id": "dev_q30", 867 "split": "dev", 868 "category": "no-hit query", 869 "namespace": "default", 870 "timeline_id": "t6-harness-observability", 871 "text": "what's the sqlite-vec extension version we're using?", 872 "gold_memory_ids": [], 873 "no_hit": true, 874 "reference_time": 1776870000 875 }, 876 { 877 "query_id": "dev_q31", 878 "split": "dev", 879 "category": "no-hit query", 880 "namespace": "default", 881 "timeline_id": "t11-support-vs-taxonomy", 882 "text": "what's the name of the nearest-neighbor index backend we're using in production?", 883 "gold_memory_ids": [], 884 "no_hit": true, 885 "reference_time": 1776870000 886 }, 887 { 888 "query_id": "dev_q32", 889 "split": "dev", 890 "category": "no-hit query", 891 "namespace": "default", 892 "timeline_id": "t2-error-propagation", 893 "text": "what's the default watermark token budget for compaction?", 894 "gold_memory_ids": [], 895 "no_hit": true, 896 "reference_time": 1776870000 897 }, 898 { 899 "query_id": "test_q13", 900 "split": "test", 901 "category": "exact recent event", 902 "namespace": "default", 903 "timeline_id": "t12-expand-dataset-first", 904 "text": "what are we doing before passive recall work?", 905 "gold_memory_ids": [32], 906 "no_hit": false, 907 "gold_answer": "expanding the eval dataset and rerunning the retrieval benchmarks", 908 "reference_time": 1776871800 909 }, 910 { 911 "query_id": "test_q14", 912 "split": "test", 913 "category": "multi-evidence query", 914 "namespace": "default", 915 "timeline_id": "t12-expand-dataset-first", 916 "text": "why are we expanding the dataset before treating thresholds as final?", 917 "gold_memory_ids": [32, 33, 34], 918 "no_hit": false, 919 "gold_answer": "because the eval set needed stronger coverage and the thresholds should stay provisional until the larger dataset is benchmarked again", 920 "reference_time": 1776875400 921 }, 922 { 923 "query_id": "test_q15", 924 "split": "test", 925 "category": "no-hit query", 926 "namespace": "default", 927 "timeline_id": "t12-expand-dataset-first", 928 "route_label": "tools", 929 "text": "which postgres extension are we enabling before passive recall?", 930 "required_tools": ["read_file"], 931 "gold_memory_ids": [], 932 "no_hit": true, 933 "reference_time": 1776875400 934 }, 935 { 936 "query_id": "test_q16", 937 "split": "test", 938 "category": "recurring theme / pattern-like question", 939 "namespace": "default", 940 "timeline_id": "t12-expand-dataset-first", 941 "text": "what kinds of eval cases did the starter set need more of?", 942 "gold_memory_ids": [33], 943 "no_hit": false, 944 "gold_answer": "no-hit, conflict/update, and multi-evidence cases", 945 "reference_time": 1776873600 946 }, 947 { 948 "query_id": "test_q17", 949 "split": "test", 950 "category": "conflict / update question", 951 "namespace": "default", 952 "timeline_id": "t12-expand-dataset-first", 953 "text": "are the current thresholds final now that we have one sweep?", 954 "gold_memory_ids": [34], 955 "no_hit": false, 956 "gold_answer": "no, they should remain provisional until the larger dataset is benchmarked again", 957 "reference_time": 1776875400 958 }, 959 { 960 "query_id": "test_q18", 961 "split": "test", 962 "category": "temporally ambiguous query", 963 "namespace": "default", 964 "timeline_id": "t12-expand-dataset-first", 965 "text": "what was the sequence around the dataset expansion decision?", 966 "gold_memory_ids": [32, 33, 34], 967 "no_hit": false, 968 "gold_answer": "the next step was to expand the dataset, it needed more hard cases, and thresholds stayed provisional until rerunning the larger benchmark", 969 "reference_time": 1776875400 970 } 971 ] 972}