{ "dataset_id": "internal-eval-starter-v2", "description": "Expanded internal eval set for KLBR MVP retrieval benchmarking. Timeline-aware, category-complete, and large enough to stress no-hit, support-calibration, and multi-evidence behavior more realistically.", "memories": [ { "memory_id": 1, "namespace": "default", "layer": "L1", "timeline_id": "t0-editor-before-switch", "text": "before the switch, the primary editor for klbr work was neovim.", "event_time": 1776153600, "status": "active", "source_ref": "session:t0:m1", "tags": ["project:klbr", "topic:editor"] }, { "memory_id": 2, "namespace": "default", "layer": "L1", "timeline_id": "t1-editor-and-embedding-split", "text": "switched the day-to-day klbr editor from neovim to zed.", "event_time": 1776157200, "status": "active", "source_ref": "session:t1:m1", "tags": ["project:klbr", "topic:editor"] }, { "memory_id": 3, "namespace": "default", "layer": "L1", "timeline_id": "t1-editor-and-embedding-split", "text": "split embeddings onto a separate llama-server at localhost 8002 using bge-m3 with dimension 1024.", "event_time": 1776159000, "status": "active", "source_ref": "session:t1:m2", "tags": ["project:klbr", "topic:embeddings"] }, { "memory_id": 4, "namespace": "default", "layer": "L1", "timeline_id": "t1-editor-and-embedding-split", "text": "the first benchmark slice should stay exact retrieval first before indexed search.", "event_time": 1776160800, "status": "active", "source_ref": "session:t1:m3", "tags": ["project:klbr", "topic:benchmarking"] }, { "memory_id": 5, "namespace": "default", "layer": "L1", "timeline_id": "t2-error-propagation", "text": "fixed the daemon swallowing llm stream errors and forwarded the error to the tui.", "event_time": 1776250800, "status": "active", "source_ref": "session:t2:m1", "tags": ["project:klbr", "topic:daemon", "topic:tui"] }, { "memory_id": 6, "namespace": "default", "layer": "L1", "timeline_id": "t2-error-propagation", "text": "compaction summaries are stored with the compaction_summary tag.", "event_time": 1776252600, "status": "active", "source_ref": "session:t2:m2", "tags": ["project:klbr", "topic:memory"] },
{ "memory_id": 7, "namespace": "default", "layer": "L1", "timeline_id": "t2-error-propagation", "text": "the chat llama-server did not expose embeddings, so embeddings had to stay on a separate process.", "event_time": 1776254400, "status": "active", "source_ref": "session:t2:m3", "tags": ["project:klbr", "topic:embeddings", "topic:llama-server"] }, { "memory_id": 8, "namespace": "default", "layer": "L1", "timeline_id": "t3-benchmark-scaffolding", "text": "the benchmark harness should log per-query candidate lists and stage latencies.", "event_time": 1776351600, "status": "active", "source_ref": "session:t3:m1", "tags": ["project:klbr", "topic:benchmarking"] }, { "memory_id": 9, "namespace": "default", "layer": "L1", "timeline_id": "t3-benchmark-scaffolding", "text": "internal eval should split by timeline or session, not by individual question only.", "event_time": 1776353400, "status": "active", "source_ref": "session:t3:m2", "tags": ["project:klbr", "topic:benchmarking", "topic:eval-data"] }, { "memory_id": 10, "namespace": "default", "layer": "L1", "timeline_id": "t3-benchmark-scaffolding", "text": "recent work has mostly been retrieval plumbing, instrumentation, and benchmark scaffolding.", "event_time": 1776355200, "status": "active", "source_ref": "session:t3:m3", "tags": ["project:klbr", "topic:benchmarking", "topic:retrieval"] }, { "memory_id": 11, "namespace": "default", "layer": "L1", "timeline_id": "t4-reranker-plan", "text": "planned a separate reranker endpoint on localhost 8003.", "event_time": 1776421800, "status": "active", "source_ref": "session:t4:m1", "tags": ["project:klbr", "topic:reranker"] }, { "memory_id": 12, "namespace": "default", "layer": "L1", "timeline_id": "t4-reranker-plan", "text": "the cpu-only reranker should use plain http, not https.", "event_time": 1776423600, "status":
"active", "source_ref": "session:t4:m2", "tags": ["project:klbr", "topic:reranker", "topic:networking"] }, { "memory_id": 13, "namespace": "default", "layer": "L1", "timeline_id": "t4-reranker-plan", "text": "the reranker choice for the mvp is bge-reranker-v2-m3.", "event_time": 1776425400, "status": "active", "source_ref": "session:t4:m3", "tags": ["project:klbr", "topic:reranker"] }, { "memory_id": 14, "namespace": "default", "layer": "L1", "timeline_id": "t5-llama-server-migration", "text": "moved chat model serving from lm studio to llama-server.", "event_time": 1776517200, "status": "active", "source_ref": "session:t5:m1", "tags": ["project:klbr", "topic:llama-server"] }, { "memory_id": 15, "namespace": "default", "layer": "L1", "timeline_id": "t5-llama-server-migration", "text": "around the migration, auto model discovery was added for chat and embeddings.", "event_time": 1776519000, "status": "active", "source_ref": "session:t5:m2", "tags": ["project:klbr", "topic:llama-server", "topic:models"] }, { "memory_id": 16, "namespace": "default", "layer": "L1", "timeline_id": "t5-llama-server-migration", "text": "the separate embedding server still uses bge-m3 rather than whatever chat model is currently loaded.", "event_time": 1776520800, "status": "active", "source_ref": "session:t5:m3", "tags": ["project:klbr", "topic:embeddings"] }, { "memory_id": 17, "namespace": "default", "layer": "L1", "timeline_id": "t6-harness-observability", "text": "added progress logging to the benchmark harness because silent hangs were confusing.", "event_time": 1776614400, "status": "active", "source_ref": "session:t6:m1", "tags": ["project:klbr", "topic:benchmarking", "topic:logging"] }, { "memory_id": 18, "namespace": "default", "layer": "L1", "timeline_id": "t6-harness-observability", "text": "the first tiny benchmark only proved the plumbing and was too small for calibration.", "event_time": 1776616200, "status": "active", "source_ref": "session:t6:m2", "tags": ["project:klbr", 
"topic:benchmarking", "topic:calibration"] }, { "memory_id": 19, "namespace": "default", "layer": "L1", "timeline_id": "t6-harness-observability", "text": "a recurring infra pattern is to separate chat, embedding, and rerank services by role to avoid model mismatch.", "event_time": 1776618000, "status": "active", "source_ref": "session:t6:m3", "tags": ["project:klbr", "topic:infra", "topic:retrieval"] }, { "memory_id": 20, "namespace": "default", "layer": "L1", "timeline_id": "t7-latency-and-abstain", "text": "rerank latency dropped sharply after switching the reranker endpoint to plain http and using a warmed model.", "event_time": 1776686400, "status": "active", "source_ref": "session:t7:m1", "tags": ["project:klbr", "topic:reranker", "topic:latency"] }, { "memory_id": 21, "namespace": "default", "layer": "L1", "timeline_id": "t7-latency-and-abstain", "text": "the benchmark still abstained wrongly on one positive query because abstention used first-stage distance instead of rerank confidence.", "event_time": 1776688200, "status": "active", "source_ref": "session:t7:m2", "tags": ["project:klbr", "topic:abstention", "topic:reranker"] }, { "memory_id": 22, "namespace": "default", "layer": "L1", "timeline_id": "t8-better-eval-data", "text": "a better internal eval set should include no-hit, temporal ambiguity, and multi-evidence cases.", "event_time": 1776780000, "status": "active", "source_ref": "session:t8:m1", "tags": ["project:klbr", "topic:eval-data"] }, { "memory_id": 23, "namespace": "default", "layer": "L1", "timeline_id": "t8-better-eval-data", "text": "the live agent and the benchmark harness now share the same time-windowed exact retrieval path.", "event_time": 1776781800, "status": "active", "source_ref": "session:t8:m2", "tags": ["project:klbr", "topic:retrieval"] }, { "memory_id": 24, "namespace": "default", "layer": "L1", "timeline_id": "t9-current-endpoints", "text": "the current reranker endpoint is http://localhost:8003.", "event_time": 1776852000, 
"status": "active", "source_ref": "session:t9:m1", "tags": ["project:klbr", "topic:reranker", "topic:networking"] }, { "memory_id": 25, "namespace": "default", "layer": "L1", "timeline_id": "t9-current-endpoints", "text": "the current embedding endpoint is http://localhost:8002.", "event_time": 1776853800, "status": "active", "source_ref": "session:t9:m2", "tags": ["project:klbr", "topic:embeddings", "topic:networking"] }, { "memory_id": 26, "namespace": "default", "layer": "L1", "timeline_id": "t10-support-calibration", "text": "added a generic lexical support score based on weighted overlap between salient query terms and the top memory text.", "event_time": 1776861000, "status": "active", "source_ref": "session:t10:m1", "tags": ["project:klbr", "topic:calibration", "topic:support"] }, { "memory_id": 27, "namespace": "default", "layer": "L1", "timeline_id": "t10-support-calibration", "text": "a focused support sweep found a useful operating point at rerank score -6.0, margin 0.0, and support 0.2.", "event_time": 1776862800, "status": "active", "source_ref": "session:t10:m2", "tags": ["project:klbr", "topic:calibration", "topic:support"] }, { "memory_id": 28, "namespace": "default", "layer": "L1", "timeline_id": "t10-support-calibration", "text": "the support-assisted operating point removed no-hit false answers on the dev slice, but answerable coverage settled at 0.8333 instead of the old 0.9167 balanced point.", "event_time": 1776864600, "status": "active", "source_ref": "session:t10:m3", "tags": ["project:klbr", "topic:calibration", "topic:support"] }, { "memory_id": 29, "namespace": "default", "layer": "L1", "timeline_id": "t11-support-vs-taxonomy", "text": "decided against adding many brittle query-type guards for calibration.", "event_time": 1776866400, "status": "active", "source_ref": "session:t11:m1", "tags": ["project:klbr", "topic:calibration", "topic:query-types"] }, { "memory_id": 30, "namespace": "default", "layer": "L1", "timeline_id": 
"t11-support-vs-taxonomy", "text": "the better direction is generic evidence support that works for both direct questions and non-question inputs.", "event_time": 1776868200, "status": "active", "source_ref": "session:t11:m2", "tags": ["project:klbr", "topic:support", "topic:passive-recall"] }, { "memory_id": 31, "namespace": "default", "layer": "L1", "timeline_id": "t11-support-vs-taxonomy", "text": "passive recall should reuse the same support features instead of introducing a separate query taxonomy.", "event_time": 1776870000, "status": "active", "source_ref": "session:t11:m3", "tags": ["project:klbr", "topic:support", "topic:passive-recall"] }, { "memory_id": 32, "namespace": "default", "layer": "L1", "timeline_id": "t12-expand-dataset-first", "text": "before passive recall work, the next step was to expand the eval dataset and rerun the retrieval benchmarks.", "event_time": 1776871800, "status": "active", "source_ref": "session:t12:m1", "tags": ["project:klbr", "topic:eval-data", "topic:benchmarking"] }, { "memory_id": 33, "namespace": "default", "layer": "L1", "timeline_id": "t12-expand-dataset-first", "text": "the starter eval set needed stronger coverage in no-hit, conflict/update, and multi-evidence cases.", "event_time": 1776873600, "status": "active", "source_ref": "session:t12:m2", "tags": ["project:klbr", "topic:eval-data", "topic:calibration"] }, { "memory_id": 34, "namespace": "default", "layer": "L1", "timeline_id": "t12-expand-dataset-first", "text": "the current thresholds should stay provisional until the larger dataset is benchmarked again.", "event_time": 1776875400, "status": "active", "source_ref": "session:t12:m3", "tags": ["project:klbr", "topic:calibration", "topic:eval-data"] } ], "queries": [ { "query_id": "dev_q1", "split": "test", "category": "exact recent event", "namespace": "default", "timeline_id": "t9-current-endpoints", "text": "what is the reranker endpoint right now?", "gold_memory_ids": [24], "no_hit": false, "gold_answer": 
"http://localhost:8003", "reference_time": 1776852000 }, { "query_id": "dev_q2", "split": "test", "category": "exact dated event", "namespace": "default", "timeline_id": "t5-llama-server-migration", "text": "what did i switch to on april 18 when i moved off lm studio?", "gold_memory_ids": [14], "no_hit": false, "gold_answer": "llama-server", "reference_time": 1776517200 }, { "query_id": "dev_q3", "split": "dev", "category": "vague recent lookup", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what have i been working on lately?", "gold_memory_ids": [10, 22, 23], "no_hit": false, "gold_answer": "better eval data and the shared retrieval path", "reference_time": 1776852000 }, { "query_id": "dev_q4", "split": "dev", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what kind of infra pattern do i keep using in this project?", "gold_memory_ids": [19], "no_hit": false, "gold_answer": "splitting chat, embedding, and rerank services by role", "reference_time": 1776780000 }, { "query_id": "dev_q5", "split": "dev", "category": "conflict / update question", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "what editor am i using now for klbr?", "gold_memory_ids": [2], "no_hit": false, "gold_answer": "zed", "reference_time": 1776852000 }, { "query_id": "dev_q6", "split": "test", "category": "conflict / update question", "namespace": "default", "timeline_id": "t7-latency-and-abstain", "text": "is the reranker endpoint using https now?", "gold_memory_ids": [12, 24], "no_hit": false, "gold_answer": "no, it should use plain http and the current endpoint is http://localhost:8003", "reference_time": 1776852000 }, { "query_id": "dev_q7", "split": "test", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": "t5-llama-server-migration", "text": "what happened around the migration?", "gold_memory_ids": [14, 15, 16], 
"no_hit": false, "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings stayed separate on bge-m3", "reference_time": 1776519000 }, { "query_id": "dev_q8", "split": "test", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t7-latency-and-abstain", "text": "why did the benchmark latency improve?", "gold_memory_ids": [20, 24], "no_hit": false, "gold_answer": "because the reranker endpoint switched to plain http and the model was warm", "reference_time": 1776852000 }, { "query_id": "dev_q9", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t9-current-endpoints", "text": "what postgres version am i running for klbr?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q10", "split": "dev", "category": "exact dated event", "namespace": "default", "timeline_id": "t2-error-propagation", "text": "what was fixed on april 15 when the stream failed?", "gold_memory_ids": [5], "no_hit": false, "gold_answer": "the daemon stopped swallowing llm stream errors and forwarded them to the tui", "reference_time": 1776250800 }, { "query_id": "dev_q11", "split": "dev", "category": "vague recent lookup", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "text": "what benchmark-related work has been happening?", "gold_memory_ids": [8, 9, 10, 17, 18, 22], "no_hit": false, "gold_answer": "benchmark logging, timeline-aware eval data, retrieval plumbing, and replacing the toy eval set", "reference_time": 1776852000 }, { "query_id": "dev_q12", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "what setup is protecting us from embedding/model mismatch?", "gold_memory_ids": [3, 7, 19], "no_hit": false, "gold_answer": "keeping embeddings separate on bge-m3 and separating services by role", "reference_time": 1776852000 }, { "query_id": "dev_q13", "split": "dev", 
"category": "no-hit query", "namespace": "default", "timeline_id": "t8-better-eval-data", "route_label": "tools", "text": "what postgres version is klbr using right now?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q14", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t6-harness-observability", "route_label": "tools", "text": "which browser am i using to inspect the benchmark output?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q15", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "route_label": "tools", "text": "what gpu is running the reranker?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q16", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "route_label": "tools", "text": "which postgres extension did i enable for the benchmark database?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "test_q1", "split": "dev", "category": "exact recent event", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what changed between the live agent and the benchmark harness?", "gold_memory_ids": [23], "no_hit": false, "gold_answer": "they now share the same time-windowed exact retrieval path", "reference_time": 1776781800 }, { "query_id": "test_q2", "split": "dev", "category": "conflict / update question", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "was i still using neovim after the editor switch?", "gold_memory_ids": [2], "no_hit": false, "gold_answer": "no, the newer fact says the editor switched to zed", "reference_time": 1776852000 }, { "query_id": "test_q3", "split": "test", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": 
"t7-latency-and-abstain", "text": "what happened around the latency fix?", "gold_memory_ids": [20, 21], "no_hit": false, "gold_answer": "latency improved after moving to http and a remaining bug was the wrong abstention rule", "reference_time": 1776688200 }, { "query_id": "test_q4", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t9-current-endpoints", "text": "which browser am i using to inspect the benchmark output?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "test_q5", "split": "test", "category": "exact dated event", "namespace": "default", "timeline_id": "t4-reranker-plan", "text": "what reranker did i decide to use on april 17?", "gold_memory_ids": [13], "no_hit": false, "gold_answer": "bge-reranker-v2-m3", "reference_time": 1776425400 }, { "query_id": "test_q6", "split": "test", "category": "vague recent lookup", "namespace": "default", "timeline_id": "t5-llama-server-migration", "text": "what changed in model serving around llama-server?", "gold_memory_ids": [14, 15, 16], "no_hit": false, "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings remained separate", "reference_time": 1776852000 }, { "query_id": "test_q7", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "why do we need better benchmark data now?", "gold_memory_ids": [18, 22], "no_hit": false, "gold_answer": "because the toy benchmark only proved plumbing and the eval set needs harder no-hit and ambiguity cases", "reference_time": 1776852000 }, { "query_id": "test_q8", "split": "dev", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what sort of retrieval work keeps coming up?", "gold_memory_ids": [10, 19, 23], "no_hit": false, "gold_answer": "retrieval plumbing, service separation, and aligning the live and benchmark retrieval path", "reference_time": 1776852000 },
{ "query_id": "test_q9", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t9-current-endpoints", "route_label": "tools", "text": "what gpu am i using for the reranker?", "required_tools": ["shell"], "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "test_q10", "split": "dev", "category": "exact recent event", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "why did i add progress logging to the benchmark harness?", "gold_memory_ids": [17], "no_hit": false, "gold_answer": "because silent hangs were confusing", "reference_time": 1776614400 }, { "query_id": "test_q11", "split": "test", "category": "conflict / update question", "namespace": "default", "timeline_id": "t7-latency-and-abstain", "text": "what was wrong with the benchmark after the reranker got fast?", "gold_memory_ids": [21], "no_hit": false, "gold_answer": "it still abstained on a positive query because it used first-stage distance instead of rerank confidence", "reference_time": 1776688200 }, { "query_id": "test_q12", "split": "dev", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "what happened around the embedding split?", "gold_memory_ids": [2, 3, 4], "no_hit": false, "gold_answer": "the editor switched to zed, embeddings moved to a separate bge-m3 server, and exact retrieval stayed the first benchmark step", "reference_time": 1776160800 }, { "query_id": "dev_q17", "split": "dev", "category": "exact recent event", "namespace": "default", "timeline_id": "t10-support-calibration", "text": "what support threshold looked best in the focused support sweep?", "gold_memory_ids": [27], "no_hit": false, "gold_answer": "0.2", "reference_time": 1776862800 }, { "query_id": "dev_q18", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t10-support-calibration",
"text": "what changed after adding the support score to the policy?", "gold_memory_ids": [26, 27, 28], "no_hit": false, "gold_answer": "the policy added weighted lexical support, settled on score -6.0 / margin 0.0 / support 0.2, and removed no-hit false answers with some coverage loss", "reference_time": 1776864600 }, { "query_id": "dev_q19", "split": "dev", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "what kind of calibration fix are we avoiding because it would be brittle?", "gold_memory_ids": [29], "no_hit": false, "gold_answer": "many query-type guards", "reference_time": 1776870000 }, { "query_id": "dev_q20", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "how should passive recall connect to the current calibration work?", "gold_memory_ids": [30, 31], "no_hit": false, "gold_answer": "it should reuse the same generic support features instead of a separate taxonomy", "reference_time": 1776870000 }, { "query_id": "dev_q21", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "route_label": "tools", "text": "which classifier model are we using to assign query types?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q22", "split": "dev", "category": "conflict / update question", "namespace": "default", "timeline_id": "t10-support-calibration", "text": "did the support-assisted sweep get coverage all the way back to the old 0.9167 balanced point?", "gold_memory_ids": [28], "no_hit": false, "gold_answer": "no, coverage settled at 0.8333", "reference_time": 1776864600 }, { "query_id": "dev_q23", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what editor am i using for my dotfiles repo these days?", "gold_memory_ids": [], "no_hit": true, 
"reference_time": 1776870000 }, { "query_id": "dev_q24", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "text": "what's the p99 rerank latency on the current machine?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q25", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "what's the commit hash for the router model we're using right now?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q26", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what rust edition is klbr using?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q27", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t10-support-calibration", "text": "what's the websocket port the daemon is listening on?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q28", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what's the default db filename the agent writes to?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q29", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "text": "what's the current p95 embedding latency on this machine?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q30", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what's the sqlite-vec extension version we're using?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q31", "split": "dev", "category": "no-hit query", 
"namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "what's the name of the nearest-neighbor index backend we're using in production?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q32", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t2-error-propagation", "text": "what's the default watermark token budget for compaction?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "test_q13", "split": "test", "category": "exact recent event", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "what are we doing before passive recall work?", "gold_memory_ids": [32], "no_hit": false, "gold_answer": "expanding the eval dataset and rerunning the retrieval benchmarks", "reference_time": 1776871800 }, { "query_id": "test_q14", "split": "test", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "why are we expanding the dataset before treating thresholds as final?", "gold_memory_ids": [32, 33, 34], "no_hit": false, "gold_answer": "because the eval set needed stronger coverage and the thresholds should stay provisional until the larger dataset is benchmarked again", "reference_time": 1776875400 }, { "query_id": "test_q15", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "route_label": "tools", "text": "which postgres extension are we enabling before passive recall?", "required_tools": ["read_file"], "gold_memory_ids": [], "no_hit": true, "reference_time": 1776875400 }, { "query_id": "test_q16", "split": "test", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "what kinds of eval cases did the starter set need more of?", "gold_memory_ids": [33], "no_hit": false, "gold_answer": "no-hit, conflict/update, and multi-evidence cases", "reference_time": 1776873600 },
{ "query_id": "test_q17", "split": "test", "category": "conflict / update question", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "are the current thresholds final now that we have one sweep?", "gold_memory_ids": [34], "no_hit": false, "gold_answer": "no, they should remain provisional until the larger dataset is benchmarked again", "reference_time": 1776875400 }, { "query_id": "test_q18", "split": "test", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "what was the sequence around the dataset expansion decision?", "gold_memory_ids": [32, 33, 34], "no_hit": false, "gold_answer": "the next step was to expand the dataset, it needed more hard cases, and thresholds stayed provisional until rerunning the larger benchmark", "reference_time": 1776875400 } ] }