{ "dataset_id": "internal-eval-starter-v2", "description": "Expanded internal eval set for KLBR MVP retrieval benchmarking. Timeline-aware, category-complete, and large enough to stress no-hit, support-calibration, and multi-evidence behavior more realistically.", "memories": [ { "memory_id": 1, "namespace": "default", "layer": "L1", "timeline_id": "t0-editor-before-switch", "text": "before the switch, the primary editor for klbr work was neovim.", "event_time": 1776153600, "status": "active", "source_ref": "session:t0:m1", "tags": ["project:klbr", "topic:editor"] }, { "memory_id": 2, "namespace": "default", "layer": "L1", "timeline_id": "t1-editor-and-embedding-split", "text": "switched the day-to-day klbr editor from neovim to zed.", "event_time": 1776157200, "status": "active", "source_ref": "session:t1:m1", "tags": ["project:klbr", "topic:editor"] }, { "memory_id": 3, "namespace": "default", "layer": "L1", "timeline_id": "t1-editor-and-embedding-split", "text": "split embeddings onto a separate llama-server at localhost 8002 using bge-m3 with dimension 1024.", "event_time": 1776159000, "status": "active", "source_ref": "session:t1:m2", "tags": ["project:klbr", "topic:embeddings"] }, { "memory_id": 4, "namespace": "default", "layer": "L1", "timeline_id": "t1-editor-and-embedding-split", "text": "the first benchmark slice should stay exact retrieval first before indexed search.", "event_time": 1776160800, "status": "active", "source_ref": "session:t1:m3", "tags": ["project:klbr", "topic:benchmarking"] }, { "memory_id": 5, "namespace": "default", "layer": "L1", "timeline_id": "t2-error-propagation", "text": "fixed the daemon swallowing llm stream errors and forwarded the error to the tui.", "event_time": 1776250800, "status": "active", "source_ref": "session:t2:m1", "tags": ["project:klbr", "topic:daemon", "topic:tui"] }, { "memory_id": 6, "namespace": "default", "layer": "L1", "timeline_id": "t2-error-propagation", "text": "compaction summaries are stored with the compaction_summary tag.", "event_time": 1776252600, "status": "active", "source_ref": "session:t2:m2", "tags": ["project:klbr", "topic:memory"] },
{ "memory_id": 7, "namespace": "default", "layer": "L1", "timeline_id": "t2-error-propagation", "text": "the chat llama-server did not expose embeddings, so embeddings had to stay on a separate process.", "event_time": 1776254400, "status": "active", "source_ref": "session:t2:m3", "tags": ["project:klbr", "topic:embeddings", "topic:llama-server"] }, { "memory_id": 8, "namespace": "default", "layer": "L1", "timeline_id": "t3-benchmark-scaffolding", "text": "the benchmark harness should log per-query candidate lists and stage latencies.", "event_time": 1776351600, "status": "active", "source_ref": "session:t3:m1", "tags": ["project:klbr", "topic:benchmarking"] }, { "memory_id": 9, "namespace": "default", "layer": "L1", "timeline_id": "t3-benchmark-scaffolding", "text": "internal eval should split by timeline or session, not by individual question only.", "event_time": 1776353400, "status": "active", "source_ref": "session:t3:m2", "tags": ["project:klbr", "topic:benchmarking", "topic:eval-data"] }, { "memory_id": 10, "namespace": "default", "layer": "L1", "timeline_id": "t3-benchmark-scaffolding", "text": "recent work has mostly been retrieval plumbing, instrumentation, and benchmark scaffolding.", "event_time": 1776355200, "status": "active", "source_ref": "session:t3:m3", "tags": ["project:klbr", "topic:benchmarking", "topic:retrieval"] }, { "memory_id": 11, "namespace": "default", "layer": "L1", "timeline_id": "t4-reranker-plan", "text": "planned a separate reranker endpoint on localhost 8003.", "event_time": 1776421800, "status": "active", "source_ref": "session:t4:m1", "tags": ["project:klbr", "topic:reranker"] }, { "memory_id": 12, "namespace": "default", "layer": "L1", "timeline_id": "t4-reranker-plan", "text": "the cpu-only reranker should use plain http, not https.", "event_time": 1776423600, "status":
"active", "source_ref": "session:t4:m2", "tags": ["project:klbr", "topic:reranker", "topic:networking"] }, { "memory_id": 13, "namespace": "default", "layer": "L1", "timeline_id": "t4-reranker-plan", "text": "the reranker choice for the mvp is bge-reranker-v2-m3.", "event_time": 1776425400, "status": "active", "source_ref": "session:t4:m3", "tags": ["project:klbr", "topic:reranker"] }, { "memory_id": 14, "namespace": "default", "layer": "L1", "timeline_id": "t5-llama-server-migration", "text": "moved chat model serving from lm studio to llama-server.", "event_time": 1776517200, "status": "active", "source_ref": "session:t5:m1", "tags": ["project:klbr", "topic:llama-server"] }, { "memory_id": 15, "namespace": "default", "layer": "L1", "timeline_id": "t5-llama-server-migration", "text": "around the migration, auto model discovery was added for chat and embeddings.", "event_time": 1776519000, "status": "active", "source_ref": "session:t5:m2", "tags": ["project:klbr", "topic:llama-server", "topic:models"] }, { "memory_id": 16, "namespace": "default", "layer": "L1", "timeline_id": "t5-llama-server-migration", "text": "the separate embedding server still uses bge-m3 rather than whatever chat model is currently loaded.", "event_time": 1776520800, "status": "active", "source_ref": "session:t5:m3", "tags": ["project:klbr", "topic:embeddings"] }, { "memory_id": 17, "namespace": "default", "layer": "L1", "timeline_id": "t6-harness-observability", "text": "added progress logging to the benchmark harness because silent hangs were confusing.", "event_time": 1776614400, "status": "active", "source_ref": "session:t6:m1", "tags": ["project:klbr", "topic:benchmarking", "topic:logging"] }, { "memory_id": 18, "namespace": "default", "layer": "L1", "timeline_id": "t6-harness-observability", "text": "the first tiny benchmark only proved the plumbing and was too small for calibration.", "event_time": 1776616200, "status": "active", "source_ref": "session:t6:m2", "tags": ["project:klbr", 
"topic:benchmarking", "topic:calibration"] }, { "memory_id": 19, "namespace": "default", "layer": "L1", "timeline_id": "t6-harness-observability", "text": "a recurring infra pattern is to separate chat, embedding, and rerank services by role to avoid model mismatch.", "event_time": 1776618000, "status": "active", "source_ref": "session:t6:m3", "tags": ["project:klbr", "topic:infra", "topic:retrieval"] }, { "memory_id": 20, "namespace": "default", "layer": "L1", "timeline_id": "t7-latency-and-abstain", "text": "rerank latency dropped sharply after switching the reranker endpoint to plain http and using a warmed model.", "event_time": 1776686400, "status": "active", "source_ref": "session:t7:m1", "tags": ["project:klbr", "topic:reranker", "topic:latency"] }, { "memory_id": 21, "namespace": "default", "layer": "L1", "timeline_id": "t7-latency-and-abstain", "text": "the benchmark still abstained wrongly on one positive query because abstention used first-stage distance instead of rerank confidence.", "event_time": 1776688200, "status": "active", "source_ref": "session:t7:m2", "tags": ["project:klbr", "topic:abstention", "topic:reranker"] }, { "memory_id": 22, "namespace": "default", "layer": "L1", "timeline_id": "t8-better-eval-data", "text": "a better internal eval set should include no-hit, temporal ambiguity, and multi-evidence cases.", "event_time": 1776780000, "status": "active", "source_ref": "session:t8:m1", "tags": ["project:klbr", "topic:eval-data"] }, { "memory_id": 23, "namespace": "default", "layer": "L1", "timeline_id": "t8-better-eval-data", "text": "the live agent and the benchmark harness now share the same time-windowed exact retrieval path.", "event_time": 1776781800, "status": "active", "source_ref": "session:t8:m2", "tags": ["project:klbr", "topic:retrieval"] }, { "memory_id": 24, "namespace": "default", "layer": "L1", "timeline_id": "t9-current-endpoints", "text": "the current reranker endpoint is http://localhost:8003.", "event_time": 1776852000, 
"status": "active", "source_ref": "session:t9:m1", "tags": ["project:klbr", "topic:reranker", "topic:networking"] }, { "memory_id": 25, "namespace": "default", "layer": "L1", "timeline_id": "t9-current-endpoints", "text": "the current embedding endpoint is http://localhost:8002.", "event_time": 1776853800, "status": "active", "source_ref": "session:t9:m2", "tags": ["project:klbr", "topic:embeddings", "topic:networking"] }, { "memory_id": 26, "namespace": "default", "layer": "L1", "timeline_id": "t10-support-calibration", "text": "added a generic lexical support score based on weighted overlap between salient query terms and the top memory text.", "event_time": 1776861000, "status": "active", "source_ref": "session:t10:m1", "tags": ["project:klbr", "topic:calibration", "topic:support"] }, { "memory_id": 27, "namespace": "default", "layer": "L1", "timeline_id": "t10-support-calibration", "text": "a focused support sweep found a useful operating point at rerank score -6.0, margin 0.0, and support 0.2.", "event_time": 1776862800, "status": "active", "source_ref": "session:t10:m2", "tags": ["project:klbr", "topic:calibration", "topic:support"] }, { "memory_id": 28, "namespace": "default", "layer": "L1", "timeline_id": "t10-support-calibration", "text": "the support-assisted operating point removed no-hit false answers on the dev slice, but answerable coverage settled at 0.8333 instead of the old 0.9167 balanced point.", "event_time": 1776864600, "status": "active", "source_ref": "session:t10:m3", "tags": ["project:klbr", "topic:calibration", "topic:support"] }, { "memory_id": 29, "namespace": "default", "layer": "L1", "timeline_id": "t11-support-vs-taxonomy", "text": "decided against adding many brittle query-type guards for calibration.", "event_time": 1776866400, "status": "active", "source_ref": "session:t11:m1", "tags": ["project:klbr", "topic:calibration", "topic:query-types"] }, { "memory_id": 30, "namespace": "default", "layer": "L1", "timeline_id": 
"t11-support-vs-taxonomy", "text": "the better direction is generic evidence support that works for both direct questions and non-question inputs.", "event_time": 1776868200, "status": "active", "source_ref": "session:t11:m2", "tags": ["project:klbr", "topic:support", "topic:passive-recall"] }, { "memory_id": 31, "namespace": "default", "layer": "L1", "timeline_id": "t11-support-vs-taxonomy", "text": "passive recall should reuse the same support features instead of introducing a separate query taxonomy.", "event_time": 1776870000, "status": "active", "source_ref": "session:t11:m3", "tags": ["project:klbr", "topic:support", "topic:passive-recall"] }, { "memory_id": 32, "namespace": "default", "layer": "L1", "timeline_id": "t12-expand-dataset-first", "text": "before passive recall work, the next step was to expand the eval dataset and rerun the retrieval benchmarks.", "event_time": 1776871800, "status": "active", "source_ref": "session:t12:m1", "tags": ["project:klbr", "topic:eval-data", "topic:benchmarking"] }, { "memory_id": 33, "namespace": "default", "layer": "L1", "timeline_id": "t12-expand-dataset-first", "text": "the starter eval set needed stronger coverage in no-hit, conflict/update, and multi-evidence cases.", "event_time": 1776873600, "status": "active", "source_ref": "session:t12:m2", "tags": ["project:klbr", "topic:eval-data", "topic:calibration"] }, { "memory_id": 34, "namespace": "default", "layer": "L1", "timeline_id": "t12-expand-dataset-first", "text": "the current thresholds should stay provisional until the larger dataset is benchmarked again.", "event_time": 1776875400, "status": "active", "source_ref": "session:t12:m3", "tags": ["project:klbr", "topic:calibration", "topic:eval-data"] } ], "queries": [ { "query_id": "dev_q1", "split": "test", "category": "exact recent event", "namespace": "default", "timeline_id": "t9-current-endpoints", "text": "what is the reranker endpoint right now?", "gold_memory_ids": [24], "no_hit": false, "gold_answer": 
"http://localhost:8003", "reference_time": 1776852000 }, { "query_id": "dev_q2", "split": "test", "category": "exact dated event", "namespace": "default", "timeline_id": "t5-llama-server-migration", "text": "what did i switch to on april 18 when i moved off lm studio?", "gold_memory_ids": [14], "no_hit": false, "gold_answer": "llama-server", "reference_time": 1776517200 }, { "query_id": "dev_q3", "split": "dev", "category": "vague recent lookup", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what have i been working on lately?", "gold_memory_ids": [10, 22, 23], "no_hit": false, "gold_answer": "better eval data and the shared retrieval path", "reference_time": 1776852000 }, { "query_id": "dev_q4", "split": "dev", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what kind of infra pattern do i keep using in this project?", "gold_memory_ids": [19], "no_hit": false, "gold_answer": "splitting chat, embedding, and rerank services by role", "reference_time": 1776780000 }, { "query_id": "dev_q5", "split": "dev", "category": "conflict / update question", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "what editor am i using now for klbr?", "gold_memory_ids": [2], "no_hit": false, "gold_answer": "zed", "reference_time": 1776852000 }, { "query_id": "dev_q6", "split": "test", "category": "conflict / update question", "namespace": "default", "timeline_id": "t7-latency-and-abstain", "text": "is the reranker endpoint using https now?", "gold_memory_ids": [12, 24], "no_hit": false, "gold_answer": "no, it should use plain http and the current endpoint is http://localhost:8003", "reference_time": 1776852000 }, { "query_id": "dev_q7", "split": "test", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": "t5-llama-server-migration", "text": "what happened around the migration?", "gold_memory_ids": [14, 15, 16], 
"no_hit": false, "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings stayed separate on bge-m3", "reference_time": 1776519000 }, { "query_id": "dev_q8", "split": "test", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t7-latency-and-abstain", "text": "why did the benchmark latency improve?", "gold_memory_ids": [20, 24], "no_hit": false, "gold_answer": "because the reranker endpoint switched to plain http and the model was warm", "reference_time": 1776852000 }, { "query_id": "dev_q9", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t9-current-endpoints", "text": "what postgres version am i running for klbr?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q10", "split": "dev", "category": "exact dated event", "namespace": "default", "timeline_id": "t2-error-propagation", "text": "what was fixed on april 15 when the stream failed?", "gold_memory_ids": [5], "no_hit": false, "gold_answer": "the daemon stopped swallowing llm stream errors and forwarded them to the tui", "reference_time": 1776250800 }, { "query_id": "dev_q11", "split": "dev", "category": "vague recent lookup", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "text": "what benchmark-related work has been happening?", "gold_memory_ids": [8, 9, 10, 17, 18, 22], "no_hit": false, "gold_answer": "benchmark logging, timeline-aware eval data, retrieval plumbing, and replacing the toy eval set", "reference_time": 1776852000 }, { "query_id": "dev_q12", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "what setup is protecting us from embedding/model mismatch?", "gold_memory_ids": [3, 7, 19], "no_hit": false, "gold_answer": "keeping embeddings separate on bge-m3 and separating services by role", "reference_time": 1776852000 }, { "query_id": "dev_q13", "split": "dev", 
"category": "no-hit query", "namespace": "default", "timeline_id": "t8-better-eval-data", "route_label": "tools", "text": "what postgres version is klbr using right now?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q14", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t6-harness-observability", "route_label": "tools", "text": "which browser am i using to inspect the benchmark output?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q15", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "route_label": "tools", "text": "what gpu is running the reranker?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "dev_q16", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "route_label": "tools", "text": "which postgres extension did i enable for the benchmark database?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "test_q1", "split": "dev", "category": "exact recent event", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what changed between the live agent and the benchmark harness?", "gold_memory_ids": [23], "no_hit": false, "gold_answer": "they now share the same time-windowed exact retrieval path", "reference_time": 1776781800 }, { "query_id": "test_q2", "split": "dev", "category": "conflict / update question", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "was i still using neovim after the editor switch?", "gold_memory_ids": [2], "no_hit": false, "gold_answer": "no, the newer fact says the editor switched to zed", "reference_time": 1776852000 }, { "query_id": "test_q3", "split": "test", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": 
"t7-latency-and-abstain", "text": "what happened around the latency fix?", "gold_memory_ids": [20, 21], "no_hit": false, "gold_answer": "latency improved after moving to http and a remaining bug was the wrong abstention rule", "reference_time": 1776688200 }, { "query_id": "test_q4", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t9-current-endpoints", "text": "which browser am i using to inspect the benchmark output?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "test_q5", "split": "test", "category": "exact dated event", "namespace": "default", "timeline_id": "t4-reranker-plan", "text": "what reranker did i decide to use on april 17?", "gold_memory_ids": [13], "no_hit": false, "gold_answer": "bge-reranker-v2-m3", "reference_time": 1776425400 }, { "query_id": "test_q6", "split": "test", "category": "vague recent lookup", "namespace": "default", "timeline_id": "t5-llama-server-migration", "text": "what changed in model serving around llama-server?", "gold_memory_ids": [14, 15, 16], "no_hit": false, "gold_answer": "chat moved to llama-server, auto model discovery was added, and embeddings remained separate", "reference_time": 1776852000 }, { "query_id": "test_q7", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "why do we need better benchmark data now?", "gold_memory_ids": [18, 22], "no_hit": false, "gold_answer": "because the toy benchmark only proved plumbing and the eval set needs harder no-hit and ambiguity cases", "reference_time": 1776852000 }, { "query_id": "test_q8", "split": "dev", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what sort of retrieval work keeps coming up?", "gold_memory_ids": [10, 19, 23], "no_hit": false, "gold_answer": "retrieval plumbing, service separation, and aligning the live and benchmark retrieval path", "reference_time": 1776852000 },
{ "query_id": "test_q9", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t9-current-endpoints", "route_label": "tools", "text": "what gpu am i using for the reranker?", "required_tools": ["shell"], "gold_memory_ids": [], "no_hit": true, "reference_time": 1776852000 }, { "query_id": "test_q10", "split": "dev", "category": "exact recent event", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "why did i add progress logging to the benchmark harness?", "gold_memory_ids": [17], "no_hit": false, "gold_answer": "because silent hangs were confusing", "reference_time": 1776614400 }, { "query_id": "test_q11", "split": "test", "category": "conflict / update question", "namespace": "default", "timeline_id": "t7-latency-and-abstain", "text": "what was wrong with the benchmark after the reranker got fast?", "gold_memory_ids": [21], "no_hit": false, "gold_answer": "it still abstained on a positive query because it used first-stage distance instead of rerank confidence", "reference_time": 1776688200 }, { "query_id": "test_q12", "split": "dev", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": "t1-editor-and-embedding-split", "text": "what happened around the embedding split?", "gold_memory_ids": [2, 3, 4], "no_hit": false, "gold_answer": "the editor switched to zed, embeddings moved to a separate bge-m3 server, and exact retrieval stayed the first benchmark step", "reference_time": 1776160800 }, { "query_id": "dev_q17", "split": "dev", "category": "exact recent event", "namespace": "default", "timeline_id": "t10-support-calibration", "text": "what support threshold looked best in the focused support sweep?", "gold_memory_ids": [27], "no_hit": false, "gold_answer": "0.2", "reference_time": 1776862800 }, { "query_id": "dev_q18", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t10-support-calibration",
"text": "what changed after adding the support score to the policy?", "gold_memory_ids": [26, 27, 28], "no_hit": false, "gold_answer": "the policy added weighted lexical support, settled on score -6.0 / margin 0.0 / support 0.2, and removed no-hit false answers with some coverage loss", "reference_time": 1776864600 }, { "query_id": "dev_q19", "split": "dev", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "what kind of calibration fix are we avoiding because it would be brittle?", "gold_memory_ids": [29], "no_hit": false, "gold_answer": "many query-type guards", "reference_time": 1776870000 }, { "query_id": "dev_q20", "split": "dev", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "how should passive recall connect to the current calibration work?", "gold_memory_ids": [30, 31], "no_hit": false, "gold_answer": "it should reuse the same generic support features instead of a separate taxonomy", "reference_time": 1776870000 }, { "query_id": "dev_q21", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "route_label": "tools", "text": "which classifier model are we using to assign query types?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q22", "split": "dev", "category": "conflict / update question", "namespace": "default", "timeline_id": "t10-support-calibration", "text": "did the support-assisted sweep get coverage all the way back to the old 0.9167 balanced point?", "gold_memory_ids": [28], "no_hit": false, "gold_answer": "no, coverage settled at 0.8333", "reference_time": 1776864600 }, { "query_id": "dev_q23", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what editor am i using for my dotfiles repo these days?", "gold_memory_ids": [], "no_hit": true, 
"reference_time": 1776870000 }, { "query_id": "dev_q24", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "text": "what's the p99 rerank latency on the current machine?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q25", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "what's the commit hash for the router model we're using right now?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q26", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what rust edition is klbr using?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q27", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t10-support-calibration", "text": "what's the websocket port the daemon is listening on?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q28", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t8-better-eval-data", "text": "what's the default db filename the agent writes to?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q29", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t3-benchmark-scaffolding", "text": "what's the current p95 embedding latency on this machine?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q30", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t6-harness-observability", "text": "what's the sqlite-vec extension version we're using?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q31", "split": "dev", "category": "no-hit query", 
"namespace": "default", "timeline_id": "t11-support-vs-taxonomy", "text": "what's the name of the nearest-neighbor index backend we're using in production?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "dev_q32", "split": "dev", "category": "no-hit query", "namespace": "default", "timeline_id": "t2-error-propagation", "text": "what's the default watermark token budget for compaction?", "gold_memory_ids": [], "no_hit": true, "reference_time": 1776870000 }, { "query_id": "test_q13", "split": "test", "category": "exact recent event", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "what are we doing before passive recall work?", "gold_memory_ids": [32], "no_hit": false, "gold_answer": "expanding the eval dataset and rerunning the retrieval benchmarks", "reference_time": 1776871800 }, { "query_id": "test_q14", "split": "test", "category": "multi-evidence query", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "why are we expanding the dataset before treating thresholds as final?", "gold_memory_ids": [32, 33, 34], "no_hit": false, "gold_answer": "because the eval set needed stronger coverage and the thresholds should stay provisional until the larger dataset is benchmarked again", "reference_time": 1776875400 }, { "query_id": "test_q15", "split": "test", "category": "no-hit query", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "route_label": "tools", "text": "which postgres extension are we enabling before passive recall?", "required_tools": ["read_file"], "gold_memory_ids": [], "no_hit": true, "reference_time": 1776875400 }, { "query_id": "test_q16", "split": "test", "category": "recurring theme / pattern-like question", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "what kinds of eval cases did the starter set need more of?", "gold_memory_ids": [33], "no_hit": false, "gold_answer": "no-hit, conflict/update, and multi-evidence cases", "reference_time": 1776873600 },
{ "query_id": "test_q17", "split": "test", "category": "conflict / update question", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "are the current thresholds final now that we have one sweep?", "gold_memory_ids": [34], "no_hit": false, "gold_answer": "no, they should remain provisional until the larger dataset is benchmarked again", "reference_time": 1776875400 }, { "query_id": "test_q18", "split": "test", "category": "temporally ambiguous query", "namespace": "default", "timeline_id": "t12-expand-dataset-first", "text": "what was the sequence around the dataset expansion decision?", "gold_memory_ids": [32, 33, 34], "no_hit": false, "gold_answer": "the next step was to expand the dataset, it needed more hard cases, and thresholds stayed provisional until rerunning the larger benchmark", "reference_time": 1776875400 } ] }