cxs is a local-first CLI for searching Codex session logs. It is designed for progressive retrieval: find the right session first, then read
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat(eval): 增加 dogfood 验收门

Entire-Checkpoint: 3f789680b510

catoncat ced83413 f33f2bcc

+662 -1
+11
AGENTS.md
··· 95 95 - 全局 `cxs` 通过 `npx skills add` 更新;全局 `cxsd` 通过 symlink 跟随本地 repo 96 96 - 若 `cxs` 与 `cxsd` 行为不一致,先判断是“线上尚未发布”还是“dev skill 漂移”,不要直接覆盖任一通道 97 97 98 + ## Dogfood eval 边界 99 + 100 + Dogfood golden 是开发者本机的真实历史检索验收集,不是普通用户功能。 101 + 102 + - 通用 runner / schema 可以维护在 `eval/`,例如 `npm run eval:dogfood -- <goldens.local.jsonl>` 103 + - 私有 golden 默认放在 ignored 路径:`data/cxs-dogfood/goldens.local.jsonl` 104 + - 添加 / promote dogfood golden 只能在用户显式触发 dev-only skill 时进行 105 + - 本机 dev-only skill 路径:`~/.agents/skills/cxs-dogfood` 106 + - 不要把 dogfood capture 流程或私有 golden 放进 `skill-packages/cxs` / `skill-packages/cxsd` 107 + - 普通代码实现任务可以运行已有 dogfood gate,但不能自行新增 golden,也不能自行把 `candidate` promote 为 `hard` 108 + 98 109 ## 默认验证 99 110 100 111 涉及实现或文档真相变更时,至少做与改动直接相关的验证:
+7
docs/ROADMAP.md
··· 17 17 - `title_or_summary` 18 18 - `cwd` 19 19 - `snippet` 20 + - [eval/run-dogfood-eval.ts](/Users/envvar/work/repos/cxs/eval/run-dogfood-eval.ts) 已提供本机 dogfood golden runner: 21 + - 读取 ignored JSONL golden 文件 22 + - 检查 expected session / cwd / matchSource / context key phrase 23 + - `hard` 失败会以非零退出阻断本机 gate 24 + - `candidate` 失败只报告,不阻断 20 25 21 26 建议动作: 22 27 23 28 - 扩充真实 query 集 29 + - 继续用 dev-only `~/.agents/skills/cxs-dogfood` 手动策展本机 dogfood golden;不要把私有样本放进发行 skill package 24 30 - 增加更强断言: 25 31 - session 是否对 26 32 - `read-range` 是否给出有用上下文 27 33 - 是否命中关键 message / key phrase 28 34 - 继续复用现有: 29 35 - `npm run eval:manual` 36 + - `npm run eval:dogfood -- <goldens.local.jsonl>` 30 37 - `npm run eval:compare -- <before> <after>` 31 38 32 39 ### P1: 已补 session-level 字段召回
+99
eval/dogfood-eval-core.test.ts
import { describe, expect, test } from "vitest";
import { desiredContextMode, evaluateDogfoodItem, selectDogfoodHit } from "./dogfood-eval-core";
import type { DogfoodGolden } from "./dogfood-schema";
import type { FindResult } from "../src/types";

/** Fields of the canned golden entry that an individual test may vary. */
interface GoldenOverrides {
  status?: "candidate" | "hard" | "stale";
  acceptableSessionUuids?: string[];
  topK?: number;
  contextMustContain?: string[];
}

/** Fields of the canned find hit that an individual test may vary. */
interface HitOverrides {
  sessionUuid?: string;
  cwd?: string;
  matchSource?: "message" | "session";
  matchSeq?: number | null;
}

/** Builds a golden entry; status defaults to "hard", every expectation is opt-in. */
function golden(overrides: GoldenOverrides = {}): DogfoodGolden {
  const { contextMustContain } = overrides;
  const context = contextMustContain ? { mustContain: contextMustContain } : undefined;

  return {
    id: "cursor-ccursor-install-order",
    query: "Cursor ccursor 安装顺序",
    intent: "找回 Cursor 安装顺序",
    status: overrides.status ?? "hard",
    expected: {
      topK: overrides.topK,
      acceptableSessionUuids: overrides.acceptableSessionUuids,
      context,
    },
  };
}

/** Builds a find hit; defaults describe a message-level match at seq 0 in "session-a". */
function findResult(overrides: HitOverrides = {}): FindResult {
  const sessionOnly = overrides.matchSource === "session";
  // An explicit null must survive (session-only hits carry no seq); only "not given" maps to 0.
  const matchSeq = overrides.matchSeq !== undefined ? overrides.matchSeq : 0;

  return {
    rank: 1,
    sessionUuid: overrides.sessionUuid ?? "session-a",
    title: "title",
    summaryText: "",
    cwd: overrides.cwd ?? "/tmp/project",
    startedAt: "2026-04-22T00:00:00.000Z",
    endedAt: "2026-04-22T00:00:00.000Z",
    matchCount: 1,
    matchSource: overrides.matchSource ?? "message",
    matchSeq,
    matchRole: sessionOnly ? "session" : "user",
    matchTimestamp: "2026-04-22T00:00:00.000Z",
    score: 100,
    snippet: "title",
  };
}

describe("dogfood eval core", () => {
  test("selects an acceptable session inside topK instead of blindly using top1", () => {
    const hits = [findResult({ sessionUuid: "session-a" }), findResult({ sessionUuid: "session-b" })];
    const item = golden({ acceptableSessionUuids: ["session-b"], topK: 2 });

    const selected = selectDogfoodHit(item, hits);

    expect(selected.hit?.sessionUuid).toBe("session-b");
    expect(selected.rank).toBe(2);
  });

  test("fails a hard item when expected session is outside topK", () => {
    const hits = ["session-a", "session-b", "session-c"].map((sessionUuid) => findResult({ sessionUuid }));
    const item = golden({ status: "hard", acceptableSessionUuids: ["session-c"], topK: 2 });

    const evaluation = evaluateDogfoodItem({ item, results: hits });

    expect(evaluation.mark).toBe("fail");
    expect(evaluation.blocking).toBe(true);
  });

  test("candidate failures are reported but not blocking", () => {
    const item = golden({ status: "candidate", acceptableSessionUuids: ["missing"] });

    const evaluation = evaluateDogfoodItem({ item, results: [findResult({ sessionUuid: "session-a" })] });

    expect(evaluation.mark).toBe("fail");
    expect(evaluation.blocking).toBe(false);
  });

  test("checks context key phrases against actual read output", () => {
    const item = golden({ contextMustContain: ["official Cursor", "ccursor"] });

    const evaluation = evaluateDogfoodItem({
      item,
      results: [findResult({ sessionUuid: "session-a" })],
      contextKind: "read-page",
      contextText: "Install official Cursor, log in, then install ccursor.",
    });

    expect(evaluation.mark).toBe("pass");
  });

  test("uses read-range for message hits and read-page for session-only hits in auto mode", () => {
    const item = golden({ contextMustContain: ["decision"] });
    const messageHit = findResult({ matchSource: "message", matchSeq: 7 });
    const sessionHit = findResult({ matchSource: "session", matchSeq: null });

    expect(desiredContextMode(item, messageHit)).toBe("read-range");
    expect(desiredContextMode(item, sessionHit)).toBe("read-page");
  });
});
+129
eval/dogfood-eval-core.ts
import type { FindResult } from "../src/types";
import type { DogfoodGolden } from "./dogfood-schema";

/** Outcome of one golden; `skip` is only produced for stale entries. */
export type DogfoodMark = "pass" | "fail" | "skip";

/** One checked expectation together with what was actually observed. */
export interface DogfoodPredicateResult {
  label: "session_uuid" | "cwd" | "match_source" | "context";
  expected: string;
  actual: string;
  matched: boolean;
}

/** The find hit the evaluation is judged against, plus its 1-based rank and the window used. */
export interface SelectedDogfoodHit {
  hit: FindResult | null;
  rank: number | null;
  topK: number;
}

/** Everything needed to judge one golden: the entry, the find results and optional context read. */
export interface DogfoodEvaluationInput {
  item: DogfoodGolden;
  results: FindResult[];
  contextText?: string;
  contextKind?: "read-range" | "read-page";
  contextUnavailableReason?: string;
}

/** Verdict for one golden; `blocking` is true only for a failing `hard` entry. */
export interface DogfoodEvaluation {
  mark: DogfoodMark;
  blocking: boolean;
  selected: SelectedDogfoodHit;
  predicateResults: DogfoodPredicateResult[];
}

/**
 * Judges a single golden against the find results (and context text, when supplied).
 *
 * Stale entries are skipped without looking at the results. Otherwise the entry passes
 * only when at least one predicate was built and every predicate matched.
 */
export function evaluateDogfoodItem(input: DogfoodEvaluationInput): DogfoodEvaluation {
  const { item, results } = input;

  if (item.status === "stale") {
    const nothingSelected: SelectedDogfoodHit = { hit: null, rank: null, topK: item.expected.topK ?? 5 };
    return { mark: "skip", blocking: false, selected: nothingSelected, predicateResults: [] };
  }

  const selected = selectDogfoodHit(item, results);
  const predicateResults = buildPredicates(input, selected);
  // An entry with no predicates at all is treated as a failure, not a vacuous pass.
  const everythingMatched = predicateResults.length > 0 && predicateResults.every((entry) => entry.matched);
  const mark: DogfoodMark = everythingMatched ? "pass" : "fail";

  return {
    mark,
    blocking: item.status === "hard" && mark === "fail",
    selected,
    predicateResults,
  };
}

/**
 * Picks the hit to evaluate: the first acceptable session inside the top-K window,
 * falling back to the top-1 result (or null when there are no results).
 */
export function selectDogfoodHit(item: DogfoodGolden, results: FindResult[]): SelectedDogfoodHit {
  const topK = item.expected.topK ?? 5;
  const wanted = item.expected.acceptableSessionUuids ?? [];

  for (const [position, candidate] of results.slice(0, topK).entries()) {
    if (wanted.includes(candidate.sessionUuid)) {
      return { hit: candidate, rank: position + 1, topK };
    }
  }

  const first = results[0] ?? null;
  return { hit: first, rank: results.length > 0 ? 1 : null, topK };
}

/**
 * Decides which read command should supply context for the entry, or null when the
 * entry has no context phrases to check. In `auto` mode a numeric `matchSeq` selects
 * `read-range`; anything else selects `read-page`.
 */
export function desiredContextMode(item: DogfoodGolden, hit: FindResult | null): "read-range" | "read-page" | null {
  const spec = item.expected.context;
  if (!spec?.mustContain?.length) return null;

  const requested = spec.mode ?? "auto";
  if (requested !== "auto") return requested;

  const hasSeq = typeof hit?.matchSeq === "number";
  return hasSeq ? "read-range" : "read-page";
}

/** Builds the predicate list in fixed order: session_uuid, cwd, match_source, then one per context phrase. */
function buildPredicates(
  input: DogfoodEvaluationInput,
  selected: SelectedDogfoodHit,
): DogfoodPredicateResult[] {
  const { hit, rank, topK } = selected;
  const { expected } = input.item;
  const collected: DogfoodPredicateResult[] = [];

  const wanted = expected.acceptableSessionUuids ?? [];
  if (wanted.length > 0) {
    const insideWindow = (rank ?? Infinity) <= topK;
    collected.push({
      label: "session_uuid",
      expected: `one of ${wanted.join(", ")} in top ${topK}`,
      actual: hit ? `${hit.sessionUuid} at rank ${rank}` : "no results",
      matched: hit ? wanted.includes(hit.sessionUuid) && insideWindow : false,
    });
  }

  const cwdFragment = expected.cwdContains;
  if (cwdFragment) {
    const loweredFragment = cwdFragment.toLowerCase();
    collected.push({
      label: "cwd",
      expected: cwdFragment,
      actual: hit?.cwd ?? "no selected hit",
      matched: hit ? hit.cwd.toLowerCase().includes(loweredFragment) : false,
    });
  }

  const wantedSource = expected.matchSource;
  if (wantedSource) {
    collected.push({
      label: "match_source",
      expected: wantedSource,
      actual: hit?.matchSource ?? "no selected hit",
      matched: hit?.matchSource === wantedSource,
    });
  }

  // Context phrases are matched case-insensitively against whatever text was read.
  const text = input.contextText ?? "";
  const loweredText = text.toLowerCase();
  const phrases = expected.context?.mustContain ?? [];
  collected.push(
    ...phrases.map((phrase): DogfoodPredicateResult => ({
      label: "context",
      expected: phrase,
      actual: input.contextUnavailableReason ?? contextActual(input.contextKind, text),
      matched: loweredText.includes(phrase.toLowerCase()),
    })),
  );

  return collected;
}

/** Describes the context that was read without echoing it: its kind and size, or why it is empty. */
function contextActual(kind: string | undefined, text: string): string {
  if (text) return `${kind ?? "context"}: ${text.length} chars`;
  return kind ? `${kind}: empty context` : "context not read";
}
+192
eval/dogfood-schema.ts
import type { MatchSource } from "../src/types";

/** Lifecycle of a golden: `hard` gates, `candidate` only reports, `stale` is skipped. */
export type DogfoodStatus = "candidate" | "hard" | "stale";
export type DogfoodOriginKind = "observed-user-ask" | "evidence-backed-derived" | "manual";
export type DogfoodContextMode = "auto" | "read-range" | "read-page";

/** Optional provenance of a golden entry. */
export interface DogfoodOrigin {
  kind: DogfoodOriginKind;
  sourceSessionUuid?: string;
  sourceSeq?: number;
  note?: string;
}

/** How to read context for the selected hit and which phrases it must contain. */
export interface DogfoodExpectedContext {
  mode?: DogfoodContextMode;
  before?: number;
  after?: number;
  offset?: number;
  limit?: number;
  mustContain?: string[];
}

/** Assertions attached to a golden; at least one of them must be present. */
export interface DogfoodExpected {
  topK?: number;
  acceptableSessionUuids?: string[];
  cwdContains?: string;
  matchSource?: MatchSource;
  context?: DogfoodExpectedContext;
}

/** One validated line of a dogfood golden JSONL file. */
export interface DogfoodGolden {
  id: string;
  query: string;
  intent: string;
  status: DogfoodStatus;
  origin?: DogfoodOrigin;
  expected: DogfoodExpected;
}

/** Valid entries plus one `source:line: message` string per rejected line. */
export interface DogfoodParseResult {
  entries: DogfoodGolden[];
  errors: string[];
}

/**
 * Parses a dogfood golden JSONL document.
 *
 * Blank lines and lines starting with `#` are ignored. Every other line must be a JSON
 * object that passes validation; failures are collected in `errors` (prefixed with
 * `sourceName:lineNumber`, 1-based) instead of being thrown.
 *
 * @param text - Raw file content.
 * @param sourceName - Label used as the prefix of error messages.
 * @returns The valid entries and the collected error messages.
 */
export function parseDogfoodJsonl(text: string, sourceName: string): DogfoodParseResult {
  const entries: DogfoodGolden[] = [];
  const errors: string[] = [];

  for (const [index, rawLine] of text.split(/\r?\n/).entries()) {
    const lineNumber = index + 1;
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;

    try {
      const parsed: unknown = JSON.parse(line);
      const validation = validateDogfoodGolden(parsed);
      if (validation.ok) {
        entries.push(validation.value);
      } else {
        errors.push(`${sourceName}:${lineNumber}: ${validation.error}`);
      }
    } catch (error) {
      // Only JSON.parse is expected to throw here; report it against the offending line.
      const message = error instanceof Error ? error.message : String(error);
      errors.push(`${sourceName}:${lineNumber}: invalid JSONL entry: ${message}`);
    }
  }

  return { entries, errors };
}

/** Validates one decoded JSON value and normalises it into a `DogfoodGolden`. */
function validateDogfoodGolden(
  value: unknown,
): { ok: true; value: DogfoodGolden } | { ok: false; error: string } {
  if (!isRecord(value)) return { ok: false, error: "entry must be an object" };

  const id = readNonEmptyString(value, "id");
  const query = readNonEmptyString(value, "query");
  const intent = readNonEmptyString(value, "intent");
  if (!id || !query || !intent) {
    return { ok: false, error: "id, query and intent are required non-empty strings" };
  }

  const status = value.status;
  if (status !== "candidate" && status !== "hard" && status !== "stale") {
    return { ok: false, error: "status must be candidate, hard or stale" };
  }

  const expected = parseExpected(value.expected);
  if (!expected) return { ok: false, error: "expected must contain at least one assertion" };

  const origin = parseOrigin(value.origin);
  if (origin === "invalid") return { ok: false, error: "origin is invalid" };

  return { ok: true, value: { id, query, intent, status, expected, ...(origin ? { origin } : {}) } };
}

/**
 * Reads the `expected` object. Fields with an unusable type are left out; the result is
 * null when the value is not an object or no assertion survives.
 */
function parseExpected(value: unknown): DogfoodExpected | null {
  if (!isRecord(value)) return null;

  const expected: DogfoodExpected = {};
  const topK = readPositiveInteger(value.topK);
  if (topK) expected.topK = topK;

  const acceptableSessionUuids = readStringArray(value.acceptableSessionUuids);
  if (acceptableSessionUuids) expected.acceptableSessionUuids = acceptableSessionUuids;

  const cwdContains = readNonEmptyString(value, "cwdContains");
  if (cwdContains) expected.cwdContains = cwdContains;

  if (value.matchSource === "message" || value.matchSource === "session") {
    expected.matchSource = value.matchSource;
  }

  const context = parseContext(value.context);
  if (context) expected.context = context;

  return hasExpectedAssertion(expected) ? expected : null;
}

/** Reads the `expected.context` object; returns undefined when nothing usable is present. */
function parseContext(value: unknown): DogfoodExpectedContext | undefined {
  if (!isRecord(value)) return undefined;
  const context: DogfoodExpectedContext = {};

  if (value.mode === "auto" || value.mode === "read-range" || value.mode === "read-page") {
    context.mode = value.mode;
  }
  // 0 is a meaningful window size ("no surrounding messages"), so before/after accept any
  // non-negative integer; previously an explicit 0 was dropped and the runner default applied.
  const before = readNonNegativeInteger(value.before);
  if (before !== undefined) context.before = before;
  const after = readNonNegativeInteger(value.after);
  if (after !== undefined) context.after = after;
  const offset = readNonNegativeInteger(value.offset);
  if (offset !== undefined) context.offset = offset;
  const limit = readPositiveInteger(value.limit);
  if (limit) context.limit = limit;
  const mustContain = readStringArray(value.mustContain);
  if (mustContain) context.mustContain = mustContain;

  return Object.keys(context).length > 0 ? context : undefined;
}

/** Reads the optional `origin`; a present value that is not an object with a known kind is "invalid". */
function parseOrigin(value: unknown): DogfoodOrigin | undefined | "invalid" {
  if (value === undefined) return undefined;
  if (!isRecord(value)) return "invalid";
  if (
    value.kind !== "observed-user-ask"
    && value.kind !== "evidence-backed-derived"
    && value.kind !== "manual"
  ) {
    return "invalid";
  }

  const origin: DogfoodOrigin = { kind: value.kind };
  const sourceSessionUuid = readNonEmptyString(value, "sourceSessionUuid");
  if (sourceSessionUuid) origin.sourceSessionUuid = sourceSessionUuid;
  if (typeof value.sourceSeq === "number" && Number.isInteger(value.sourceSeq)) origin.sourceSeq = value.sourceSeq;
  const note = readNonEmptyString(value, "note");
  if (note) origin.note = note;
  return origin;
}

/** True when at least one checkable assertion is present. */
function hasExpectedAssertion(expected: DogfoodExpected): boolean {
  return Boolean(
    expected.acceptableSessionUuids?.length
      || expected.cwdContains
      || expected.matchSource
      || expected.context?.mustContain?.length,
  );
}

/** Returns the trimmed string stored under `key`, or undefined when it is missing, non-string or blank. */
function readNonEmptyString(record: Record<string, unknown>, key: string): string | undefined {
  const value = record[key];
  return typeof value === "string" && value.trim() ? value.trim() : undefined;
}

/** Returns the trimmed non-blank string items of an array, or undefined when none remain. */
function readStringArray(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) return undefined;
  const items = value
    .filter((item): item is string => typeof item === "string" && Boolean(item.trim()))
    .map((item) => item.trim());
  return items.length > 0 ? items : undefined;
}

/** Returns the value when it is an integer greater than zero. */
function readPositiveInteger(value: unknown): number | undefined {
  return typeof value === "number" && Number.isInteger(value) && value > 0 ? value : undefined;
}

/** Returns the value when it is an integer greater than or equal to zero. */
function readNonNegativeInteger(value: unknown): number | undefined {
  return typeof value === "number" && Number.isInteger(value) && value >= 0 ? value : undefined;
}

/** Narrowing guard for plain JSON objects (excludes null and arrays). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
+222
eval/run-dogfood-eval.ts
#!/usr/bin/env -S node --import tsx

import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { spawn as childSpawn } from "node:child_process";
import { basename, join, relative, resolve } from "node:path";
import { desiredContextMode, evaluateDogfoodItem, type DogfoodEvaluation } from "./dogfood-eval-core";
import { parseDogfoodJsonl, type DogfoodGolden } from "./dogfood-schema";
import type { FindResult } from "../src/types";

/** Shape of `cxs find --json` that this runner relies on. */
interface FindOutput {
  query: string;
  results: FindResult[];
}

/** Parsed command-line arguments. */
interface Args {
  goldenPath: string;
  includeStale: boolean;
}

/** Aggregate counters for one batch; `hardFail` drives the process exit code. */
interface Scoreboard {
  total: number;
  pass: number;
  fail: number;
  skip: number;
  hardFail: number;
  candidateFail: number;
}

/** Everything recorded about one evaluated golden. */
interface DogfoodEvalRow {
  item: DogfoodGolden;
  evaluation: DogfoodEvaluation;
  top1Title: string;
  selectedTitle: string;
  findJsonPath: string;
  contextTxtPath?: string;
}

const ROOT = resolve(import.meta.dirname, "..");
const CLI_ENTRY = resolve(ROOT, "src", "cli.ts");
const OUT_BASE = resolve(ROOT, "data", "cxs-dogfood-eval");
const args = parseArgs(process.argv.slice(2));

// A golden file with any invalid line is rejected as a whole before anything is run.
const parsed = parseDogfoodJsonl(readFileSync(args.goldenPath, "utf8"), args.goldenPath);
if (parsed.errors.length > 0) {
  console.error(parsed.errors.join("\n"));
  process.exit(1);
}

const entries = parsed.entries.filter((entry) => args.includeStale || entry.status !== "stale");
// ISO timestamp with ":" and "." replaced so it is usable as a directory name.
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
const outDir = join(OUT_BASE, stamp);
mkdirSync(outDir, { recursive: true });

const rows: DogfoodEvalRow[] = [];

for (const [index, item] of entries.entries()) {
  const prefix = String(index + 1).padStart(2, "0");
  const safeId = item.id.replace(/[^a-zA-Z0-9_.-]+/g, "-");
  // Always fetch at least 5 results so the report shows some neighbours even for a small topK.
  const limit = Math.max(item.expected.topK ?? 5, 5);

  const findJson = await runCommand([process.execPath, "--import", "tsx", CLI_ENTRY, "find", item.query, "--limit", String(limit), "--json"]);
  const findText = await runCommand([process.execPath, "--import", "tsx", CLI_ENTRY, "find", item.query, "--limit", String(limit)]);
  const findJsonPath = join(outDir, `${prefix}-${safeId}.find.json`);
  const findTxtPath = join(outDir, `${prefix}-${safeId}.find.txt`);
  writeFileSync(findJsonPath, findJson);
  writeFileSync(findTxtPath, findText);

  const parsedFind = parseFindOutput(findJson, item.query);
  // First pass only decides which hit to read context for; the second pass scores with that context.
  const preselected = evaluateDogfoodItem({ item, results: parsedFind.results }).selected;
  const context = await readContextIfNeeded(item, preselected.hit, prefix, safeId, outDir);
  const evaluation = evaluateDogfoodItem({
    item,
    results: parsedFind.results,
    contextText: context.text,
    contextKind: context.kind,
    contextUnavailableReason: context.unavailableReason,
  });

  rows.push({
    item,
    evaluation,
    top1Title: parsedFind.results[0]?.title ?? "(none)",
    selectedTitle: evaluation.selected.hit?.title ?? "(none)",
    findJsonPath,
    contextTxtPath: context.textPath,
  });
}

const scoreboard = buildScoreboard(rows);
const readmePath = join(outDir, "README.md");
const scorecardPath = join(outDir, "scorecard.json");
writeFileSync(readmePath, renderReadme(args.goldenPath, scoreboard, rows));
writeFileSync(scorecardPath, `${JSON.stringify({ source: args.goldenPath, scoreboard, rows }, null, 2)}\n`);

console.log(JSON.stringify({ outDir, readme: readmePath, scorecard: scorecardPath, scoreboard }, null, 2));
// Only failing `hard` goldens make the run exit non-zero.
if (scoreboard.hardFail > 0) process.exitCode = 1;

/**
 * Decodes the `find --json` output and checks that it carries a `results` array,
 * so a changed or broken CLI output fails with a clear message instead of a later TypeError.
 *
 * @throws SyntaxError when `raw` is not JSON; Error when `results` is missing or not an array.
 */
function parseFindOutput(raw: string, query: string): FindOutput {
  const decoded: unknown = JSON.parse(raw);
  const results = typeof decoded === "object" && decoded !== null
    ? (decoded as { results?: unknown }).results
    : undefined;
  if (!Array.isArray(results)) {
    throw new Error(`unexpected find --json output for query "${query}": missing results array`);
  }
  return decoded as FindOutput;
}

/**
 * Reads context for the selected hit when the golden asks for context phrases.
 *
 * Runs the read command twice (JSON and text), stores both next to the find output and
 * returns the text form. Returns an `unavailableReason` instead when no hit was selected
 * or `read-range` was requested for a hit without a numeric `matchSeq`.
 */
async function readContextIfNeeded(
  item: DogfoodGolden,
  hit: FindResult | null,
  prefix: string,
  safeId: string,
  outDir: string,
): Promise<{ kind?: "read-range" | "read-page"; text?: string; textPath?: string; unavailableReason?: string }> {
  const mode = desiredContextMode(item, hit);
  if (!mode) return {};
  if (!hit) return { unavailableReason: "no selected hit for context read" };
  if (mode === "read-range" && typeof hit.matchSeq !== "number") {
    return { kind: "read-range", unavailableReason: "selected hit has no numeric matchSeq" };
  }

  const command = buildContextCommand(item, hit, mode);
  const contextJson = await runCommand([...command, "--json"]);
  const contextText = await runCommand(command);
  const jsonPath = join(outDir, `${prefix}-${safeId}.${mode}.json`);
  const txtPath = join(outDir, `${prefix}-${safeId}.${mode}.txt`);
  writeFileSync(jsonPath, contextJson);
  writeFileSync(txtPath, contextText);
  return { kind: mode, text: contextText, textPath: txtPath };
}

/** Builds the argv for the context read; defaults are before/after 2 and offset 0 / limit 20. */
function buildContextCommand(item: DogfoodGolden, hit: FindResult, mode: "read-range" | "read-page"): string[] {
  const context = item.expected.context ?? {};
  if (mode === "read-range") {
    return [
      process.execPath, "--import", "tsx", CLI_ENTRY,
      "read-range", hit.sessionUuid,
      "--seq", String(hit.matchSeq),
      "--before", String(context.before ?? 2),
      "--after", String(context.after ?? 2),
    ];
  }

  return [
    process.execPath, "--import", "tsx", CLI_ENTRY,
    "read-page", hit.sessionUuid,
    "--offset", String(context.offset ?? 0),
    "--limit", String(context.limit ?? 20),
  ];
}

/** Tallies marks per row and counts failures separately for `hard` and `candidate` goldens. */
function buildScoreboard(rows: Array<{ item: DogfoodGolden; evaluation: DogfoodEvaluation }>): Scoreboard {
  const scoreboard: Scoreboard = { total: rows.length, pass: 0, fail: 0, skip: 0, hardFail: 0, candidateFail: 0 };
  for (const row of rows) {
    scoreboard[row.evaluation.mark] += 1;
    if (row.item.status === "hard" && row.evaluation.mark === "fail") scoreboard.hardFail += 1;
    if (row.item.status === "candidate" && row.evaluation.mark === "fail") scoreboard.candidateFail += 1;
  }
  return scoreboard;
}

/** Renders the batch report: summary counters, an overview table, then one section per golden. */
function renderReadme(
  sourcePath: string,
  scoreboard: Scoreboard,
  rows: DogfoodEvalRow[],
): string {
  const lines = [
    "# cxs dogfood eval batch",
    "",
    `- generated_at: ${new Date().toISOString()}`,
    `- source: \`${sourcePath}\``,
    `- source_file: \`${basename(sourcePath)}\``,
    "",
    "## summary",
    "",
    `- total: ${scoreboard.total}`,
    `- pass: ${scoreboard.pass}`,
    `- fail: ${scoreboard.fail}`,
    `- skip: ${scoreboard.skip}`,
    `- hard_fail: ${scoreboard.hardFail}`,
    `- candidate_fail: ${scoreboard.candidateFail}`,
    "",
    "| id | status | mark | blocking | selected_rank | selected_title |",
    "|----|--------|------|----------|---------------|----------------|",
  ];

  for (const row of rows) {
    lines.push(`| ${tableCell(row.item.id)} | ${row.item.status} | ${row.evaluation.mark} | ${row.evaluation.blocking} | ${row.evaluation.selected.rank ?? "-"} | ${tableCell(row.selectedTitle).slice(0, 60)} |`);
  }

  for (const row of rows) {
    lines.push("", `## ${row.item.id}`, "");
    lines.push(`- intent: ${row.item.intent}`);
    lines.push(`- query: ${row.item.query}`);
    lines.push(`- status: ${row.item.status}`);
    lines.push(`- mark: ${row.evaluation.mark}`);
    lines.push(`- top1_title: ${row.top1Title}`);
    lines.push(`- selected_title: ${row.selectedTitle}`);
    lines.push(`- find_json: \`${rel(row.findJsonPath)}\``);
    if (row.contextTxtPath) lines.push(`- context_txt: \`${rel(row.contextTxtPath)}\``);
    lines.push(`- predicates: ${formatPredicates(row.evaluation.predicateResults)}`);
  }

  return lines.join("\n");
}

/** Replaces "|" so free text cannot break the markdown table layout. */
function tableCell(text: string): string {
  return text.replaceAll("|", "¦");
}

/** One-line summary of predicate outcomes, e.g. `cwd=ok(repo), context=miss(phrase)`. */
function formatPredicates(predicates: DogfoodEvaluation["predicateResults"]): string {
  if (predicates.length === 0) return "(none)";
  return predicates.map((predicate) => `${predicate.label}=${predicate.matched ? "ok" : "miss"}(${predicate.expected})`).join(", ");
}

/** Path relative to the repository root, independent of the platform's path separator. */
function rel(path: string): string {
  return relative(ROOT, path);
}

/** Parses argv: the first non-flag argument is the golden file; exits with usage when it is missing. */
function parseArgs(argv: string[]): Args {
  const includeStale = argv.includes("--include-stale");
  const goldenPath = argv.find((arg) => !arg.startsWith("--"));
  if (!goldenPath) {
    console.error("usage: npm run eval:dogfood -- <goldens.local.jsonl> [--include-stale]");
    process.exit(1);
  }
  return { goldenPath: resolve(goldenPath), includeStale };
}

/**
 * Runs a command from the repository root and resolves with its stdout.
 * Rejects when the process cannot be spawned or exits non-zero (stderr, else stdout, is included).
 */
function runCommand(args: string[]): Promise<string> {
  return new Promise((resolve, reject) => {
    const proc = childSpawn(args[0]!, args.slice(1), { cwd: ROOT, stdio: ["ignore", "pipe", "pipe"] });
    let stdout = "";
    let stderr = "";
    proc.stdout!.setEncoding("utf8");
    proc.stderr!.setEncoding("utf8");
    proc.stdout!.on("data", (chunk: string) => { stdout += chunk; });
    proc.stderr!.on("data", (chunk: string) => { stderr += chunk; });
    proc.on("error", reject);
    proc.on("close", (code) => {
      if (code === 0) resolve(stdout);
      else reject(new Error(`command failed: ${args.join(" ")}\n${stderr || stdout}`));
    });
  });
}
+2 -1
package.json
··· 46 46 "cxs": "tsx ./src/cli.ts", 47 47 "eval:manual": "tsx ./eval/run-manual-eval.ts", 48 48 "eval:compare": "tsx ./eval/compare-eval-batches.ts", 49 - "eval:perf": "tsx ./eval/perf-bench.ts" 49 + "eval:perf": "tsx ./eval/perf-bench.ts", 50 + "eval:dogfood": "tsx ./eval/run-dogfood-eval.ts" 50 51 }, 51 52 "dependencies": { 52 53 "better-sqlite3": "^12.9.0",