feat: add LLM experiment infrastructure · chadfowler.com/phoenix@a934e65

+226

experiments/eval-runner-llm.ts

#!/usr/bin/env npx tsx
/**
 * LLM Evaluation Runner — Fixed harness for LLM canonicalization experiments.
 *
 * DO NOT MODIFY THIS FILE during experiments.
 * The agent modifies only src/experiment-config.ts (LLM_* params).
 *
 * Usage: npx tsx experiments/eval-runner-llm.ts [--json] [--no-log]
 *
 *   --json    print a single JSON document on stdout (banner and progress go to stderr)
 *   --no-log  do not append a row to experiments/results-llm.tsv
 *
 * Requires ANTHROPIC_API_KEY in environment.
 */

import { readFileSync, appendFileSync, existsSync } from 'node:fs';
import { resolve } from 'node:path';
import { parseSpec } from '../src/spec-parser.js';
import { extractCanonicalNodesLLM, extractWithLLMFull } from '../src/canonicalizer-llm.js';
import { extractCandidates } from '../src/canonicalizer.js';
import { resolveProvider } from '../src/llm/resolve.js';
import { GOLD_SPECS, type GoldSpec } from '../tests/eval/gold-standard.js';
import type { CanonicalNode } from '../src/models/canonical.js';
import { CONFIG } from '../src/experiment-config.js';

const ROOT = resolve(import.meta.dirname, '..');
const RESULTS_FILE = resolve(ROOT, 'experiments/results-llm.tsv');

// ─── CLI flags ──────────────────────────────────────────────────────────────
// Parsed before anything is printed so that --json can keep stdout clean.

const args = process.argv.slice(2);
const jsonMode = args.includes('--json');
const noLog = args.includes('--no-log');

// ─── LLM setup ──────────────────────────────────────────────────────────────

const llm = resolveProvider();
if (!llm) {
  console.error('ERROR: No LLM provider available. Set ANTHROPIC_API_KEY.');
  process.exit(1);
}
// Alias declared after the guard so its type is the narrowed (non-null)
// provider; function declarations below cannot see the narrowing on `llm`.
const provider = llm;

const bannerLines = [
  `Using LLM: ${provider.name} / ${provider.model}`,
  `Mode: ${CONFIG.LLM_MODE}, self-consistency k=${CONFIG.LLM_SELF_CONSISTENCY_K}`,
];
for (const line of bannerLines) {
  // In JSON mode stdout must contain nothing but the JSON document.
  if (jsonMode) console.error(line);
  else console.log(line);
}

// ─── Metrics ────────────────────────────────────────────────────────────────

/**
 * Reads one gold spec from disk, parses it into clauses and runs the
 * extraction pipeline selected by `CONFIG.LLM_MODE`.
 *
 * @param spec Gold spec descriptor; `spec.path` is resolved relative to the repo root.
 * @returns The extracted canonical nodes and the mean `coverage_pct` reported by
 *          the rule-based `extractCandidates` pass (0 when it reports no entries).
 */
async function loadAndExtract(spec: GoldSpec) {
  const text = readFileSync(resolve(ROOT, spec.path), 'utf8');
  const clauses = parseSpec(text, spec.docId);
  const { coverage } = extractCandidates(clauses);

  const nodes: CanonicalNode[] =
    CONFIG.LLM_MODE === 'extractor'
      ? await extractWithLLMFull(clauses, provider)
      : await extractCanonicalNodesLLM(clauses, provider, {
          selfConsistencyK: CONFIG.LLM_SELF_CONSISTENCY_K,
        });

  const avgCoverage =
    coverage.length > 0
      ? coverage.reduce((sum, c) => sum + c.coverage_pct, 0) / coverage.length
      : 0;
  return { nodes, avgCoverage };
}

/**
 * Returns the first node whose statement contains `substringMatch`
 * (case-insensitive), or `undefined` when none does.
 */
function findNode(nodes: CanonicalNode[], substringMatch: string): CanonicalNode | undefined {
  const needle = substringMatch.toLowerCase();
  return nodes.find((node) => node.statement.toLowerCase().includes(needle));
}

/** Per-spec evaluation results. All rates are fractions in [0, 1] unless noted. */
interface SpecMetrics {
  /** Fraction of expected nodes that were found by substring match. */
  recall: number;
  /** Among found expected nodes, the fraction whose type matches. */
  typeAccuracy: number;
  /** Mean `coverage_pct` from the rule-based pass (a percentage, not a fraction). */
  coverage: number;
  /** Fraction of expected edges that are present with the expected type. */
  linkPrecision: number;
  /** Fraction of all `link_types` entries that are `'relates_to'`. */
  resDRate: number;
  /** Fraction of nodes with no linked canon ids. */
  orphanRate: number;
  /** Fraction of non-CONTEXT nodes that have a `parent_canon_id`. */
  hierCoverage: number;
  /** Largest `linked_canon_ids` length over all nodes (0 when there are none). */
  maxDegree: number;
  /** Number of extracted nodes. */
  nodeCount: number;
}

/**
 * Scores the extracted `nodes` against the expectations recorded in `spec`.
 *
 * @param spec        Gold spec providing `expectedNodes` and `expectedEdges`.
 * @param nodes       Nodes produced by the extraction pipeline.
 * @param avgCoverage Coverage percentage passed through unchanged into the result.
 */
function computeMetrics(spec: GoldSpec, nodes: CanonicalNode[], avgCoverage: number): SpecMetrics {
  // Node recall and type accuracy.
  let found = 0;
  let typeCorrect = 0;
  for (const expected of spec.expectedNodes) {
    const node = findNode(nodes, expected.statement);
    if (!node) continue;
    found++;
    if (node.type === expected.type) typeCorrect++;
  }
  // A spec with no expectations counts as fully recalled.
  const recall = spec.expectedNodes.length > 0 ? found / spec.expectedNodes.length : 1;
  const typeAccuracy = found > 0 ? typeCorrect / found : 0;

  // Expected edges: the link may be recorded on either endpoint.
  let edgesFound = 0;
  for (const expected of spec.expectedEdges) {
    const from = findNode(nodes, expected.from);
    const to = findNode(nodes, expected.to);
    if (!from || !to) continue;
    const isLinked =
      from.linked_canon_ids.includes(to.canon_id) || to.linked_canon_ids.includes(from.canon_id);
    if (!isLinked) continue;
    const edgeType = from.link_types?.[to.canon_id] || to.link_types?.[from.canon_id];
    if (edgeType === expected.type) edgesFound++;
  }
  const linkPrecision = spec.expectedEdges.length > 0 ? edgesFound / spec.expectedEdges.length : 1;

  // Share of generic 'relates_to' links among all typed links.
  let totalEdges = 0;
  let relatesToEdges = 0;
  for (const node of nodes) {
    for (const edgeType of Object.values(node.link_types ?? {})) {
      totalEdges++;
      if (edgeType === 'relates_to') relatesToEdges++;
    }
  }
  const resDRate = totalEdges > 0 ? relatesToEdges / totalEdges : 0;

  const orphanCount = nodes.filter((node) => node.linked_canon_ids.length === 0).length;
  const orphanRate = nodes.length > 0 ? orphanCount / nodes.length : 0;

  const nonContext = nodes.filter((node) => node.type !== 'CONTEXT');
  const withParent = nonContext.filter((node) => node.parent_canon_id).length;
  const hierCoverage = nonContext.length > 0 ? withParent / nonContext.length : 0;

  const maxDegree = Math.max(0, ...nodes.map((node) => node.linked_canon_ids.length));

  return {
    recall,
    typeAccuracy,
    coverage: avgCoverage,
    linkPrecision,
    resDRate,
    orphanRate,
    hierCoverage,
    maxDegree,
    nodeCount: nodes.length,
  };
}

/**
 * Weighted composite of the averaged metrics.
 *
 * `avgCoverage` is a percentage (0-100) and is scaled to a fraction here;
 * `avgDRate` is inverted so that fewer 'relates_to' links score higher.
 */
function compositeScore(avgRecall: number, avgTypeAcc: number, avgCoverage: number, avgDRate: number, avgHier: number): number {
  return (
    0.30 * avgRecall +
    0.25 * avgTypeAcc +
    0.20 * (avgCoverage / 100) +
    0.15 * (1 - avgDRate) +
    0.10 * avgHier
  );
}

// ─── Table rendering ────────────────────────────────────────────────────────

/** Content width of each table column (excluding the one-space padding on each side). */
const COLUMN_WIDTHS: readonly number[] = [18, 6, 7, 5, 5, 5, 5];

/** Number of characters between the left and right outer borders. */
const INNER_WIDTH = COLUMN_WIDTHS.reduce((sum, w) => sum + w + 2, 0) + COLUMN_WIDTHS.length - 1;

/** Builds a horizontal rule with the given left, column-junction and right characters. */
function tableRule(left: string, junction: string, right: string): string {
  return left + COLUMN_WIDTHS.map((w) => '═'.repeat(w + 2)).join(junction) + right;
}

/** Builds one table row; each cell is padded on the right to its column width. */
function tableRow(cells: readonly string[]): string {
  const padded = cells.map((cell, i) => ` ${cell.padEnd(COLUMN_WIDTHS[i] ?? 0)} `);
  return '║' + padded.join('│') + '║';
}

/** Formats a fraction as a whole-number percentage, right-aligned to `width` digits plus '%'. */
function pct(fraction: number, width: number): string {
  return (fraction * 100).toFixed(0).padStart(width) + '%';
}

// ─── Main ───────────────────────────────────────────────────────────────────

const allMetrics: { name: string; metrics: SpecMetrics }[] = [];

for (const spec of GOLD_SPECS) {
  try {
    process.stderr.write(`  Evaluating ${spec.name}...`);
    const { nodes, avgCoverage } = await loadAndExtract(spec);
    const metrics = computeMetrics(spec, nodes, avgCoverage);
    allMetrics.push({ name: spec.name, metrics });
    process.stderr.write(` done (${nodes.length} nodes)\n`);
  } catch (e) {
    // A failed spec is scored as worst-case rather than dropped, so failures
    // lower the composite score instead of silently shrinking the denominator.
    process.stderr.write(` FAILED: ${e}\n`);
    allMetrics.push({
      name: spec.name,
      metrics: { recall: 0, typeAccuracy: 0, coverage: 0, linkPrecision: 0, resDRate: 1, orphanRate: 1, hierCoverage: 0, maxDegree: 0, nodeCount: 0 },
    });
  }
}

// Without any evaluated spec every average would be 0/0 = NaN and a NaN row
// would be appended to the results log; fail loudly instead.
if (allMetrics.length === 0) {
  console.error('ERROR: No gold specs were evaluated.');
  process.exit(1);
}

// Aggregates
const count = allMetrics.length;
const mean = (pick: (m: SpecMetrics) => number): number =>
  allMetrics.reduce((sum, entry) => sum + pick(entry.metrics), 0) / count;

const avgRecall = mean((m) => m.recall);
const avgTypeAcc = mean((m) => m.typeAccuracy);
const avgCoverage = mean((m) => m.coverage);
const avgDRate = mean((m) => m.resDRate);
const avgHier = mean((m) => m.hierCoverage);
const avgOrphan = mean((m) => m.orphanRate);
const score = compositeScore(avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier);

if (jsonMode) {
  console.log(JSON.stringify({ score, avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier, avgOrphan, perSpec: allMetrics }, null, 2));
} else {
  console.log('\n' + '╔' + '═'.repeat(INNER_WIDTH) + '╗');
  console.log('║' + ' PHOENIX CANONICALIZATION — LLM EXPERIMENT EVAL'.padEnd(INNER_WIDTH) + '║');
  console.log(tableRule('╠', '═', '╣'));
  console.log(tableRow(['Spec', 'Recall', 'TypeAcc', 'Cover', 'ResD%', 'Hier%', 'Nodes']));
  console.log(tableRule('╠', '╪', '╣'));

  for (const { name, metrics: m } of allMetrics) {
    console.log(
      tableRow([
        name,
        pct(m.recall, 5),
        pct(m.typeAccuracy, 6),
        m.coverage.toFixed(0).padStart(4) + '%', // already a percentage
        pct(m.resDRate, 4),
        pct(m.hierCoverage, 4),
        String(m.nodeCount).padStart(5),
      ]),
    );
  }

  console.log(tableRule('╠', '╪', '╣'));
  console.log(
    tableRow([
      'AVERAGE',
      pct(avgRecall, 5),
      pct(avgTypeAcc, 6),
      avgCoverage.toFixed(0).padStart(4) + '%',
      pct(avgDRate, 4),
      pct(avgHier, 4),
      '', // no aggregate node count
    ]),
  );
  console.log('╚' + '═'.repeat(INNER_WIDTH) + '╝');
  console.log(`\n  COMPOSITE SCORE: ${score.toFixed(4)}`);
  console.log(`  Mode: ${CONFIG.LLM_MODE} | Model: ${provider.model} | k=${CONFIG.LLM_SELF_CONSISTENCY_K}`);
  console.log(`  Rule-based baseline: 0.9635`);
}

// ─── Log results ────────────────────────────────────────────────────────────

if (!noLog) {
  const timestamp = new Date().toISOString();
  const header = 'timestamp\tscore\trecall\ttype_acc\tcoverage\td_rate\thier\torphan\tmode\tmodel\tk';

  // Write the column header only when creating the file.
  if (!existsSync(RESULTS_FILE)) {
    appendFileSync(RESULTS_FILE, header + '\n');
  }

  const row = [
    timestamp,
    score.toFixed(4),
    (avgRecall * 100).toFixed(1),
    (avgTypeAcc * 100).toFixed(1),
    avgCoverage.toFixed(1),
    (avgDRate * 100).toFixed(1),
    (avgHier * 100).toFixed(1),
    (avgOrphan * 100).toFixed(1),
    CONFIG.LLM_MODE,
    provider.model,
    String(CONFIG.LLM_SELF_CONSISTENCY_K),
  ].join('\t');

  appendFileSync(RESULTS_FILE, row + '\n');
  if (!jsonMode) console.log(`\n  Results appended to experiments/results-llm.tsv`);
}

// Machine-readable final line parsed by the experiment agent.
if (!jsonMode) console.log(`\nval_score=${score.toFixed(4)}`);

+71

experiments/program-llm.md

# Phoenix LLM Canonicalization — Experiment Program

You are an autonomous research agent optimizing Phoenix's LLM-enhanced canonicalization pipeline.

## Rules

1. **Edit ONLY `src/experiment-config.ts`** — only the `LLM_*` parameters
2. **Run `npx tsx experiments/eval-runner-llm.ts`** after every change
3. **Parse the composite score** from the last line: `val_score=X.XXXX`
4. **If score improved** → `git add src/experiment-config.ts && git commit -m "llm-experiment: <description> score=X.XXXX"`
5. **If score decreased or unchanged** → `git checkout src/experiment-config.ts` (revert)
6. **Never stop to ask the human** — keep running experiments until interrupted or until the per-session limit under "Cost Awareness" is reached
7. **Never install packages** — work within existing dependencies
8. **Log your reasoning** in commit messages

## Baseline

Rule-based pipeline score: **0.9635** (the target to beat)

## Available LLM Parameters

### Mode Selection
- `LLM_MODE` — `'normalizer'` (rule extraction + LLM polish) or `'extractor'` (full LLM extraction)
- `LLM_MODEL` — model ID (currently `'claude-sonnet-4-20250514'`)

### Normalizer Mode
- `LLM_NORMALIZER_TEMPERATURE` — temperature for single-shot normalization (currently 0)
- `LLM_NORMALIZER_MAX_TOKENS` — max response tokens (currently 150)
- `LLM_NORMALIZER_SYSTEM` — system prompt for normalization
- `LLM_SELF_CONSISTENCY_K` — number of samples for self-consistency (1 = disabled)
- `LLM_CONSISTENCY_TEMPERATURE` — temperature for consistency samples (currently 0.3)

### Extractor Mode
- `LLM_EXTRACTOR_TEMPERATURE` — temperature for extraction (currently 0.1)
- `LLM_EXTRACTOR_MAX_TOKENS` — max response tokens (currently 4096)
- `LLM_EXTRACTOR_BATCH_SIZE` — clauses per LLM call (currently 20)
- `LLM_EXTRACTOR_CONFIDENCE` — confidence assigned to LLM-extracted nodes (currently 0.7)
- `LLM_EXTRACTOR_SYSTEM` — system prompt for extraction

## Research Priorities

_Edit this section to steer the agent's focus._

1. **Beat the rule-based baseline (0.9635)** — the LLM should add value over rules alone
2. **Focus on type accuracy** — that's where rules hit their ceiling (89%). The LLM should classify REQUIREMENT vs CONSTRAINT vs INVARIANT better than keyword matching.
3. **Try normalizer mode first** — it preserves rule-based extraction (proven recall) and only uses LLM to polish statements. Lower risk, lower API cost.
4. **Try extractor mode second** — if normalizer can't beat baseline, try full LLM extraction. Higher risk but potentially higher reward.
5. **System prompt engineering** — the biggest lever. Try:
   - More specific type classification rules with examples
   - Few-shot examples in the system prompt
   - Domain-specific guidance (spec language patterns)
6. **Self-consistency** — try k=3 or k=5 to see if multiple samples improve stability

## Strategy Tips

- Normalizer mode costs ~1 API call per non-CONTEXT node (~15-25 per spec, ~200 total)
- Extractor mode costs ~1 API call per batch of 20 clauses (~1-2 per spec, ~12-24 total)
- Start with normalizer mode (cheaper, safer) before trying extractor
- System prompt changes are the highest-leverage parameter
- Temperature 0 is most deterministic but may miss nuance; try 0.1-0.2
- Self-consistency k>1 is expensive (k × normal cost) — try k=3 first
- Each run takes ~30-60 seconds due to API calls — be patient

## Cost Awareness

Each experiment run makes real API calls. Approximate costs:
- Normalizer mode: ~$0.02-0.05 per run (small prompts, many calls)
- Extractor mode: ~$0.05-0.15 per run (large prompts, fewer calls)
- Self-consistency k=3: ~3x normalizer cost

Keep experiments focused. Don't run more than 20-30 experiments per session.

+2

experiments/results-llm.tsv

··· 1 + timestamp score recall type_acc coverage d_rate hier orphan mode model k 2 + 2026-03-26T23:55:33.960Z 0.8599 70.8 82.8 99.7 5.5 99.4 6.6 normalizer claude-sonnet-4-20250514 1

+13 -35

src/canonicalizer-llm.ts

··· 15 15 import { sha256 } from './semhash.js'; 16 16 import { extractCandidates } from './canonicalizer.js'; 17 17 import { resolveGraph } from './resolution.js'; 18 - 19 - // ─── LLM-as-Normalizer ────────────────────────────────────────────────────── 20 - 21 - const NORMALIZER_SYSTEM = `You are a requirements engineer. Rewrite the given statement in canonical form. 22 - Rules: one clear sentence, present tense, active voice, no pronouns, no ambiguity. 23 - Output ONLY a JSON object: {"statement": "..."} 24 - No markdown, no explanation.`; 18 + import { CONFIG } from './experiment-config.js'; 25 19 26 20 export interface LLMCanonOptions { 27 21 /** Enable self-consistency with k samples (default: 1 = no self-consistency) */ ··· 72 66 if (k <= 1) { 73 67 // Single-shot normalization 74 68 const response = await llm.generate(prompt, { 75 - system: NORMALIZER_SYSTEM, 76 - temperature: 0, 77 - maxTokens: 150, 69 + system: CONFIG.LLM_NORMALIZER_SYSTEM, 70 + temperature: CONFIG.LLM_NORMALIZER_TEMPERATURE, 71 + maxTokens: CONFIG.LLM_NORMALIZER_MAX_TOKENS, 78 72 }); 79 73 const normalized = parseNormalizerResponse(response); 80 74 if (normalized && normalized.length > 5) { ··· 88 82 const samples: string[] = []; 89 83 for (let i = 0; i < k; i++) { 90 84 const response = await llm.generate(prompt, { 91 - system: NORMALIZER_SYSTEM, 92 - temperature: i === 0 ? 0 : 0.3, // first sample at temp=0, rest at 0.3 93 - maxTokens: 150, 85 + system: CONFIG.LLM_NORMALIZER_SYSTEM, 86 + temperature: i === 0 ? CONFIG.LLM_NORMALIZER_TEMPERATURE : CONFIG.LLM_CONSISTENCY_TEMPERATURE, 87 + maxTokens: CONFIG.LLM_NORMALIZER_MAX_TOKENS, 94 88 }); 95 89 const parsed = parseNormalizerResponse(response); 96 90 if (parsed && parsed.length > 5) samples.push(parsed); ··· 179 173 180 174 // ─── LLM-as-Extractor (behind --llm-extract flag) ─────────────────────────── 181 175 182 - const EXTRACT_SYSTEM = `You are a requirements engineer extracting structured canonical nodes from specification text. 
183 - 184 - For each meaningful statement, extract a JSON object with: 185 - - type: one of REQUIREMENT, CONSTRAINT, INVARIANT, DEFINITION, CONTEXT 186 - - statement: the normalized canonical statement (clear, unambiguous, one idea) 187 - - tags: array of key domain terms (lowercase, no stop words) 188 - - source_section: the section heading this was extracted from 189 - 190 - Rules: 191 - - REQUIREMENT: something the system must do (capabilities, features) 192 - - CONSTRAINT: something the system must NOT do, or limits/bounds 193 - - INVARIANT: something that must ALWAYS or NEVER hold 194 - - DEFINITION: defines a term or concept 195 - - CONTEXT: framing text that gives meaning but isn't actionable alone 196 - 197 - Output a JSON array. No markdown fences, no explanation. 198 - Every node MUST include source_section.`; 176 + // Extractor system prompt loaded from CONFIG 199 177 200 178 interface LLMExtractedNode { 201 179 type: string; ··· 230 208 clauses: Clause[], 231 209 llm: LLMProvider, 232 210 ): Promise<CandidateNode[]> { 233 - const BATCH_SIZE = 20; 211 + const BATCH_SIZE = CONFIG.LLM_EXTRACTOR_BATCH_SIZE; 234 212 const allCandidates: CandidateNode[] = []; 235 213 236 214 for (let i = 0; i < clauses.length; i += BATCH_SIZE) { ··· 238 216 const prompt = buildExtractPrompt(batch); 239 217 240 218 const response = await llm.generate(prompt, { 241 - system: EXTRACT_SYSTEM, 242 - temperature: 0.1, 243 - maxTokens: 4096, 219 + system: CONFIG.LLM_EXTRACTOR_SYSTEM, 220 + temperature: CONFIG.LLM_EXTRACTOR_TEMPERATURE, 221 + maxTokens: CONFIG.LLM_EXTRACTOR_MAX_TOKENS, 244 222 }); 245 223 246 224 const parsed = parseLLMExtractResponse(response); ··· 262 240 candidate_id: candidateId, 263 241 type, 264 242 statement: item.statement, 265 - confidence: 0.7, // LLM extraction gets moderate confidence 243 + confidence: CONFIG.LLM_EXTRACTOR_CONFIDENCE, 266 244 source_clause_ids: [sourceClause.clause_id], 267 245 tags: item.tags || [], 268 246 sentence_index: idx,

+33

src/experiment-config.ts

··· 44 44 // ─── warm-hasher.ts ─────────────────────────────────────────────────────── 45 45 WARM_MIN_CONFIDENCE: 0.3, 46 46 47 + // ─── canonicalizer-llm.ts ───────────────────────────────────────────────── 48 + LLM_MODE: 'normalizer' as 'normalizer' | 'extractor', 49 + LLM_MODEL: 'claude-sonnet-4-20250514', 50 + LLM_NORMALIZER_TEMPERATURE: 0, 51 + LLM_NORMALIZER_MAX_TOKENS: 150, 52 + LLM_NORMALIZER_SYSTEM: `You are a requirements engineer. Rewrite the given statement in canonical form. 53 + Rules: one clear sentence, present tense, active voice, no pronouns, no ambiguity. 54 + Output ONLY a JSON object: {"statement": "..."} 55 + No markdown, no explanation.`, 56 + LLM_SELF_CONSISTENCY_K: 1, 57 + LLM_CONSISTENCY_TEMPERATURE: 0.3, 58 + LLM_EXTRACTOR_TEMPERATURE: 0.1, 59 + LLM_EXTRACTOR_MAX_TOKENS: 4096, 60 + LLM_EXTRACTOR_BATCH_SIZE: 20, 61 + LLM_EXTRACTOR_CONFIDENCE: 0.7, 62 + LLM_EXTRACTOR_SYSTEM: `You are a requirements engineer extracting structured canonical nodes from specification text. 63 + 64 + For each meaningful statement, extract a JSON object with: 65 + - type: one of REQUIREMENT, CONSTRAINT, INVARIANT, DEFINITION, CONTEXT 66 + - statement: the normalized canonical statement (clear, unambiguous, one idea) 67 + - tags: array of key domain terms (lowercase, no stop words) 68 + - source_section: the section heading this was extracted from 69 + 70 + Rules: 71 + - REQUIREMENT: something the system must do (capabilities, features) 72 + - CONSTRAINT: something the system must NOT do, or limits/bounds 73 + - INVARIANT: something that must ALWAYS or NEVER hold 74 + - DEFINITION: defines a term or concept 75 + - CONTEXT: framing text that gives meaning but isn't actionable alone 76 + 77 + Output a JSON array. No markdown fences, no explanation. 78 + Every node MUST include source_section.`, 79 + 47 80 // ─── classifier.ts ──────────────────────────────────────────────────────── 48 81 CLASS_A_NORM_DIFF: 0.1, 49 82 CLASS_A_TERM_DELTA: 0.2,

Configure Feed

Configure Feed