Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/

feat: add autoresearch-style experiment loop for canonicalization tuning

Externalize 35 hardcoded thresholds into src/experiment-config.ts so an
AI agent can autonomously search the parameter space. Includes eval harness
(experiments/eval-runner.ts) with composite scoring, TSV logging, and
agent instruction manual (experiments/program.md).

Baseline score: 0.8785 (recall 100%, typeAcc 94%, coverage 96%, D-rate 66%)
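As a quick sanity check, the baseline score can be reproduced (up to rounding) from the composite formula and the averages logged in `experiments/results.tsv`; this snippet is illustrative, not part of the commit:

```ts
// Baseline vs. the composite formula, using the rounded averages from
// experiments/results.tsv (hence ≈, not =):
const score =
  0.30 * 1.000 +        // recall 100.0%
  0.25 * 0.944 +        // type accuracy 94.4%
  0.20 * (95.5 / 100) + // coverage 95.5
  0.15 * (1 - 0.658) +  // D-rate 65.8%
  0.10 * 1.000;         // hierarchy coverage 100.0%
// → 0.8783 ≈ 0.8785 (the logged score uses unrounded averages)
```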

+405 -46
+218
experiments/eval-runner.ts
```ts
#!/usr/bin/env npx tsx
/**
 * Evaluation Runner — Fixed harness for the autoresearch experiment loop.
 *
 * DO NOT MODIFY THIS FILE during experiments.
 * The agent modifies only src/experiment-config.ts.
 *
 * Usage: npx tsx experiments/eval-runner.ts [--json] [--no-log]
 */

import { readFileSync, appendFileSync, existsSync } from 'node:fs';
import { resolve } from 'node:path';
import { parseSpec } from '../src/spec-parser.js';
import { extractCanonicalNodes, extractCandidates } from '../src/canonicalizer.js';
import { GOLD_SPECS, type GoldSpec } from '../tests/eval/gold-standard.js';
import type { CanonicalNode } from '../src/models/canonical.js';
import { CONFIG } from '../src/experiment-config.js';

const ROOT = resolve(import.meta.dirname, '..');
const RESULTS_FILE = resolve(ROOT, 'experiments/results.tsv');

// ─── Metrics computation (same as eval test, but standalone) ────────────────

function loadAndExtract(spec: GoldSpec) {
  const text = readFileSync(resolve(ROOT, spec.path), 'utf8');
  const clauses = parseSpec(text, spec.docId);
  const { candidates, coverage } = extractCandidates(clauses);
  const nodes = extractCanonicalNodes(clauses);
  const avgCoverage = coverage.length > 0
    ? coverage.reduce((s, c) => s + c.coverage_pct, 0) / coverage.length
    : 0;
  return { clauses, candidates, coverage, nodes, avgCoverage };
}

function findNode(nodes: CanonicalNode[], substringMatch: string): CanonicalNode | undefined {
  const lower = substringMatch.toLowerCase();
  return nodes.find(n => n.statement.toLowerCase().includes(lower));
}

interface SpecMetrics {
  recall: number;
  typeAccuracy: number;
  coverage: number;
  linkPrecision: number;
  resDRate: number;
  orphanRate: number;
  hierCoverage: number;
  maxDegree: number;
  nodeCount: number;
}

function computeMetrics(spec: GoldSpec, nodes: CanonicalNode[], avgCoverage: number): SpecMetrics {
  let found = 0;
  let typeCorrect = 0;
  for (const expected of spec.expectedNodes) {
    const node = findNode(nodes, expected.statement);
    if (node) {
      found++;
      if (node.type === expected.type) typeCorrect++;
    }
  }
  const recall = spec.expectedNodes.length > 0 ? found / spec.expectedNodes.length : 1;
  const typeAccuracy = found > 0 ? typeCorrect / found : 0;

  // Fraction of expected gold edges present with the correct edge type.
  let edgesFound = 0;
  for (const expected of spec.expectedEdges) {
    const from = findNode(nodes, expected.from);
    const to = findNode(nodes, expected.to);
    if (from && to) {
      const isLinked = from.linked_canon_ids.includes(to.canon_id) || to.linked_canon_ids.includes(from.canon_id);
      if (isLinked) {
        const edgeType = from.link_types?.[to.canon_id] || to.link_types?.[from.canon_id];
        if (edgeType === expected.type) edgesFound++;
      }
    }
  }
  const linkPrecision = spec.expectedEdges.length > 0 ? edgesFound / spec.expectedEdges.length : 1;

  let totalEdges = 0;
  let relatesToEdges = 0;
  for (const n of nodes) {
    for (const [, et] of Object.entries(n.link_types ?? {})) {
      totalEdges++;
      if (et === 'relates_to') relatesToEdges++;
    }
  }
  const resDRate = totalEdges > 0 ? relatesToEdges / totalEdges : 0;

  const orphanCount = nodes.filter(n => n.linked_canon_ids.length === 0).length;
  const orphanRate = nodes.length > 0 ? orphanCount / nodes.length : 0;

  const nonContext = nodes.filter(n => n.type !== 'CONTEXT');
  const withParent = nonContext.filter(n => n.parent_canon_id).length;
  const hierCoverage = nonContext.length > 0 ? withParent / nonContext.length : 0;

  const maxDegree = Math.max(0, ...nodes.map(n => n.linked_canon_ids.length));

  return { recall, typeAccuracy, coverage: avgCoverage, linkPrecision, resDRate, orphanRate, hierCoverage, maxDegree, nodeCount: nodes.length };
}

// ─── Composite score ──────────────────────────────────────────────────────────

function compositeScore(avgRecall: number, avgTypeAcc: number, avgCoverage: number, avgDRate: number, avgHier: number): number {
  return (
    0.30 * avgRecall +
    0.25 * avgTypeAcc +
    0.20 * (avgCoverage / 100) +
    0.15 * (1 - avgDRate) +
    0.10 * avgHier
  );
}

// ─── Main ─────────────────────────────────────────────────────────────────────

const args = process.argv.slice(2);
const jsonMode = args.includes('--json');
const noLog = args.includes('--no-log');

const allMetrics: { name: string; metrics: SpecMetrics }[] = [];

for (const spec of GOLD_SPECS) {
  try {
    const { nodes, avgCoverage } = loadAndExtract(spec);
    const metrics = computeMetrics(spec, nodes, avgCoverage);
    allMetrics.push({ name: spec.name, metrics });
  } catch (e) {
    console.error(`FAILED: ${spec.name} — ${e}`);
    allMetrics.push({
      name: spec.name,
      metrics: { recall: 0, typeAccuracy: 0, coverage: 0, linkPrecision: 0, resDRate: 1, orphanRate: 1, hierCoverage: 0, maxDegree: 0, nodeCount: 0 },
    });
  }
}

// Aggregates
const count = allMetrics.length;
const avgRecall = allMetrics.reduce((s, m) => s + m.metrics.recall, 0) / count;
const avgTypeAcc = allMetrics.reduce((s, m) => s + m.metrics.typeAccuracy, 0) / count;
const avgCoverage = allMetrics.reduce((s, m) => s + m.metrics.coverage, 0) / count;
const avgDRate = allMetrics.reduce((s, m) => s + m.metrics.resDRate, 0) / count;
const avgHier = allMetrics.reduce((s, m) => s + m.metrics.hierCoverage, 0) / count;
const avgOrphan = allMetrics.reduce((s, m) => s + m.metrics.orphanRate, 0) / count;
const score = compositeScore(avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier);

if (jsonMode) {
  console.log(JSON.stringify({ score, avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier, avgOrphan, perSpec: allMetrics }, null, 2));
} else {
  // ASCII table
  console.log('\n╔══════════════════════════════════════════════════════════════════════╗');
  console.log('║              PHOENIX CANONICALIZATION — EXPERIMENT EVAL              ║');
  console.log('╠══════════════════════════════════════════════════════════════════════╣');
  console.log('║ Spec              │ Recall │ TypeAcc │ Cover │ ResD% │ Hier% │ Nodes ║');
  console.log('╠═══════════════════╪════════╪═════════╪═══════╪═══════╪═══════╪═══════╣');

  for (const { name, metrics: m } of allMetrics) {
    const n = name.padEnd(18);
    const recall = (m.recall * 100).toFixed(0).padStart(5) + '%';
    const type = (m.typeAccuracy * 100).toFixed(0).padStart(6) + '%';
    const cov = m.coverage.toFixed(0).padStart(4) + '%';
    const resD = (m.resDRate * 100).toFixed(0).padStart(4) + '%';
    const hier = (m.hierCoverage * 100).toFixed(0).padStart(4) + '%';
    const nodeCount = String(m.nodeCount).padStart(5);
    console.log(`║ ${n} │ ${recall} │ ${type} │ ${cov} │ ${resD} │ ${hier} │ ${nodeCount} ║`);
  }

  console.log('╠═══════════════════╪════════╪═════════╪═══════╪═══════╪═══════╪═══════╣');
  const avgR = (avgRecall * 100).toFixed(0).padStart(5) + '%';
  const avgT = (avgTypeAcc * 100).toFixed(0).padStart(6) + '%';
  const avgC = avgCoverage.toFixed(0).padStart(4) + '%';
  const avgD = (avgDRate * 100).toFixed(0).padStart(4) + '%';
  const avgH = (avgHier * 100).toFixed(0).padStart(4) + '%';
  console.log(`║ ${'AVERAGE'.padEnd(18)} │ ${avgR} │ ${avgT} │ ${avgC} │ ${avgD} │ ${avgH} │       ║`);
  console.log('╚══════════════════════════════════════════════════════════════════════╝');
  console.log(`\n  COMPOSITE SCORE: ${score.toFixed(4)}`);
  console.log(`  Formula: 0.30·recall + 0.25·typeAcc + 0.20·coverage + 0.15·(1-dRate) + 0.10·hier`);
  console.log(`\n  Targets: Recall ≥95%, TypeAcc ≥90%, Coverage ≥95%, ResD ≤20%, Hier ≥50%`);
}

// ─── Append to results.tsv ────────────────────────────────────────────────────

if (!noLog) {
  const timestamp = new Date().toISOString();
  const header = 'timestamp\tscore\trecall\ttype_acc\tcoverage\td_rate\thier\torphan\tconfig_hash';

  if (!existsSync(RESULTS_FILE)) {
    appendFileSync(RESULTS_FILE, header + '\n');
  }

  // Simple config hash for dedup detection
  const configStr = JSON.stringify(CONFIG);
  let hash = 0;
  for (let i = 0; i < configStr.length; i++) {
    hash = ((hash << 5) - hash + configStr.charCodeAt(i)) | 0;
  }
  const configHash = Math.abs(hash).toString(36);

  const row = [
    timestamp,
    score.toFixed(4),
    (avgRecall * 100).toFixed(1),
    (avgTypeAcc * 100).toFixed(1),
    avgCoverage.toFixed(1),
    (avgDRate * 100).toFixed(1),
    (avgHier * 100).toFixed(1),
    (avgOrphan * 100).toFixed(1),
    configHash,
  ].join('\t');

  appendFileSync(RESULTS_FILE, row + '\n');
  if (!jsonMode) {
    console.log(`\n  Results appended to experiments/results.tsv`);
  }
}

// Print the score as a parseable last line
if (!jsonMode) {
  console.log(`\nval_score=${score.toFixed(4)}`);
}
```
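For scripted consumers, `--json` emits a single object instead of the table. Its shape follows directly from the `JSON.stringify` call above (reusing the runner's `SpecMetrics` interface); the `EvalReport` alias here is illustrative, not part of the repo:

```ts
// Shape of `eval-runner.ts --json` output, derived from the JSON.stringify
// call in the runner. The name `EvalReport` is ours, not the commit's.
interface EvalReport {
  score: number;
  avgRecall: number;   // fraction, 0–1
  avgTypeAcc: number;  // fraction, 0–1
  avgCoverage: number; // percent, 0–100
  avgDRate: number;    // fraction, 0–1
  avgHier: number;     // fraction, 0–1
  avgOrphan: number;   // fraction, 0–1
  perSpec: { name: string; metrics: SpecMetrics }[];
}
```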
+91
experiments/program.md
````markdown
# Phoenix Canonicalization — Experiment Program

You are an autonomous research agent optimizing Phoenix's canonicalization pipeline.

## Rules

1. **Edit ONLY `src/experiment-config.ts`** — never touch other source files, tests, or this file
2. **Run `npx tsx experiments/eval-runner.ts`** after every change
3. **Parse the composite score** from the last line: `val_score=X.XXXX`
4. **If the score improved** → `git add src/experiment-config.ts && git commit -m "experiment: <description> score=X.XXXX"`
5. **If the score decreased or is unchanged** → `git checkout src/experiment-config.ts` (revert)
6. **Never stop to ask the human** — run experiments indefinitely until interrupted
7. **Never install packages** — work within existing dependencies
8. **Log your reasoning** in commit messages so the human can review your thought process

## Composite Score Formula

```
score = 0.30 * avg_recall
      + 0.25 * avg_type_accuracy
      + 0.20 * avg_coverage / 100
      + 0.15 * (1 - avg_d_rate)
      + 0.10 * avg_hier_coverage
```

Higher is better. The current baseline is recorded in `experiments/results.tsv`.

## Available Parameters (in `src/experiment-config.ts`)

### Resolution (graph construction)
- `MAX_DEGREE` — max edges per node (currently 8)
- `MIN_SHARED_TAGS` — minimum shared tags to create an edge (currently 2)
- `JACCARD_DEDUP_THRESHOLD` — similarity threshold for merging duplicates (currently 0.7)
- `FINGERPRINT_PREFIX_COUNT` — number of token prefixes for dedup bucketing (currently 8)
- `DOC_FREQ_CUTOFF` — fraction above which tags are considered trivial (currently 0.4)

### Scoring Weights (type classification)
- `CONSTRAINT_NEGATION_WEIGHT` — "must not", "forbidden", etc. (currently 4)
- `CONSTRAINT_LIMIT_WEIGHT` — "maximum", "at most", etc. (currently 3)
- `CONSTRAINT_NUMERIC_WEIGHT` — numeric bounds like "≤100" (currently 2)
- `INVARIANT_SIGNAL_WEIGHT` — "always", "never", "guaranteed" (currently 4)
- `REQUIREMENT_MODAL_WEIGHT` — "must", "shall" (currently 2)
- `REQUIREMENT_KEYWORD_WEIGHT` — "required", "needs to" (currently 2)
- `REQUIREMENT_VERB_WEIGHT` — action verbs like "implement", "validate" (currently 1)
- `DEFINITION_EXPLICIT_WEIGHT` — "is defined as", "means" (currently 4)
- `DEFINITION_COLON_WEIGHT` — "Term: definition" pattern (currently 3)
- `CONTEXT_NO_MODAL_WEIGHT` — no modal verbs present (currently 2)
- `CONTEXT_SHORT_WEIGHT` — short sentence without modals (currently 1)
- `HEADING_CONTEXT_BONUS` — bonus from heading keywords (currently 2)
- `CONSTRAINT_MUST_BONUS` — extra constraint credit for "must" (currently 1)

### Confidence & Extraction
- `MIN_CONFIDENCE` / `MAX_CONFIDENCE` — confidence bounds (currently 0.3 / 1.0)
- `DEFINITION_MAX_LENGTH` — max text length for definition detection (currently 200)
- `MIN_EXTRACTION_LENGTH` — minimum sentence length to extract (currently 5)
- `MIN_TERM_LENGTH` — minimum hyphenated compound length (currently 3)
- `MIN_WORD_LENGTH` — minimum individual word length for terms (currently 2)

### Sentence Segmentation
- `MIN_LIST_ITEM_LENGTH` — minimum list item character length (currently 3)
- `MIN_PROSE_SENTENCE_LENGTH` — minimum prose sentence length (currently 3)
- `PROSE_SPLIT_THRESHOLD` — text length below which no sentence splitting occurs (currently 80)
- `MIN_SPLIT_PART_LENGTH` — minimum part length for compound splits (currently 3)

### Warm Hashing
- `WARM_MIN_CONFIDENCE` — minimum confidence for warm hash inclusion (currently 0.3)

### Change Classification
- `CLASS_A_NORM_DIFF` / `CLASS_A_TERM_DELTA` — thresholds for trivial changes (currently 0.1 / 0.2)
- `CLASS_B_NORM_DIFF` / `CLASS_B_TERM_DELTA` — thresholds for local semantic changes (currently 0.5 / 0.5)
- `CLASS_D_HIGH_CHANGE` — threshold for uncertain classification (currently 0.7)
- `ANCHOR_MATCH_THRESHOLD` — anchor overlap to rescue from D→B (currently 0.5)

## Research Priorities

_Edit this section to steer the agent's focus._

1. **Maximize recall** — find the gold-standard nodes that aren't being extracted. This is the highest-weighted component.
2. **Improve type accuracy** — correct classification of REQUIREMENT vs CONSTRAINT vs INVARIANT.
3. **Reduce D-rate** — lower the fraction of 'relates_to' fallback edges.
4. **Tune dedup** — the Jaccard threshold (0.7) and fingerprint settings might be too aggressive or too loose.
5. **Explore scoring weight ratios** — the relative weights between type signals matter more than absolute values.

## Strategy Tips

- Change ONE parameter at a time to isolate effects
- Try both directions (increase and decrease) for each parameter
- The scoring weights interact — after finding a good single-param change, try combinations
- The resolution parameters (JACCARD_DEDUP_THRESHOLD, DOC_FREQ_CUTOFF) affect graph structure globally
- Small changes to MIN_EXTRACTION_LENGTH or PROSE_SPLIT_THRESHOLD can change which sentences get extracted at all
- Watch for overfitting: if one spec improves dramatically but others drop, the change isn't generalizable
````
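A minimal sketch of one iteration of the loop defined by rules 2–5 above; the helper names (`evalScore`, `experimentStep`) are illustrative, not part of the commit, and it assumes execution from the repo root:

```ts
import { execSync } from 'node:child_process';

// Rule 2/3: run the fixed harness, parse `val_score=X.XXXX` from its last line.
function evalScore(): number {
  const out = execSync('npx tsx experiments/eval-runner.ts', { encoding: 'utf8' });
  const m = out.trim().match(/val_score=(\d+\.\d+)\s*$/);
  if (!m) throw new Error('no val_score in eval-runner output');
  return Number(m[1]);
}

// Rules 4/5: commit on improvement, revert otherwise. Returns the new best.
function experimentStep(description: string, bestScore: number): number {
  const score = evalScore();
  if (score > bestScore) {
    execSync('git add src/experiment-config.ts');
    execSync(`git commit -m "experiment: ${description} score=${score.toFixed(4)}"`);
    return score;
  }
  execSync('git checkout src/experiment-config.ts'); // rule 5: revert
  return bestScore;
}
```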
+2
experiments/results.tsv
```
timestamp	score	recall	type_acc	coverage	d_rate	hier	orphan	config_hash
2026-03-26T21:15:29.707Z	0.8785	100.0	94.4	95.5	65.8	100.0	46.0	bgggha
```
+20 -19
src/canonicalizer.ts
```diff
···
 import { normalizeText } from './normalizer.js';
 import { segmentSentences } from './sentence-segmenter.js';
 import { resolveGraph } from './resolution.js';
+import { CONFIG } from './experiment-config.js';

 // ─── Domain term whitelist (short tokens to keep) ────────────────────────────
···
   // ── Constraint signals ──
   if (/\b(?:must not|shall not|may not|cannot|can't|disallowed|forbidden|prohibited)\b/i.test(text)) {
-    scores[CanonicalType.CONSTRAINT] += 4;
+    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NEGATION_WEIGHT;
   }
   if (/\b(?:limited to|maximum|minimum|at most|at least|no more than|no fewer than|up to|ceiling|floor)\b/i.test(text)) {
-    scores[CanonicalType.CONSTRAINT] += 3;
+    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_LIMIT_WEIGHT;
   }
   // Numeric bounds: "5 per minute", "≤ 100", "between 1 and 10"
   if (/\b\d+\s*(?:per|\/)\s*\w+\b/i.test(text) || /[≤≥<>]\s*\d+/.test(text)) {
-    scores[CanonicalType.CONSTRAINT] += 2;
+    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NUMERIC_WEIGHT;
   }

   // ── Invariant signals ──
   if (/\b(?:always|never|at all times|regardless|invariant|guaranteed|must remain|must always|must never)\b/i.test(text)) {
-    scores[CanonicalType.INVARIANT] += 4;
+    scores[CanonicalType.INVARIANT] += CONFIG.INVARIANT_SIGNAL_WEIGHT;
   }

   // ── Requirement signals ──
   if (/\b(?:must|shall)\b/i.test(text) && !/\b(?:must not|shall not|must always|must never|must remain)\b/i.test(text)) {
-    scores[CanonicalType.REQUIREMENT] += 2;
+    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_MODAL_WEIGHT;
   }
   if (/\b(?:required|requires?|needs? to|has to|will)\b/i.test(text)) {
-    scores[CanonicalType.REQUIREMENT] += 2;
+    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_KEYWORD_WEIGHT;
   }
   if (/\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate)\b/i.test(text)) {
-    scores[CanonicalType.REQUIREMENT] += 1;
+    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_VERB_WEIGHT;
   }

   // ── Definition signals ──
-  if (/\b(?:is defined as|means|refers to|is a|is an)\b/i.test(text) && text.length < 200) {
-    scores[CanonicalType.DEFINITION] += 4;
+  if (/\b(?:is defined as|means|refers to|is a|is an)\b/i.test(text) && text.length < CONFIG.DEFINITION_MAX_LENGTH) {
+    scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_EXPLICIT_WEIGHT;
   }
   // Colon pattern "Term: definition text" but not enumerations
   if (/^[A-Z][a-zA-Z\s]{2,30}:\s+[A-Z]/.test(text) && !/[:,]\s*$/.test(text)) {
-    scores[CanonicalType.DEFINITION] += 3;
+    scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_COLON_WEIGHT;
   }

   // ── Context signals (no actionable keywords) ──
   if (!hasAnyModal(lower) && !hasAnyKeyword(lower)) {
-    scores[CanonicalType.CONTEXT] += 2;
+    scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_NO_MODAL_WEIGHT;
   }
   // Short sentence without verb-like keywords
   if (text.split(/\s+/).length < 8 && !hasAnyModal(lower)) {
-    scores[CanonicalType.CONTEXT] += 1;
+    scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_SHORT_WEIGHT;
   }

   // ── Heading context bonus ──
   if (headingContext) {
-    scores[headingContext] += 2;
+    scores[headingContext] += CONFIG.HEADING_CONTEXT_BONUS;
   }

   // ── Also give constraint "must" credit since "must" appears in constraints too ──
   if (/\b(?:must|shall)\b/i.test(text)) {
-    scores[CanonicalType.CONSTRAINT] += 1;
+    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_MUST_BONUS;
   }

   // Pick winner
···
   // If nothing scored above 0, it's CONTEXT
   if (winScore === 0) {
-    return { type: CanonicalType.CONTEXT, confidence: 0.3 };
+    return { type: CanonicalType.CONTEXT, confidence: CONFIG.MIN_CONFIDENCE };
   }

-  const confidence = Math.max(0.3, Math.min(1.0, (winScore - runnerUp) / Math.max(winScore, 1)));
+  const confidence = Math.max(CONFIG.MIN_CONFIDENCE, Math.min(CONFIG.MAX_CONFIDENCE, (winScore - runnerUp) / Math.max(winScore, 1)));
   return { type: winType, confidence };
 }
···
   for (const sentence of sentences) {
     const content = sentence.text.trim();
-    if (!content || content.length < 5) {
+    if (!content || content.length < CONFIG.MIN_EXTRACTION_LENGTH) {
       uncovered.push({ text: content, reason: 'too_short' });
       continue;
     }
···
   // Add hyphenated compounds
   for (const h of hyphenated) {
-    if (h.length >= 3) terms.add(h);
+    if (h.length >= CONFIG.MIN_TERM_LENGTH) terms.add(h);
   }

   // Add individual words
···
       continue;
     }
     // Keep words > 2 chars
-    if (w.length > 2 && !w.includes('-')) {
+    if (w.length > CONFIG.MIN_WORD_LENGTH && !w.includes('-')) {
       terms.add(w);
     }
   }
```
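With the weights externalized, the confidence clamp can be traced by hand. A sketch under default CONFIG values, for the hypothetical input "The service must validate all tokens", which fires the requirement modal and verb signals plus the constraint "must" bonus:

```ts
import { CONFIG } from './experiment-config.js';

// Hypothetical input: "The service must validate all tokens"
//   REQUIREMENT = REQUIREMENT_MODAL_WEIGHT (2) + REQUIREMENT_VERB_WEIGHT (1) = 3
//   CONSTRAINT  = CONSTRAINT_MUST_BONUS (1)                                  = 1
const winScore = CONFIG.REQUIREMENT_MODAL_WEIGHT + CONFIG.REQUIREMENT_VERB_WEIGHT; // 3
const runnerUp = CONFIG.CONSTRAINT_MUST_BONUS;                                     // 1
const confidence = Math.max(
  CONFIG.MIN_CONFIDENCE,
  Math.min(CONFIG.MAX_CONFIDENCE, (winScore - runnerUp) / Math.max(winScore, 1)),
); // (3 - 1) / 3 ≈ 0.67 → REQUIREMENT with confidence ≈ 0.67
```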
+5 -4
src/classifier.ts
```diff
···
 import type { ClassificationSignals, ChangeClassification } from './models/classification.js';
 import { ChangeClass } from './models/classification.js';
 import { extractTerms } from './canonicalizer.js';
+import { CONFIG } from './experiment-config.js';

 /**
  * Classify a single clause diff.
···
   }

   // Compute confidence and classify
-  if (normDiff < 0.1 && termDelta < 0.2) {
+  if (normDiff < CONFIG.CLASS_A_NORM_DIFF && termDelta < CONFIG.CLASS_A_TERM_DELTA) {
     // Very small change, high confidence it's trivial
     return {
       change_class: ChangeClass.A,
···
   }

   // Local semantic change
-  if (normDiff < 0.5 && termDelta < 0.5) {
+  if (normDiff < CONFIG.CLASS_B_NORM_DIFF && termDelta < CONFIG.CLASS_B_TERM_DELTA) {
     return {
       change_class: ChangeClass.B,
       confidence: 0.8,
···
   }

   // High uncertainty — but check anchor overlap first
-  if (normDiff > 0.7 || termDelta > 0.7) {
+  if (normDiff > CONFIG.CLASS_D_HIGH_CHANGE || termDelta > CONFIG.CLASS_D_HIGH_CHANGE) {
     // If anchors match, the concepts are the same despite heavy rewording → B not D
-    if (anchorMatch > 0.5) {
+    if (anchorMatch > CONFIG.ANCHOR_MATCH_THRESHOLD) {
       return {
         change_class: ChangeClass.B,
         confidence: 0.65,
```
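The externalized thresholds partition diffs into change classes roughly as follows. This condensed sketch is illustrative only; the real classifier also emits confidence and signals, and the exact fallthrough for the middle band isn't visible in this hunk:

```ts
import { CONFIG } from './experiment-config.js';

// Condensed view of the cascade above, with default thresholds.
function changeClassOf(normDiff: number, termDelta: number, anchorMatch: number): 'A' | 'B' | 'D' {
  // Both tiny → trivial (e.g. normDiff 0.05, termDelta 0.15 → 'A')
  if (normDiff < CONFIG.CLASS_A_NORM_DIFF && termDelta < CONFIG.CLASS_A_TERM_DELTA) return 'A';
  // Both moderate → local semantic (e.g. 0.3 / 0.4 → 'B')
  if (normDiff < CONFIG.CLASS_B_NORM_DIFF && termDelta < CONFIG.CLASS_B_TERM_DELTA) return 'B';
  // Heavy change → uncertain, unless shared anchors rescue it to 'B'
  if (normDiff > CONFIG.CLASS_D_HIGH_CHANGE || termDelta > CONFIG.CLASS_D_HIGH_CHANGE) {
    return anchorMatch > CONFIG.ANCHOR_MATCH_THRESHOLD ? 'B' : 'D';
  }
  return 'B'; // middle band (0.5–0.7): assumed here; not shown in this hunk
}
```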
+53
src/experiment-config.ts
```ts
/**
 * Experiment Configuration — Single source of truth for all tunable parameters.
 *
 * The AI agent edits ONLY this file during experiment loops.
 * Default values match the original hardcoded constants exactly.
 */

export const CONFIG = {
  // ─── resolution.ts ────────────────────────────────────────────────────────
  MAX_DEGREE: 8,
  MIN_SHARED_TAGS: 2,
  JACCARD_DEDUP_THRESHOLD: 0.7,
  FINGERPRINT_PREFIX_COUNT: 8,
  DOC_FREQ_CUTOFF: 0.4,

  // ─── canonicalizer.ts — scoring weights ───────────────────────────────────
  CONSTRAINT_NEGATION_WEIGHT: 4,
  CONSTRAINT_LIMIT_WEIGHT: 3,
  CONSTRAINT_NUMERIC_WEIGHT: 2,
  INVARIANT_SIGNAL_WEIGHT: 4,
  REQUIREMENT_MODAL_WEIGHT: 2,
  REQUIREMENT_KEYWORD_WEIGHT: 2,
  REQUIREMENT_VERB_WEIGHT: 1,
  DEFINITION_EXPLICIT_WEIGHT: 4,
  DEFINITION_COLON_WEIGHT: 3,
  CONTEXT_NO_MODAL_WEIGHT: 2,
  CONTEXT_SHORT_WEIGHT: 1,
  HEADING_CONTEXT_BONUS: 2,
  CONSTRAINT_MUST_BONUS: 1,
  MIN_CONFIDENCE: 0.3,
  MAX_CONFIDENCE: 1.0,
  DEFINITION_MAX_LENGTH: 200,
  MIN_EXTRACTION_LENGTH: 5,
  MIN_TERM_LENGTH: 3,
  MIN_WORD_LENGTH: 2,

  // ─── sentence-segmenter.ts ────────────────────────────────────────────────
  MIN_LIST_ITEM_LENGTH: 3,
  MIN_PROSE_SENTENCE_LENGTH: 3,
  PROSE_SPLIT_THRESHOLD: 80,
  MIN_SPLIT_PART_LENGTH: 3,

  // ─── warm-hasher.ts ───────────────────────────────────────────────────────
  WARM_MIN_CONFIDENCE: 0.3,

  // ─── classifier.ts ────────────────────────────────────────────────────────
  CLASS_A_NORM_DIFF: 0.1,
  CLASS_A_TERM_DELTA: 0.2,
  CLASS_B_NORM_DIFF: 0.5,
  CLASS_B_TERM_DELTA: 0.5,
  CLASS_D_HIGH_CHANGE: 0.7,
  ANCHOR_MATCH_THRESHOLD: 0.5,
};
```
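Under program.md's rules, a single experiment is a one-line edit to this file, e.g. this hypothetical nudge to the dedup threshold (flagged in the Research Priorities as possibly too aggressive), followed by a rerun of the eval runner and a commit or revert depending on the score:

```diff
-  JACCARD_DEDUP_THRESHOLD: 0.7,
+  JACCARD_DEDUP_THRESHOLD: 0.65,
```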
+7 -14
src/resolution.ts
```diff
···
 import type { EdgeType } from './models/canonical.js';
 import { CanonicalType } from './models/canonical.js';
 import { sha256 } from './semhash.js';
-
-/** Maximum outgoing edges per node (excluding 'duplicates') */
-const MAX_DEGREE = 8;
-
-/** Minimum shared tags for a link (at least 1 must be non-trivial) */
-const MIN_SHARED_TAGS = 2;
+import { CONFIG } from './experiment-config.js';

 // ─── Main entry point ─────────────────────────────────────────────────────────
···
     for (let j = i + 1; j < group.length; j++) {
       if (used.has(j)) continue;
       const sim = tokenJaccard(primary.statement, group[j].statement);
-      if (sim > 0.7 && areTypesCompatible(primary.type, group[j].type)) {
+      if (sim > CONFIG.JACCARD_DEDUP_THRESHOLD && areTypesCompatible(primary.type, group[j].type)) {
         // Merge: keep higher confidence node as primary
         used.add(j);
         for (const s of group[j].source_clause_ids) mergedSources.add(s);
···
   // Coarse fingerprint: sorted 3-char token prefixes
   const tokens = statement.toLowerCase().split(/\s+/).filter(t => t.length > 2);
   const prefixes = tokens.map(t => t.slice(0, 3)).sort();
-  // Use first 8 prefixes as bucket key
-  return prefixes.slice(0, 8).join('|');
+  return prefixes.slice(0, CONFIG.FINGERPRINT_PREFIX_COUNT).join('|');
 }

 function tokenJaccard(a: string, b: string): number {
···
   // IDF for a tag in 40% of N nodes ≈ log(N / (0.4*N)) + 1 ≈ log(2.5) + 1 ≈ 1.92
   // We use a hard threshold based on document frequency, not percentile.
   const N = nodes.length;
-  const maxDF = Math.max(2, Math.floor(N * 0.4)); // tags in >40% of nodes are trivial
+  const maxDF = Math.max(2, Math.floor(N * CONFIG.DOC_FREQ_CUTOFF));
   const idfThreshold = Math.log((N + 1) / (maxDF + 1)) + 1;

   // Generate candidate pairs from inverted index
···
   // Create edges for pairs with enough shared tags (at least MIN_SHARED_TAGS total,
   // and at least 1 non-trivial tag)
   for (const { i, j, sharedNonTrivial, sharedTags } of pairScores.values()) {
-    if (sharedTags.length < MIN_SHARED_TAGS || sharedNonTrivial < 1) continue;
+    if (sharedTags.length < CONFIG.MIN_SHARED_TAGS || sharedNonTrivial < 1) continue;

     const nodeA = nodes[i];
     const nodeB = nodes[j];
···
       id => node.link_types?.[id] !== 'duplicates'
     );

-    if (edges.length <= MAX_DEGREE) continue;
+    if (edges.length <= CONFIG.MAX_DEGREE) continue;

     // Score each edge by shared tag IDF
     const nodeTagSet = new Set(node.tags);
···
       edgeScores.push({ id, score });
     }

-    // Keep top MAX_DEGREE edges by score
     edgeScores.sort((a, b) => b.score - a.score);
-    const keep = new Set(edgeScores.slice(0, MAX_DEGREE).map(e => e.id));
+    const keep = new Set(edgeScores.slice(0, CONFIG.MAX_DEGREE).map(e => e.id));

     // Also keep all 'duplicates' edges
     for (const id of node.linked_canon_ids) {
```
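A quick numeric check of the IDF cutoff under the default `DOC_FREQ_CUTOFF`, using a hypothetical graph size; it lands close to the log(2.5) + 1 ≈ 1.92 approximation in the source comment:

```ts
// For N = 50 nodes (hypothetical) and DOC_FREQ_CUTOFF = 0.4:
const N = 50;
const maxDF = Math.max(2, Math.floor(N * 0.4));          // 20
const idfThreshold = Math.log((N + 1) / (maxDF + 1)) + 1; // ln(51/21) + 1 ≈ 1.887
// Tags shared by more than ~40% of nodes fall below this IDF and are
// treated as trivial when scoring candidate edges.
```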
+7 -5
src/sentence-segmenter.ts
```diff
···
  * - Lines with sequence indicators (→, ->) are kept atomic
  */

+import { CONFIG } from './experiment-config.js';
+
 /** A segmented sentence with its position index */
 export interface Sentence {
   text: string;
···
       proseBuffer = '';
     }
     const content = listMatch[1].trim();
-    if (content.length >= 3) {
+    if (content.length >= CONFIG.MIN_LIST_ITEM_LENGTH) {
       // Split compound modals within list items
       const subs = splitCompoundModals(content);
       for (const sub of subs) {
···
   let idx = startIdx;
   for (const s of raw) {
     const trimmed = s.trim();
-    if (trimmed.length < 3) continue;
+    if (trimmed.length < CONFIG.MIN_PROSE_SENTENCE_LENGTH) continue;
     // Split compound modals
     const subs = splitCompoundModals(trimmed);
     for (const sub of subs) {
···
  */
 function splitProseIntoSentences(text: string): string[] {
   // Don't split if it's short enough to be one sentence
-  if (text.length < 80) return [text];
+  if (text.length < CONFIG.PROSE_SPLIT_THRESHOLD) return [text];

   const results: string[] = [];
   // Split on '. ', '! ', '? ' followed by uppercase letter
···
   // Check for semicolons with modals on both sides
   const semiParts = text.split(/\s*;\s*/);
   if (semiParts.length > 1 && semiParts.every(p => hasModal(p))) {
-    return semiParts.filter(p => p.length >= 3);
+    return semiParts.filter(p => p.length >= CONFIG.MIN_SPLIT_PART_LENGTH);
   }

   // Check for " and " + modal or " and " separating complete modal clauses
···
   if (andMatch && andMatch.index !== undefined) {
     const left = text.slice(0, andMatch.index).trim();
     const right = text.slice(andMatch.index + andMatch[0].length).trim();
-    if (left.length >= 3 && right.length >= 3 && hasModal(left)) {
+    if (left.length >= CONFIG.MIN_SPLIT_PART_LENGTH && right.length >= CONFIG.MIN_SPLIT_PART_LENGTH && hasModal(left)) {
       return [left, right];
     }
   }
```
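The semicolon branch is easy to trace on a hypothetical input under the default `MIN_SPLIT_PART_LENGTH` of 3:

```ts
// Hypothetical compound requirement: both sides carry a modal ("must"),
// so the clause splits into two independent sentences.
const text = 'The server must log all requests; clients must retry on 503';
const semiParts = text.split(/\s*;\s*/);
// → ['The server must log all requests', 'clients must retry on 503']
// Both parts pass hasModal and exceed MIN_SPLIT_PART_LENGTH (3), so both
// are kept; a part shorter than the threshold would be dropped.
```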
+2 -4
src/warm-hasher.ts
```diff
···
 import type { Clause } from './models/clause.js';
 import type { CanonicalNode } from './models/canonical.js';
 import { sha256 } from './semhash.js';
-
-/** Minimum confidence for a node to be included in warm context */
-const MIN_CONFIDENCE = 0.3;
+import { CONFIG } from './experiment-config.js';

 /**
  * Compute warm context hash for a clause, incorporating canonical context.
···
   // Find canonical nodes sourced from this clause
   const relatedNodes = canonicalNodes.filter(
     n => n.source_clause_ids.includes(clause.clause_id)
-      && (n.confidence ?? 1.0) >= MIN_CONFIDENCE
+      && (n.confidence ?? 1.0) >= CONFIG.WARM_MIN_CONFIDENCE
   );

   // Collect linked canon IDs — only from typed edges (not 'relates_to')
```
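The effect of the externalized gate, traced on hypothetical confidence values (mirrors the filter above):

```ts
import { CONFIG } from './experiment-config.js';

// With WARM_MIN_CONFIDENCE = 0.3: low-confidence nodes are excluded from the
// warm hash; a node with no confidence field defaults to 1.0 and passes.
const passes = (confidence?: number) => (confidence ?? 1.0) >= CONFIG.WARM_MIN_CONFIDENCE;
passes(0.25);      // false — below the gate, excluded
passes(undefined); // true  — missing confidence defaults to 1.0
```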