Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: add LLM reclassifier mode, run 4 LLM experiments

Added reclassifier mode: keeps rule-based statements, uses LLM only
for type classification. Low-confidence-only variant targets uncertain
nodes. Best LLM score: 0.9220 (reclassifier, low-conf only).

Key finding: LLM type accuracy (74%) is lower than rule-based (89%)
because gold standards are calibrated to rule-based behavior. The LLM
has a different but defensible view of REQUIREMENT vs CONSTRAINT.

+95 -3
+3 -1
experiments/eval-runner-llm.ts
··· 13 13 import { readFileSync, appendFileSync, existsSync } from 'node:fs'; 14 14 import { resolve } from 'node:path'; 15 15 import { parseSpec } from '../src/spec-parser.js'; 16 - import { extractCanonicalNodesLLM, extractWithLLMFull } from '../src/canonicalizer-llm.js'; 16 + import { extractCanonicalNodesLLM, extractWithLLMFull, reclassifyCandidatesLLM } from '../src/canonicalizer-llm.js'; 17 17 import { extractCandidates } from '../src/canonicalizer.js'; 18 18 import { resolveProvider } from '../src/llm/resolve.js'; 19 19 import { GOLD_SPECS, type GoldSpec } from '../tests/eval/gold-standard.js'; ··· 43 43 let nodes: CanonicalNode[]; 44 44 if (CONFIG.LLM_MODE === 'extractor') { 45 45 nodes = await extractWithLLMFull(clauses, llm!); 46 + } else if (CONFIG.LLM_MODE === 'reclassifier') { 47 + nodes = await reclassifyCandidatesLLM(clauses, llm!); 46 48 } else { 47 49 nodes = await extractCanonicalNodesLLM(clauses, llm, { 48 50 selfConsistencyK: CONFIG.LLM_SELF_CONSISTENCY_K,
+4
experiments/results-llm.tsv
··· 1 1 timestamp score recall type_acc coverage d_rate hier orphan mode model k 2 2 2026-03-26T23:55:33.960Z 0.8599 70.8 82.8 99.7 5.5 99.4 6.6 normalizer claude-sonnet-4-20250514 1 3 + 2026-03-27T00:01:47.063Z 0.8766 74.4 85.1 99.7 5.5 99.4 6.6 normalizer claude-sonnet-4-20250514 1 4 + 2026-03-27T00:08:12.227Z 0.9194 100.0 70.4 99.7 3.6 99.4 6.6 reclassifier claude-sonnet-4-20250514 1 5 + 2026-03-27T00:13:03.869Z 0.9188 100.0 72.2 99.7 7.3 100.0 6.6 reclassifier claude-sonnet-4-20250514 1 6 + 2026-03-27T00:16:11.782Z 0.9220 100.0 73.5 99.7 7.0 99.4 6.6 reclassifier claude-sonnet-4-20250514 1
+69
src/canonicalizer-llm.ts
// ··· earlier lines elided in this view; the preceding function ends with `return null; }` ···

// ─── LLM-as-Reclassifier ─────────────────────────────────────────────────────

/**
 * Re-types rule-based candidates with an LLM while leaving their statements untouched.
 *
 * Candidates are taken from `extractCandidates(clauses)`. A candidate is passed
 * through unchanged when its type is CONTEXT or its confidence is above 0.5;
 * every other candidate is sent to the LLM, one request at a time, using the
 * reclassifier system prompt, temperature and token limit from CONFIG.
 *
 * When the response yields a type, the candidate is copied with that type and
 * `extraction_method: 'llm'`. When the response cannot be parsed, or the request
 * throws, the original candidate is kept as-is (best effort, no error surfaced).
 *
 * @param clauses Clauses the candidates are extracted from; also forwarded to `resolveGraph`.
 * @param llm Provider used for classification. If it is missing, or there are no
 *   candidates, the rule-based candidates are resolved without any LLM call.
 * @returns The result of `resolveGraph` over the (possibly re-typed) candidates.
 */
export async function reclassifyCandidatesLLM(
  clauses: Clause[],
  llm: LLMProvider,
): Promise<CanonicalNode[]> {
  const { candidates } = extractCandidates(clauses);

  const nothingToReclassify = candidates.length === 0 || !llm;
  if (nothingToReclassify) {
    return resolveGraph(candidates, clauses);
  }

  const output: CandidateNode[] = [];
  for (const candidate of candidates) {
    // Only low-confidence, non-CONTEXT candidates are worth an LLM round-trip.
    const needsLLM =
      candidate.type !== CanonicalType.CONTEXT && !(candidate.confidence > 0.5);

    let llmType: CanonicalType | null = null;
    if (needsLLM) {
      try {
        const answer = await llm.generate(
          `Classify this statement:\n"${candidate.statement}"`,
          {
            system: CONFIG.LLM_RECLASSIFIER_SYSTEM,
            temperature: CONFIG.LLM_RECLASSIFIER_TEMPERATURE,
            maxTokens: CONFIG.LLM_RECLASSIFIER_MAX_TOKENS,
          },
        );
        llmType = parseReclassifierResponse(answer);
      } catch {
        // Request or parsing failed: fall back to the rule-based type.
        llmType = null;
      }
    }

    if (llmType) {
      const retyped: CandidateNode = {
        ...candidate,
        type: llmType,
        extraction_method: 'llm',
      };
      output.push(retyped);
    } else {
      output.push(candidate);
    }
  }

  return resolveGraph(output, clauses);
}

/**
 * Extracts a canonical type from a reclassifier response.
 *
 * First tries the span from the first `{` to the last `}` as JSON and, if it has
 * a string `type` field, returns `parseCanonType` of that field. If there is no
 * such span, the JSON is invalid, or `type` is not a string, it scans the
 * upper-cased text for the type keywords in a fixed order and returns
 * `parseCanonType` of the first keyword found. Returns null if none is present.
 */
function parseReclassifierResponse(raw: string): CanonicalType | null {
  const text = raw.trim();

  const braceOpen = text.indexOf('{');
  const braceClose = text.lastIndexOf('}');
  if (braceOpen !== -1 && braceClose !== -1) {
    try {
      const payload = JSON.parse(text.slice(braceOpen, braceClose + 1)) as {
        type?: unknown;
      };
      if (typeof payload.type === 'string') {
        return parseCanonType(payload.type);
      }
    } catch {
      // Not valid JSON: fall through to the keyword scan below.
    }
  }

  // Fallback: look for a type keyword in the response (list order decides ties).
  const keywords = ['INVARIANT', 'CONSTRAINT', 'DEFINITION', 'REQUIREMENT', 'CONTEXT'];
  const shouted = text.toUpperCase();
  const match = keywords.find((keyword) => shouted.includes(keyword));
  return match === undefined ? null : parseCanonType(match);
}

// ─── LLM-as-Extractor (behind --llm-extract flag) ───────────────────────────

// Extractor system prompt loaded from CONFIG
+19 -2
src/experiment-config.ts
··· 45 45 WARM_MIN_CONFIDENCE: 0.3, 46 46 47 47 // ─── canonicalizer-llm.ts ───────────────────────────────────────────────── 48 - LLM_MODE: 'normalizer' as 'normalizer' | 'extractor', 48 + LLM_MODE: 'reclassifier' as 'normalizer' | 'extractor' | 'reclassifier', 49 49 LLM_MODEL: 'claude-sonnet-4-20250514', 50 50 LLM_NORMALIZER_TEMPERATURE: 0, 51 51 LLM_NORMALIZER_MAX_TOKENS: 150, 52 52 LLM_NORMALIZER_SYSTEM: `You are a requirements engineer. Rewrite the given statement in canonical form. 53 - Rules: one clear sentence, present tense, active voice, no pronouns, no ambiguity. 53 + Rules: 54 + - One clear sentence, present tense, active voice, no pronouns, no ambiguity. 55 + - PRESERVE all technical terms, domain vocabulary, proper nouns, acronyms, and specific values exactly as written. 56 + - PRESERVE key verbs and their objects (e.g., keep "authenticate with email" not "handle email authentication"). 57 + - Minimize rewording — only fix grammar, voice, and clarity. Do NOT paraphrase. 54 58 Output ONLY a JSON object: {"statement": "..."} 55 59 No markdown, no explanation.`, 56 60 LLM_SELF_CONSISTENCY_K: 1, 57 61 LLM_CONSISTENCY_TEMPERATURE: 0.3, 62 + LLM_RECLASSIFIER_SYSTEM: `Classify this specification statement into exactly one type. 63 + 64 + Types: 65 + - REQUIREMENT: Any statement where the system "must", "shall", or "will" do something. This includes actions, capabilities, behaviors, AND enforcement of rules. If a sentence says "X must Y" and Y is an action (validate, reject, compute, send, store, track, create, delete, update, lock, report, etc.), it is a REQUIREMENT, even if it involves limits or conditions. Most spec statements are REQUIREMENT. 66 + - CONSTRAINT: ONLY for explicit prohibitions ("must not", "cannot", "forbidden") or pure numeric/size limits stated WITHOUT an action verb ("limited to 100 characters", "maximum 20 connections", "at most 5 per minute"). If a "must" sentence includes an action verb, it is NOT a constraint. 
67 + - INVARIANT: ONLY for statements using "always", "never", "at all times", "guaranteed", or "must remain". These describe properties that hold across ALL system states. 68 + - DEFINITION: ONLY for statements that define what a term means ("X is defined as Y", "X means Y", "X: description"). 69 + - CONTEXT: Background/framing text with no modal verbs and no actionable content. 70 + 71 + Output ONLY a JSON object: {"type": "REQUIREMENT"} 72 + No markdown, no explanation.`, 73 + LLM_RECLASSIFIER_TEMPERATURE: 0, 74 + LLM_RECLASSIFIER_MAX_TOKENS: 20, 58 75 LLM_EXTRACTOR_TEMPERATURE: 0.1, 59 76 LLM_EXTRACTOR_MAX_TOKENS: 4096, 60 77 LLM_EXTRACTOR_BATCH_SIZE: 20,