feat: add architecture eval runner — end-to-end CRUD verification

+245

2 changed files

expand all

experiments

+242

experiments/eval-runner-arch.ts

#!/usr/bin/env npx tsx
/**
 * Architecture Evaluation Runner — tests whether generated apps actually work.
 *
 * Workflow:
 *   1. Clean and re-bootstrap the todo-app example
 *   2. Start the server
 *   3. Run CRUD tests via HTTP
 *   4. Score: what percentage of operations work correctly
 *   5. Log results
 *
 * Usage: npx tsx experiments/eval-runner-arch.ts [--no-log] [--skip-bootstrap]
 *
 *   --no-log          do not append a row to experiments/results-arch.tsv
 *   --skip-bootstrap  reuse the existing generated app (skip build/init/bootstrap/install)
 *
 * Exit code is 0 only when every test passes; the final stdout line is
 * `val_score=<score>` so callers can parse the result.
 */

import { execFileSync, execSync, spawn } from 'node:child_process';
import { resolve } from 'node:path';
import { appendFileSync, existsSync, rmSync } from 'node:fs';

const ROOT = resolve(import.meta.dirname, '..');
const TODO_APP = resolve(ROOT, 'examples/todo-app');
const RESULTS_FILE = resolve(ROOT, 'experiments/results-arch.tsv');
const CLI = resolve(ROOT, 'dist/cli.js');

// Single source of truth for the port the app under test listens on.
const PORT = '4567';
const BASE = `http://localhost:${PORT}`;
const START_TIMEOUT_MS = 10000;
const POLL_INTERVAL_MS = 500;

const noLog = process.argv.includes('--no-log');
const skipBootstrap = process.argv.includes('--skip-bootstrap');

// ─── Step 1: Rebuild Phoenix and re-bootstrap todo-app ──────────────────────

if (!skipBootstrap) {
  console.log('Building Phoenix...');
  execSync('npm run build', { cwd: ROOT, stdio: 'pipe' });

  console.log('Cleaning todo-app...');
  // `force: true` makes rmSync a no-op for missing paths, so no existsSync pre-check is needed.
  for (const d of ['src/generated', 'src/server.ts', 'src/app.ts', 'src/db.ts', '.phoenix', 'data']) {
    rmSync(resolve(TODO_APP, d), { recursive: true, force: true });
  }
  // Remove db files
  for (const f of ['app.db', 'todos.db', 'data.db']) {
    rmSync(resolve(TODO_APP, f), { force: true });
  }

  // execFileSync passes CLI as a single argv entry, so a checkout path containing
  // spaces or shell metacharacters cannot break (or inject into) the command.
  console.log('Initializing with sqlite-web-api...');
  execFileSync('node', [CLI, 'init', '--arch=sqlite-web-api'], { cwd: TODO_APP, stdio: 'pipe' });

  console.log('Bootstrapping (LLM generation)...');
  execFileSync('node', [CLI, 'bootstrap'], { cwd: TODO_APP, stdio: 'pipe', timeout: 300000 });

  console.log('Installing dependencies...');
  execSync('npm install', { cwd: TODO_APP, stdio: 'pipe', timeout: 60000 });
}

// ─── Step 2: Start the server ───────────────────────────────────────────────

// Clean any leftover DB (and its -shm / -wal side files) so every run starts empty.
const dbPath = resolve(TODO_APP, 'data/app.db');
for (const p of [dbPath, dbPath + '-shm', dbPath + '-wal']) {
  rmSync(p, { force: true });
}

console.log('Starting server...');
const server = spawn('npx', ['tsx', 'src/server.ts'], {
  cwd: TODO_APP,
  stdio: 'pipe',
  env: { ...process.env, PORT },
});

// Captured so a failed start can be diagnosed; printed only on failure.
let serverOutput = '';
let serverGone: string | null = null;
server.stdout.on('data', (chunk: Buffer) => { serverOutput += chunk.toString(); });
server.stderr.on('data', (chunk: Buffer) => { serverOutput += chunk.toString(); });
server.on('exit', (code, signal) => { serverGone = `exited (code=${code}, signal=${signal})`; });
// Without a listener, a spawn failure (e.g. npx not on PATH) is an unhandled 'error' event.
server.on('error', (err) => { serverGone = `failed to spawn: ${err.message}`; });

/**
 * Polls GET /health until it answers with a 2xx status.
 *
 * @throws Error when the child process dies first or START_TIMEOUT_MS elapses.
 */
async function waitForServer(): Promise<void> {
  const deadline = Date.now() + START_TIMEOUT_MS;
  while (Date.now() < deadline) {
    if (serverGone !== null) throw new Error(`Server ${serverGone}`);
    try {
      // Per-attempt timeout so one hung request cannot outlive the overall deadline.
      const res = await fetch(`${BASE}/health`, { signal: AbortSignal.timeout(POLL_INTERVAL_MS * 2) });
      if (res.ok) return;
    } catch { /* not ready yet */ }
    await new Promise<void>((done) => setTimeout(done, POLL_INTERVAL_MS));
  }
  throw new Error('Server start timeout');
}

try {
  await waitForServer();
} catch (e: unknown) {
  // Do not leave an orphaned server behind, and show why it never came up.
  server.kill();
  console.error(String(e));
  if (serverOutput) console.error(`--- server output ---\n${serverOutput}`);
  process.exit(1);
}

console.log(`Server ready on :${PORT}`);

// ─── Step 3: Run CRUD tests ────────────────────────────────────────────────

interface TestResult {
  name: string;
  pass: boolean;
  detail: string;
}

type TodoId = number | string;

const results: TestResult[] = [];
const JSON_HEADERS = { 'Content-Type': 'application/json' };

/**
 * Runs one named check and records its outcome in `results`.
 * A thrown error counts as a failure (with the error text as detail) rather than aborting the run.
 */
async function test(name: string, fn: () => Promise<boolean>): Promise<void> {
  try {
    const pass = await fn();
    results.push({ name, pass, detail: pass ? 'ok' : 'assertion failed' });
    console.log(` ${pass ? '✓' : '✗'} ${name}`);
  } catch (e: unknown) {
    results.push({ name, pass: false, detail: String(e) });
    console.log(` ✗ ${name} — ${e}`);
  }
}

/** Parses a response body and verifies it is a JSON object (not null, not an array). */
async function readObject(res: Response): Promise<Record<string, unknown>> {
  const body: unknown = await res.json();
  if (typeof body !== 'object' || body === null || Array.isArray(body)) {
    throw new Error('response body is not a JSON object');
  }
  return body as Record<string, unknown>;
}

/** Returns a usable id from a response field, or null when it cannot address a resource. */
function toId(value: unknown): TodoId | null {
  if (typeof value === 'number') return value;
  if (typeof value === 'string' && value !== '') return value;
  return null;
}

/** Sends a JSON request body with the given method. */
function sendJson(method: 'POST' | 'PATCH', path: string, payload: unknown): Promise<Response> {
  return fetch(`${BASE}${path}`, { method, headers: JSON_HEADERS, body: JSON.stringify(payload) });
}

try {
  console.log('\nRunning CRUD tests:');

  // POST /todos — create
  let createdId: TodoId | null = null;
  await test('POST /todos returns 201 with todo', async () => {
    const res = await sendJson('POST', '/todos', { title: 'Test todo' });
    if (res.status !== 201) return false;
    const body = await readObject(res);
    createdId = toId(body.id);
    return typeof body.id === 'number' && body.title === 'Test todo' && 'created_at' in body;
  });

  // POST /todos — validation
  await test('POST /todos rejects empty title with 400', async () => {
    const res = await sendJson('POST', '/todos', { title: '' });
    if (res.status !== 400) return false;
    const body = await readObject(res);
    return typeof body.error === 'string';
  });

  // GET /todos — list
  await test('GET /todos returns array with created todo', async () => {
    const res = await fetch(`${BASE}/todos`);
    if (res.status !== 200) return false;
    const body: unknown = await res.json();
    return Array.isArray(body) && body.length >= 1;
  });

  // GET /todos/:id — get one
  await test('GET /todos/:id returns the todo', async () => {
    if (createdId === null) return false;
    const res = await fetch(`${BASE}/todos/${createdId}`);
    if (res.status !== 200) return false;
    const body = await readObject(res);
    return body.title === 'Test todo';
  });

  // GET /todos/999 — 404
  await test('GET /todos/999 returns 404', async () => {
    const res = await fetch(`${BASE}/todos/999`);
    return res.status === 404;
  });

  // PATCH /todos/:id — update
  await test('PATCH /todos/:id updates completed', async () => {
    if (createdId === null) return false;
    const res = await sendJson('PATCH', `/todos/${createdId}`, { completed: 1 });
    if (res.status !== 200) return false;
    const body = await readObject(res);
    return body.completed === 1;
  });

  // PATCH /todos/:id — update title
  await test('PATCH /todos/:id updates title', async () => {
    if (createdId === null) return false;
    const res = await sendJson('PATCH', `/todos/${createdId}`, { title: 'Updated title' });
    if (res.status !== 200) return false;
    const body = await readObject(res);
    return body.title === 'Updated title';
  });

  // Create another to delete
  let deleteId: TodoId | null = null;
  await test('POST /todos creates second todo for deletion', async () => {
    const res = await sendJson('POST', '/todos', { title: 'Delete me' });
    if (res.status !== 201) return false;
    const body = await readObject(res);
    deleteId = toId(body.id);
    return true;
  });

  // DELETE /todos/:id
  await test('DELETE /todos/:id returns 204', async () => {
    if (deleteId === null) return false;
    const res = await fetch(`${BASE}/todos/${deleteId}`, { method: 'DELETE' });
    return res.status === 204;
  });

  // Verify deletion
  await test('GET /todos/:id returns 404 after delete', async () => {
    if (deleteId === null) return false;
    const res = await fetch(`${BASE}/todos/${deleteId}`);
    return res.status === 404;
  });
} finally {
  // Always stop the server, even if something above throws unexpectedly.
  server.kill();
}

// ─── Step 4: Score ──────────────────────────────────────────────────────────

const failed = results.filter((r) => !r.pass);
const total = results.length;
const passed = total - failed.length;
const score = total > 0 ? passed / total : 0;

console.log(`\n Score: ${passed}/${total} (${(score * 100).toFixed(0)}%)`);
for (const r of failed) {
  console.log(` FAIL: ${r.name} — ${r.detail}`);
}

// ─── Step 5: Log ────────────────────────────────────────────────────────────

if (!noLog) {
  const header = 'timestamp\tscore\tpassed\ttotal\tfailures';
  // Write the header only when creating the file, so later runs just append rows.
  if (!existsSync(RESULTS_FILE)) {
    appendFileSync(RESULTS_FILE, header + '\n');
  }
  const failures = failed.map((r) => r.name).join('; ') || 'none';
  const row = [new Date().toISOString(), score.toFixed(2), passed, total, failures].join('\t');
  appendFileSync(RESULTS_FILE, row + '\n');
  console.log(` Results appended to experiments/results-arch.tsv`);
}

console.log(`\nval_score=${score.toFixed(4)}`);
process.exit(score === 1 ? 0 : 1);

experiments/results-arch.tsv

··· 1 + timestamp score passed total failures 2 + 2026-03-27T05:28:30.104Z 1.00 10 10 none 3 + 2026-03-27T05:29:23.199Z 1.00 10 10 none

Configure Feed

Configure Feed