feat(vet): add --sandbox flag for sandboxed sub-agent safety evaluation

+238 -2

src/cmd/vet.js

// ··· (start of src/cmd/vet.js elided)
// Copyright (c) 2026 sol pbc

import { existsSync, readFileSync, writeFileSync, appendFileSync } from 'node:fs';
import { execFile } from 'node:child_process';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { requireDid } from '../lib/config.js';
import { CAP_COLLECTION, SKILL_COLLECTION } from '../lib/constants.js';
import { restoreAgent } from '../lib/oauth.js';
import { appendLog, readProjectConfig, readFollowing, vitDir } from '../lib/vit-dir.js';
import { requireNotAgent, detectCodingAgent, toSandboxName } from '../lib/agent.js';
import { resolveRef, REF_PATTERN } from '../lib/cap-ref.js';
import { isSkillRef, isValidSkillRef, nameFromSkillRef } from '../lib/skill-ref.js';
import { mark, brand, name } from '../lib/brand.js';
import { resolvePds, listRecordsFromPds, batchQuery } from '../lib/pds.js';
import { loadConfig } from '../lib/config.js';
import { jsonOk, jsonError } from '../lib/json-output.js';
import { sandboxArgs } from '../lib/sandbox.js';

const execFileAsync = promisify(execFile);

// Time limit for one sub-agent run; used for both the execFile option and the error message.
const SANDBOX_TIMEOUT_MS = 30000;

const SANDBOX_SYSTEM_PROMPT = `You are a safety reviewer. Evaluate the following software capability or skill for safety concerns.

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
  "safe": true or false,
  "concerns": ["list of specific concerns, empty if safe"],
  "summary": "one-sentence safety assessment"
}

Evaluate for: malicious code patterns, data exfiltration, unauthorized access, destructive operations, obfuscated logic, and social engineering.`;

/**
 * Turn the sub-agent's raw stdout into a normalized verdict.
 *
 * @param {string} rawOutput - stdout captured from the sub-agent process.
 * @returns {{ safe: boolean, concerns: Array, summary: string }} normalized verdict;
 *   `concerns` falls back to `[]` and `summary` to `''` when absent or of the wrong type.
 * @throws {Error} when no JSON object is found, when it does not parse, or when
 *   `safe` is not a boolean.
 */
function parseSandboxVerdict(rawOutput) {
  // Claude wraps output in a JSON envelope with a "result" field containing the text.
  // Try to extract inner text from that envelope first, then parse the verdict.
  let text = rawOutput.trim();
  try {
    const envelope = JSON.parse(text);
    if (typeof envelope?.result === 'string') {
      text = envelope.result.trim();
    }
  } catch {
    // Not a JSON envelope — use raw text
  }

  // Extract JSON from the text (may be wrapped in markdown code fences).
  // The match is intentionally greedy (first "{" to last "}"): output containing
  // several separate objects then fails to parse instead of one being picked silently.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error('sandbox: sub-agent returned no JSON verdict');
  }

  let verdict;
  try {
    verdict = JSON.parse(jsonMatch[0]);
  } catch (err) {
    throw new Error('sandbox: sub-agent returned malformed JSON verdict', { cause: err });
  }

  if (typeof verdict.safe !== 'boolean') {
    throw new Error('sandbox: verdict missing "safe" field');
  }

  return {
    safe: verdict.safe,
    concerns: Array.isArray(verdict.concerns) ? verdict.concerns : [],
    summary: typeof verdict.summary === 'string' ? verdict.summary : '',
  };
}

/**
 * Spawn a sub-agent via the command built by `sandboxArgs` and return its safety verdict.
 *
 * @param {string} agentName - agent identifier handed to `sandboxArgs` and used in messages.
 * @param {string} contentText - text to evaluate; passed to `sandboxArgs` as `prompt`.
 * @param {{ json?: boolean, verbose?: boolean }} opts - `verbose` enables progress lines;
 *   with `json` set those lines go to stderr instead of stdout.
 * @returns {Promise<{ safe: boolean, concerns: Array, summary: string }>} normalized verdict.
 * @throws {Error} when the sub-agent times out or fails (original error attached as
 *   `cause`), or when its output contains no valid verdict.
 */
async function runSandboxEval(agentName, contentText, opts) {
  const vlog = opts.json ? (...a) => console.error(...a) : console.log;
  if (opts.verbose) vlog(`[verbose] sandbox: spawning ${agentName} sub-agent`);

  const { cmd, args, env } = sandboxArgs(agentName, {
    prompt: contentText,
    systemPrompt: SANDBOX_SYSTEM_PROMPT,
  });

  let stdout;
  try {
    ({ stdout } = await execFileAsync(cmd, args, {
      env: { ...process.env, ...env },
      timeout: SANDBOX_TIMEOUT_MS,
      maxBuffer: 1024 * 1024,
    }));
  } catch (err) {
    // NOTE(review): `killed` is treated as "the timeout fired", as in the original code —
    // confirm no other path in execFile sets it for the options used here.
    if (err.killed) {
      throw new Error(
        `sandbox: ${agentName} sub-agent timed out after ${SANDBOX_TIMEOUT_MS / 1000}s`,
        { cause: err },
      );
    }
    throw new Error(`sandbox: ${agentName} sub-agent failed: ${err.message}`, { cause: err });
  }

  if (opts.verbose) vlog(`[verbose] sandbox: raw output length ${stdout.length}`);

  return parseSandboxVerdict(stdout);
}

function resolveSandboxAgent(opts) { 99 + if (typeof opts.sandbox === 'string') { 100 + // Explicit agent name — validate it 101 + const valid = new Set(['claude', 'codex', 'gemini']); 102 + if (!valid.has(opts.sandbox)) { 103 + throw new Error(`unknown sandbox agent: '${opts.sandbox}'. must be one of: claude, codex, gemini`); 104 + } 105 + return opts.sandbox; 106 + } 107 + // opts.sandbox === true (flag without value) — auto-detect 108 + const detected = detectCodingAgent(); 109 + if (!detected) { 110 + throw new Error('could not detect agent for sandbox. specify one explicitly: --sandbox claude'); 111 + } 112 + const mapped = toSandboxName(detected.name); 113 + if (!mapped) { 114 + throw new Error(`detected agent '${detected.name}' has no sandbox mapping`); 115 + } 116 + return mapped; 117 + } 17 118 18 119 function ensureGitignore() { 19 120 const gitignorePath = join(vitDir(), '.gitignore'); ··· 36 137 .option('--confirm', 'Confirm dangerous-accept, or bypass agent gate with --trust') 37 138 .option('--json', 'Output as JSON') 38 139 .option('-v, --verbose', 'Show step-by-step details') 140 + .option('--sandbox [agent]', 'Spawn a sandboxed sub-agent to evaluate safety') 39 141 .action(async (ref, opts) => { 40 142 try { 41 143 const { verbose } = opts; ··· 124 226 // --- Agent gate --- 125 227 const agent = detectCodingAgent(); 126 228 if (agent) { 127 - if (opts.trust && opts.confirm) { 229 + if ((opts.trust && opts.confirm) || opts.sandbox) { 128 230 // Sandboxed sub-agent pattern — allow it 129 231 } else { 130 232 if (opts.json) { ··· 144 246 return; 145 247 } 146 248 } 249 + 250 + const sandboxAgent = opts.sandbox ? resolveSandboxAgent(opts) : null; 147 251 148 252 if (opts.json && !(opts.did || loadConfig().did)) { 149 253 jsonError('no DID configured', "run 'vit login <handle>' first"); ··· 208 312 209 313 const record = match.value; 210 314 315 + if (opts.sandbox) { 316 + const contentText = [ 317 + `Type: cap`, 318 + record.title ? 
`Title: ${record.title}` : '', 319 + record.description ? `Description: ${record.description}` : '', 320 + record.text ? `\nContent:\n${record.text}` : '', 321 + ].filter(Boolean).join('\n'); 322 + 323 + const verdict = await runSandboxEval(sandboxAgent, contentText, opts); 324 + 325 + if (opts.trust) { 326 + if (verdict.safe) { 327 + appendLog('trusted.jsonl', { 328 + ref, 329 + uri: match.uri, 330 + trustedAt: new Date().toISOString(), 331 + sandboxVerdict: verdict, 332 + }); 333 + if (opts.json) { 334 + jsonOk({ trusted: true, ref, uri: match.uri, sandbox: verdict }); 335 + return; 336 + } 337 + console.log(`${mark} trusted: ${ref} (sandbox: safe)`); 338 + return; 339 + } else { 340 + // Unsafe — do NOT trust 341 + if (opts.json) { 342 + jsonOk({ trusted: false, ref, uri: match.uri, sandbox: verdict }); 343 + process.exitCode = 1; 344 + return; 345 + } 346 + console.error(`${mark} sandbox verdict: UNSAFE`); 347 + console.error(` summary: ${verdict.summary}`); 348 + for (const c of verdict.concerns) { 349 + console.error(` - ${c}`); 350 + } 351 + console.error(''); 352 + console.error('not trusted due to safety concerns.'); 353 + process.exitCode = 1; 354 + return; 355 + } 356 + } 357 + 358 + // --sandbox without --trust: display verdict 359 + if (opts.json) { 360 + const author = match.uri.split('/')[2]; 361 + jsonOk({ ref, type: 'cap', author, title: record.title || '', description: record.description || '', text: record.text || '', sandbox: verdict, trusted: false }); 362 + return; 363 + } 364 + console.log(`${mark} sandbox verdict: ${verdict.safe ? 
'SAFE' : 'UNSAFE'}`); 365 + console.log(` summary: ${verdict.summary}`); 366 + if (verdict.concerns.length > 0) { 367 + for (const c of verdict.concerns) { 368 + console.log(` - ${c}`); 369 + } 370 + } 371 + return; 372 + } 373 + 211 374 if (opts.trust) { 212 375 appendLog('trusted.jsonl', { 213 376 ref, ··· 291 454 } 292 455 293 456 const record = match.value; 457 + 458 + if (opts.sandbox) { 459 + const parts = [ 460 + `Type: skill`, 461 + `Name: ${record.name}`, 462 + record.description ? `Description: ${record.description}` : '', 463 + record.text ? `\nContent:\n${record.text}` : '', 464 + ]; 465 + if (record.resources && record.resources.length > 0) { 466 + parts.push('\nResources:'); 467 + for (const r of record.resources) { 468 + parts.push(` ${r.path}${r.description ? ' — ' + r.description : ''}`); 469 + } 470 + } 471 + if (record.tags && record.tags.length > 0) { 472 + parts.push(`\nTags: ${record.tags.join(', ')}`); 473 + } 474 + const contentText = parts.filter(Boolean).join('\n'); 475 + 476 + const verdict = await runSandboxEval(sandboxAgent, contentText, opts); 477 + 478 + if (opts.trust) { 479 + if (verdict.safe) { 480 + appendLog('trusted.jsonl', { 481 + ref, 482 + uri: match.uri, 483 + trustedAt: new Date().toISOString(), 484 + sandboxVerdict: verdict, 485 + }); 486 + if (opts.json) { 487 + jsonOk({ trusted: true, ref, uri: match.uri, sandbox: verdict }); 488 + return; 489 + } 490 + console.log(`${mark} trusted: ${ref} (sandbox: safe)`); 491 + return; 492 + } else { 493 + if (opts.json) { 494 + jsonOk({ trusted: false, ref, uri: match.uri, sandbox: verdict }); 495 + process.exitCode = 1; 496 + return; 497 + } 498 + console.error(`${mark} sandbox verdict: UNSAFE`); 499 + console.error(` summary: ${verdict.summary}`); 500 + for (const c of verdict.concerns) { 501 + console.error(` - ${c}`); 502 + } 503 + console.error(''); 504 + console.error('not trusted due to safety concerns.'); 505 + process.exitCode = 1; 506 + return; 507 + } 508 + } 509 + 510 + 
// --sandbox without --trust: display verdict 511 + if (opts.json) { 512 + const author = match.uri.split('/')[2]; 513 + jsonOk({ 514 + ref, type: 'skill', name: record.name, author, 515 + version: record.version || null, license: record.license || null, 516 + description: record.description || null, text: record.text || null, 517 + sandbox: verdict, trusted: false, 518 + }); 519 + return; 520 + } 521 + console.log(`${mark} sandbox verdict: ${verdict.safe ? 'SAFE' : 'UNSAFE'}`); 522 + console.log(` summary: ${verdict.summary}`); 523 + if (verdict.concerns.length > 0) { 524 + for (const c of verdict.concerns) { 525 + console.log(` - ${c}`); 526 + } 527 + } 528 + return; 529 + } 294 530 295 531 if (opts.trust) { 296 532 appendLog('trusted.jsonl', {

+10

src/lib/agent.js

// ··· (earlier part of src/lib/agent.js elided; the preceding function ends with:)
//   if (!agent) return { ok: true };
//   return { ok: false, ...agent };
// }

/**
 * Agent names (as passed to `toSandboxName`) mapped to sandbox agent identifiers.
 * A Map is used instead of a plain object so that lookups only ever see these
 * entries and never inherited members such as "constructor" or "__proto__".
 */
const SANDBOX_NAMES = new Map([
  ['claude code', 'claude'],
  ['gemini cli', 'gemini'],
  ['codex', 'codex'],
]);

/**
 * Translate an agent name into its sandbox agent identifier.
 *
 * @param {string} agentName - agent name to look up (exact match).
 * @returns {string|null} the sandbox identifier, or `null` when there is no mapping.
 */
export function toSandboxName(agentName) {
  return SANDBOX_NAMES.get(agentName) ?? null;
}

+78

test/vet.test.js

// ··· (earlier tests in test/vet.test.js elided; the preceding test and describe close with:)
//       rmSync(configHome, { recursive: true, force: true });
//     });
//   });

  describe('--sandbox', () => {
    /**
     * Create a throwaway config directory under the OS temp dir, hand it to `fn`,
     * and remove it afterwards even when an assertion inside `fn` throws.
     */
    const withConfigHome = (prefix, fn) => {
      const configHome = join(tmpdir(), prefix + Math.random().toString(36).slice(2));
      mkdirSync(configHome, { recursive: true });
      try {
        fn(configHome);
      } finally {
        rmSync(configHome, { recursive: true, force: true });
      }
    };

    test('--sandbox without agent outside agent env: error with hint', () => {
      const result = run('vet fast-cache-invalidation --sandbox', undefined, noAgentEnv);
      expect(result.exitCode).not.toBe(0);
      expect(result.stderr).toContain('could not detect agent');
      expect(result.stderr).toContain('--sandbox claude');
    });

    test('--sandbox bogus: error about unknown agent', () => {
      const result = run('vet fast-cache-invalidation --sandbox bogus', undefined, noAgentEnv);
      expect(result.exitCode).not.toBe(0);
      expect(result.stderr).toContain('unknown sandbox agent');
    });

    test('--sandbox claude accepted (passes validation, fails at DID)', () => {
      withConfigHome('.test-vet-sb-', (configHome) => {
        const result = run('vet fast-cache-invalidation --sandbox claude', undefined, { ...noAgentEnv, XDG_CONFIG_HOME: configHome });
        // Should NOT contain validation errors for sandbox agent
        expect(result.stderr).not.toContain('unknown sandbox agent');
      });
    });

    test('--sandbox codex accepted', () => {
      withConfigHome('.test-vet-sb-codex-', (configHome) => {
        const result = run('vet fast-cache-invalidation --sandbox codex', undefined, { ...noAgentEnv, XDG_CONFIG_HOME: configHome });
        expect(result.stderr).not.toContain('unknown sandbox agent');
      });
    });

    test('--sandbox gemini accepted', () => {
      withConfigHome('.test-vet-sb-gemini-', (configHome) => {
        const result = run('vet fast-cache-invalidation --sandbox gemini', undefined, { ...noAgentEnv, XDG_CONFIG_HOME: configHome });
        expect(result.stderr).not.toContain('unknown sandbox agent');
      });
    });

    test('--sandbox without value in agent env: auto-detects agent', () => {
      withConfigHome('.test-vet-sb-auto-', (configHome) => {
        const result = run('vet fast-cache-invalidation --sandbox', undefined, { ...agentEnv, XDG_CONFIG_HOME: configHome });
        // Should pass sandbox agent resolution (claude code → claude), fail later at DID
        expect(result.stderr).not.toContain('could not detect agent');
        expect(result.stderr).not.toContain('unknown sandbox agent');
      });
    });

    test('--sandbox passes agent gate (agent env + --sandbox flag)', () => {
      withConfigHome('.test-vet-sb-gate-', (configHome) => {
        const result = run('vet fast-cache-invalidation --sandbox claude', undefined, { ...agentEnv, XDG_CONFIG_HOME: configHome });
        // Should NOT contain the agent gate error
        expect(result.stderr).not.toContain('vit vet is for human review');
      });
    });

    test('--sandbox shows in help output', () => {
      const result = run('vet --help');
      expect(result.stdout).toContain('--sandbox');
    });

    test('--sandbox --json without agent outside agent env: JSON error', () => {
      const result = run('vet fast-cache-invalidation --sandbox --json', undefined, noAgentEnv);
      const parsed = JSON.parse(result.stdout);
      expect(parsed.ok).toBe(false);
      expect(parsed.error).toContain('could not detect agent');
    });

    test('--sandbox --json bogus: JSON error', () => {
      const result = run('vet fast-cache-invalidation --sandbox bogus --json', undefined, noAgentEnv);
      const parsed = JSON.parse(result.stdout);
      expect(parsed.ok).toBe(false);
      expect(parsed.error).toContain('unknown sandbox agent');
    });
  });
// }); (closes the enclosing describe, which opens above this hunk)

Configure Feed

Configure Feed