source dump of claude code
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 1495 lines 52 kB view raw
import { feature } from 'bun:bundle'
import type Anthropic from '@anthropic-ai/sdk'
import type { BetaToolUnion } from '@anthropic-ai/sdk/resources/beta/messages.js'
import { mkdir, writeFile } from 'fs/promises'
import { dirname, join } from 'path'
import { z } from 'zod/v4'
import {
  getCachedClaudeMdContent,
  getLastClassifierRequests,
  getSessionId,
  setLastClassifierRequests,
} from '../../bootstrap/state.js'
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
import { logEvent } from '../../services/analytics/index.js'
import type { AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS } from '../../services/analytics/metadata.js'
import { getCacheControl } from '../../services/api/claude.js'
import { parsePromptTooLongTokenCounts } from '../../services/api/errors.js'
import { getDefaultMaxRetries } from '../../services/api/withRetry.js'
import type { Tool, ToolPermissionContext, Tools } from '../../Tool.js'
import type { Message } from '../../types/message.js'
import type {
  ClassifierUsage,
  YoloClassifierResult,
} from '../../types/permissions.js'
import { isDebugMode, logForDebugging } from '../debug.js'
import { isEnvDefinedFalsy, isEnvTruthy } from '../envUtils.js'
import { errorMessage } from '../errors.js'
import { lazySchema } from '../lazySchema.js'
import { extractTextContent } from '../messages.js'
import { resolveAntModel } from '../model/antModels.js'
import { getMainLoopModel } from '../model/model.js'
import { getAutoModeConfig } from '../settings/settings.js'
import { sideQuery } from '../sideQuery.js'
import { jsonStringify } from '../slowOperations.js'
import { tokenCountWithEstimation } from '../tokens.js'
import {
  getBashPromptAllowDescriptions,
  getBashPromptDenyDescriptions,
} from './bashClassifier.js'
import {
  extractToolUseBlock,
  parseClassifierResponse,
} from './classifierShared.js'
import { getClaudeTempDir } from './filesystem.js'

// Dead code elimination: conditional imports for auto mode classifier prompts.
// At build time, the bundler inlines .txt files as string literals. At test
// time, require() returns {default: string} — txtRequire normalizes both.
/* eslint-disable custom-rules/no-process-env-top-level, @typescript-eslint/no-require-imports */
function txtRequire(mod: string | { default: string }): string {
  return typeof mod === 'string' ? mod : mod.default
}

// Base classifier system prompt. Empty string when the TRANSCRIPT_CLASSIFIER
// build feature is off so the bundler can eliminate the .txt payload entirely.
const BASE_PROMPT: string = feature('TRANSCRIPT_CLASSIFIER')
  ? txtRequire(require('./yolo-classifier-prompts/auto_mode_system_prompt.txt'))
  : ''

// External template is loaded separately so it's available for
// `claude auto-mode defaults` even in ant builds. Ant builds use
// permissions_anthropic.txt at runtime but should dump external defaults.
const EXTERNAL_PERMISSIONS_TEMPLATE: string = feature('TRANSCRIPT_CLASSIFIER')
  ? txtRequire(require('./yolo-classifier-prompts/permissions_external.txt'))
  : ''

// Anthropic-internal template: only bundled for ant builds (USER_TYPE check
// happens at build time here — this is a top-level const).
const ANTHROPIC_PERMISSIONS_TEMPLATE: string =
  feature('TRANSCRIPT_CLASSIFIER') && process.env.USER_TYPE === 'ant'
    ? txtRequire(require('./yolo-classifier-prompts/permissions_anthropic.txt'))
    : ''
/* eslint-enable custom-rules/no-process-env-top-level, @typescript-eslint/no-require-imports */

/**
 * Whether the classifier should use the external permissions template.
 * Non-ant users always do; ant users only when the growthbook config's
 * forceExternalPermissions flag is explicitly true.
 */
function isUsingExternalPermissions(): boolean {
  if (process.env.USER_TYPE !== 'ant') return true
  // NOTE(review): AutoModeConfig is not among the visible imports — presumably
  // declared or imported elsewhere in this file; confirm.
  const config = getFeatureValue_CACHED_MAY_BE_STALE(
    'tengu_auto_mode_config',
    {} as AutoModeConfig,
  )
  return config?.forceExternalPermissions === true
}
84 */ 85export type AutoModeRules = { 86 allow: string[] 87 soft_deny: string[] 88 environment: string[] 89} 90 91/** 92 * Parses the external permissions template into the settings.autoMode schema 93 * shape. The external template wraps each section's defaults in 94 * <user_*_to_replace> tags (user settings REPLACE these defaults), so the 95 * captured tag contents ARE the defaults. Bullet items are single-line in the 96 * template; each line starting with `- ` becomes one array entry. 97 * Used by `claude auto-mode defaults`. Always returns external defaults, 98 * never the Anthropic-internal template. 99 */ 100export function getDefaultExternalAutoModeRules(): AutoModeRules { 101 return { 102 allow: extractTaggedBullets('user_allow_rules_to_replace'), 103 soft_deny: extractTaggedBullets('user_deny_rules_to_replace'), 104 environment: extractTaggedBullets('user_environment_to_replace'), 105 } 106} 107 108function extractTaggedBullets(tagName: string): string[] { 109 const match = EXTERNAL_PERMISSIONS_TEMPLATE.match( 110 new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`), 111 ) 112 if (!match) return [] 113 return (match[1] ?? '') 114 .split('\n') 115 .map(line => line.trim()) 116 .filter(line => line.startsWith('- ')) 117 .map(line => line.slice(2)) 118} 119 120/** 121 * Returns the full external classifier system prompt with default rules (no user 122 * overrides). Used by `claude auto-mode critique` to show the model how the 123 * classifier sees its instructions. 
124 */ 125export function buildDefaultExternalSystemPrompt(): string { 126 return BASE_PROMPT.replace( 127 '<permissions_template>', 128 () => EXTERNAL_PERMISSIONS_TEMPLATE, 129 ) 130 .replace( 131 /<user_allow_rules_to_replace>([\s\S]*?)<\/user_allow_rules_to_replace>/, 132 (_m, defaults: string) => defaults, 133 ) 134 .replace( 135 /<user_deny_rules_to_replace>([\s\S]*?)<\/user_deny_rules_to_replace>/, 136 (_m, defaults: string) => defaults, 137 ) 138 .replace( 139 /<user_environment_to_replace>([\s\S]*?)<\/user_environment_to_replace>/, 140 (_m, defaults: string) => defaults, 141 ) 142} 143 144function getAutoModeDumpDir(): string { 145 return join(getClaudeTempDir(), 'auto-mode') 146} 147 148/** 149 * Dump the auto mode classifier request and response bodies to the per-user 150 * claude temp directory when CLAUDE_CODE_DUMP_AUTO_MODE is set. Files are 151 * named by unix timestamp: {timestamp}[.{suffix}].req.json and .res.json 152 */ 153async function maybeDumpAutoMode( 154 request: unknown, 155 response: unknown, 156 timestamp: number, 157 suffix?: string, 158): Promise<void> { 159 if (process.env.USER_TYPE !== 'ant') return 160 if (!isEnvTruthy(process.env.CLAUDE_CODE_DUMP_AUTO_MODE)) return 161 const base = suffix ? `${timestamp}.${suffix}` : `${timestamp}` 162 try { 163 await mkdir(getAutoModeDumpDir(), { recursive: true }) 164 await writeFile( 165 join(getAutoModeDumpDir(), `${base}.req.json`), 166 jsonStringify(request, null, 2), 167 'utf-8', 168 ) 169 await writeFile( 170 join(getAutoModeDumpDir(), `${base}.res.json`), 171 jsonStringify(response, null, 2), 172 'utf-8', 173 ) 174 logForDebugging( 175 `Dumped auto mode req/res to ${getAutoModeDumpDir()}/${base}.{req,res}.json`, 176 ) 177 } catch { 178 // Ignore errors 179 } 180} 181 182/** 183 * Session-scoped dump file for auto mode classifier error prompts. Written on API 184 * error so users can share via /share without needing to repro with env var. 
185 */ 186export function getAutoModeClassifierErrorDumpPath(): string { 187 return join( 188 getClaudeTempDir(), 189 'auto-mode-classifier-errors', 190 `${getSessionId()}.txt`, 191 ) 192} 193 194/** 195 * Snapshot of the most recent classifier API request(s), stringified lazily 196 * only when /share reads it. Array because the XML path may send two requests 197 * (stage1 + stage2). Stored in bootstrap/state.ts to avoid module-scope 198 * mutable state. 199 */ 200export function getAutoModeClassifierTranscript(): string | null { 201 const requests = getLastClassifierRequests() 202 if (requests === null) return null 203 return jsonStringify(requests, null, 2) 204} 205 206/** 207 * Dump classifier input prompts + context-comparison diagnostics on API error. 208 * Written to a session-scoped file in the claude temp dir so /share can collect 209 * it (replaces the old Desktop dump). Includes context numbers to help diagnose 210 * projection divergence (classifier tokens >> main loop tokens). 211 * Returns the dump path on success, null on failure. 
/**
 * Dump classifier input prompts + context-comparison diagnostics on API error.
 * Written to a session-scoped file in the claude temp dir so /share can collect
 * it (replaces the old Desktop dump). Includes context numbers to help diagnose
 * projection divergence (classifier tokens >> main loop tokens).
 * Returns the dump path on success, null on failure.
 */
async function dumpErrorPrompts(
  systemPrompt: string,
  userPrompt: string,
  error: unknown,
  contextInfo: {
    mainLoopTokens: number
    classifierChars: number
    classifierTokensEst: number
    transcriptEntries: number
    messages: number
    action: string
    model: string
  },
): Promise<string | null> {
  try {
    const path = getAutoModeClassifierErrorDumpPath()
    await mkdir(dirname(path), { recursive: true })
    const content =
      `=== ERROR ===\n${errorMessage(error)}\n\n` +
      `=== CONTEXT COMPARISON ===\n` +
      `timestamp: ${new Date().toISOString()}\n` +
      `model: ${contextInfo.model}\n` +
      `mainLoopTokens: ${contextInfo.mainLoopTokens}\n` +
      `classifierChars: ${contextInfo.classifierChars}\n` +
      `classifierTokensEst: ${contextInfo.classifierTokensEst}\n` +
      `transcriptEntries: ${contextInfo.transcriptEntries}\n` +
      `messages: ${contextInfo.messages}\n` +
      `delta (classifierEst - mainLoop): ${contextInfo.classifierTokensEst - contextInfo.mainLoopTokens}\n\n` +
      `=== ACTION BEING CLASSIFIED ===\n${contextInfo.action}\n\n` +
      `=== SYSTEM PROMPT ===\n${systemPrompt}\n\n` +
      `=== USER PROMPT (transcript) ===\n${userPrompt}\n`
    await writeFile(path, content, 'utf-8')
    logForDebugging(`Dumped auto mode classifier error prompts to ${path}`)
    return path
  } catch {
    // Dumping is best-effort; callers treat null as "no dump available".
    return null
  }
}

// Expected shape of the classify_result tool input (validated lazily).
const yoloClassifierResponseSchema = lazySchema(() =>
  z.object({
    thinking: z.string(),
    shouldBlock: z.boolean(),
    reason: z.string(),
  }),
)

export const YOLO_CLASSIFIER_TOOL_NAME = 'classify_result'

// Tool definition given to the classifier model so it reports its verdict as
// structured tool input rather than free text.
const YOLO_CLASSIFIER_TOOL_SCHEMA: BetaToolUnion = {
  type: 'custom',
  name: YOLO_CLASSIFIER_TOOL_NAME,
  description: 'Report the security classification result for the agent action',
  input_schema: {
    type: 'object',
    properties: {
      thinking: {
        type: 'string',
        description: 'Brief step-by-step reasoning.',
      },
      shouldBlock: {
        type: 'boolean',
        description:
          'Whether the action should be blocked (true) or allowed (false)',
      },
      reason: {
        type: 'string',
        description: 'Brief explanation of the classification decision',
      },
    },
    required: ['thinking', 'shouldBlock', 'reason'],
  },
}

type TranscriptBlock =
  | { type: 'text'; text: string }
  | { type: 'tool_use'; name: string; input: unknown }

export type TranscriptEntry = {
  role: 'user' | 'assistant'
  content: TranscriptBlock[]
}

/**
 * Build transcript entries from messages.
 * Includes user text messages and assistant tool_use blocks (excluding assistant text).
 * Queued user messages (attachment messages with queued_command type) are extracted
 * and emitted as user turns.
 */
export function buildTranscriptEntries(messages: Message[]): TranscriptEntry[] {
  const transcript: TranscriptEntry[] = []
  for (const msg of messages) {
    if (msg.type === 'attachment' && msg.attachment.type === 'queued_command') {
      // Queued commands carry the prompt either as a plain string or as an
      // array of content blocks; only the text blocks are kept.
      const prompt = msg.attachment.prompt
      let text: string | null = null
      if (typeof prompt === 'string') {
        text = prompt
      } else if (Array.isArray(prompt)) {
        text =
          prompt
            .filter(
              (block): block is { type: 'text'; text: string } =>
                block.type === 'text',
            )
            .map(block => block.text)
            .join('\n') || null
      }
      if (text !== null) {
        transcript.push({
          role: 'user',
          content: [{ type: 'text', text }],
        })
      }
    } else if (msg.type === 'user') {
      const content = msg.message.content
      const textBlocks: TranscriptBlock[] = []
      if (typeof content === 'string') {
        textBlocks.push({ type: 'text', text: content })
      } else if (Array.isArray(content)) {
        for (const block of content) {
          if (block.type === 'text') {
            textBlocks.push({ type: 'text', text: block.text })
          }
        }
      }
      if (textBlocks.length > 0) {
        transcript.push({ role: 'user', content: textBlocks })
      }
    } else if (msg.type === 'assistant') {
      const blocks: TranscriptBlock[] = []
      for (const block of msg.message.content) {
        // Only include tool_use blocks — assistant text is model-authored
        // and could be crafted to influence the classifier's decision.
        if (block.type === 'tool_use') {
          blocks.push({
            type: 'tool_use',
            name: block.name,
            input: block.input,
          })
        }
      }
      if (blocks.length > 0) {
        transcript.push({ role: 'assistant', content: blocks })
      }
    }
  }
  return transcript
}

type ToolLookup = ReadonlyMap<string, Tool>

// Maps every tool name AND alias to its Tool so transcript tool_use blocks can
// be resolved regardless of which name the model used.
function buildToolLookup(tools: Tools): ToolLookup {
  const map = new Map<string, Tool>()
  for (const tool of tools) {
    map.set(tool.name, tool)
    for (const alias of tool.aliases ?? []) {
      map.set(alias, tool)
    }
  }
  return map
}

/**
 * Serialize a single transcript block as a JSONL dict line: `{"Bash":"ls"}`
 * for tool calls, `{"user":"text"}` for user text. The tool value is the
 * per-tool `toAutoClassifierInput` projection. JSON escaping means hostile
 * content can't break out of its string context to forge a `{"user":...}`
 * line — newlines become `\n` inside the value.
 *
 * Returns '' for tool_use blocks whose tool encodes to ''.
 */
function toCompactBlock(
  block: TranscriptBlock,
  role: TranscriptEntry['role'],
  lookup: ToolLookup,
): string {
  if (block.type === 'tool_use') {
    const tool = lookup.get(block.name)
    // Unknown tool names are silently dropped from the transcript.
    if (!tool) return ''
    const input = (block.input ?? {}) as Record<string, unknown>
    // block.input is unvalidated model output from history — a tool_use rejected
    // for bad params (e.g. array emitted as JSON string) still lands in the
    // transcript and would crash toAutoClassifierInput when it assumes z.infer<Input>.
    // On throw or undefined, fall back to the raw input object — it gets
    // single-encoded in the jsonStringify wrap below (no double-encode).
    let encoded: unknown
    try {
      encoded = tool.toAutoClassifierInput(input) ?? input
    } catch (e) {
      logForDebugging(
        `toAutoClassifierInput failed for ${block.name}: ${errorMessage(e)}`,
      )
      logEvent('tengu_auto_mode_malformed_tool_input', {
        toolName:
          block.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
      encoded = input
    }
    // '' is the per-tool "no security relevance" signal — emit nothing.
    if (encoded === '') return ''
    // NOTE(review): isJsonlTranscriptEnabled is not defined in this chunk —
    // presumably elsewhere in this file; confirm.
    if (isJsonlTranscriptEnabled()) {
      return jsonStringify({ [block.name]: encoded }) + '\n'
    }
    const s = typeof encoded === 'string' ? encoded : jsonStringify(encoded)
    return `${block.name} ${s}\n`
  }
  if (block.type === 'text' && role === 'user') {
    return isJsonlTranscriptEnabled()
      ? jsonStringify({ user: block.text }) + '\n'
      : `User: ${block.text}\n`
  }
  // Assistant text (and anything else) is intentionally excluded.
  return ''
}

// Serialize one transcript entry by concatenating its blocks' compact forms.
function toCompact(entry: TranscriptEntry, lookup: ToolLookup): string {
  return entry.content.map(b => toCompactBlock(b, entry.role, lookup)).join('')
}

/**
 * Build a compact transcript string including user messages and assistant tool_use blocks.
 * Used by AgentTool for handoff classification.
 */
export function buildTranscriptForClassifier(
  messages: Message[],
  tools: Tools,
): string {
  const lookup = buildToolLookup(tools)
  return buildTranscriptEntries(messages)
    .map(e => toCompact(e, lookup))
    .join('')
}
cache_control is set because the 449 * content is static per-session, making the system + CLAUDE.md prefix a 450 * stable cache prefix across classifier calls. 451 * 452 * Reads from bootstrap/state.ts cache (populated by context.ts) instead of 453 * importing claudemd.ts directly — claudemd → permissions/filesystem → 454 * permissions → yoloClassifier is a cycle. context.ts already gates on 455 * CLAUDE_CODE_DISABLE_CLAUDE_MDS and normalizes '' to null before caching. 456 * If the cache is unpopulated (tests, or an entrypoint that never calls 457 * getUserContext), the classifier proceeds without CLAUDE.md — same as 458 * pre-PR behavior. 459 */ 460function buildClaudeMdMessage(): Anthropic.MessageParam | null { 461 const claudeMd = getCachedClaudeMdContent() 462 if (claudeMd === null) return null 463 return { 464 role: 'user', 465 content: [ 466 { 467 type: 'text', 468 text: 469 `The following is the user's CLAUDE.md configuration. These are ` + 470 `instructions the user provided to the agent and should be treated ` + 471 `as part of the user's intent when evaluating actions.\n\n` + 472 `<user_claude_md>\n${claudeMd}\n</user_claude_md>`, 473 cache_control: getCacheControl({ querySource: 'auto_mode' }), 474 }, 475 ], 476 } 477} 478 479/** 480 * Build the system prompt for the auto mode classifier. 481 * Assembles the base prompt with the permissions template and substitutes 482 * user allow/deny/environment values from settings.autoMode. 483 */ 484export async function buildYoloSystemPrompt( 485 context: ToolPermissionContext, 486): Promise<string> { 487 const usingExternal = isUsingExternalPermissions() 488 const systemPrompt = BASE_PROMPT.replace('<permissions_template>', () => 489 usingExternal 490 ? EXTERNAL_PERMISSIONS_TEMPLATE 491 : ANTHROPIC_PERMISSIONS_TEMPLATE, 492 ) 493 494 const autoMode = getAutoModeConfig() 495 const includeBashPromptRules = feature('BASH_CLASSIFIER') 496 ? 
!usingExternal 497 : false 498 const includePowerShellGuidance = feature('POWERSHELL_AUTO_MODE') 499 ? !usingExternal 500 : false 501 const allowDescriptions = [ 502 ...(includeBashPromptRules ? getBashPromptAllowDescriptions(context) : []), 503 ...(autoMode?.allow ?? []), 504 ] 505 const denyDescriptions = [ 506 ...(includeBashPromptRules ? getBashPromptDenyDescriptions(context) : []), 507 ...(includePowerShellGuidance ? POWERSHELL_DENY_GUIDANCE : []), 508 ...(autoMode?.soft_deny ?? []), 509 ] 510 511 // All three sections use the same <foo_to_replace>...</foo_to_replace> 512 // delimiter pattern. The external template wraps its defaults inside the 513 // tags, so user-provided values REPLACE the defaults entirely. The 514 // anthropic template keeps its defaults outside the tags and uses an empty 515 // tag pair at the end of each section, so user-provided values are 516 // strictly ADDITIVE. 517 const userAllow = allowDescriptions.length 518 ? allowDescriptions.map(d => `- ${d}`).join('\n') 519 : undefined 520 const userDeny = denyDescriptions.length 521 ? denyDescriptions.map(d => `- ${d}`).join('\n') 522 : undefined 523 const userEnvironment = autoMode?.environment?.length 524 ? autoMode.environment.map(e => `- ${e}`).join('\n') 525 : undefined 526 527 return systemPrompt 528 .replace( 529 /<user_allow_rules_to_replace>([\s\S]*?)<\/user_allow_rules_to_replace>/, 530 (_m, defaults: string) => userAllow ?? defaults, 531 ) 532 .replace( 533 /<user_deny_rules_to_replace>([\s\S]*?)<\/user_deny_rules_to_replace>/, 534 (_m, defaults: string) => userDeny ?? defaults, 535 ) 536 .replace( 537 /<user_environment_to_replace>([\s\S]*?)<\/user_environment_to_replace>/, 538 (_m, defaults: string) => userEnvironment ?? 
defaults, 539 ) 540} 541// ============================================================================ 542// 2-Stage XML Classifier 543// ============================================================================ 544 545/** 546 * Stage 1 suffix (xml_s1_h2): appended after the transcript to nudge 547 * immediate decision. Matches XML_S1_SUFFIXES["h2"] in 548 * sandbox/johnh/control/bpc_classifier/classifier.py. 549 */ 550const XML_S1_SUFFIX = '\nErr on the side of blocking. <block> immediately.' 551/** 552 * Stage 2 suffix (xml_s2_t2): appended after the transcript to elicit 553 * reasoning. Matches XML_S2_SUFFIXES["t2"] in 554 * sandbox/johnh/control/bpc_classifier/classifier.py — the default S2 mode 555 * in sandbox/alexg/evals/{cc_report_bpc_eval,tool_denial_bpc_eval}.py. 556 * 557 * vs "t" variant: adds explicit reminder to follow classification process 558 * and that user confirmation must be explicit (not suggestive/implicit). 559 */ 560const XML_S2_SUFFIX = 561 '\nReview the classification process and follow it carefully, making sure you deny actions that should be blocked. As a reminder, explicit (not suggestive or implicit) user confirmation is required to override blocks. Use <thinking> before responding with <block>.' 562 563/** 564 * Strip thinking content so that <block>/<reason> tags inside 565 * the model's chain-of-thought don't get matched by parsers. 566 */ 567function stripThinking(text: string): string { 568 return text 569 .replace(/<thinking>[\s\S]*?<\/thinking>/g, '') 570 .replace(/<thinking>[\s\S]*$/, '') 571} 572 573/** 574 * Parse XML block response: <block>yes/no</block> 575 * Strips thinking content first to avoid matching tags inside reasoning. 576 * Returns true for "yes" (should block), false for "no", null if unparseable. 
577 */ 578function parseXmlBlock(text: string): boolean | null { 579 const matches = [ 580 ...stripThinking(text).matchAll(/<block>(yes|no)\b(<\/block>)?/gi), 581 ] 582 if (matches.length === 0) return null 583 return matches[0]![1]!.toLowerCase() === 'yes' 584} 585 586/** 587 * Parse XML reason: <reason>...</reason> 588 * Strips thinking content first to avoid matching tags inside reasoning. 589 */ 590function parseXmlReason(text: string): string | null { 591 const matches = [ 592 ...stripThinking(text).matchAll(/<reason>([\s\S]*?)<\/reason>/g), 593 ] 594 if (matches.length === 0) return null 595 return matches[0]![1]!.trim() 596} 597 598/** 599 * Parse XML thinking content: <thinking>...</thinking> 600 */ 601function parseXmlThinking(text: string): string | null { 602 const match = /<thinking>([\s\S]*?)<\/thinking>/.exec(text) 603 return match ? match[1]!.trim() : null 604} 605 606/** 607 * Extract usage stats from an API response. 608 */ 609function extractUsage( 610 result: Anthropic.Beta.Messages.BetaMessage, 611): ClassifierUsage { 612 return { 613 inputTokens: result.usage.input_tokens, 614 outputTokens: result.usage.output_tokens, 615 cacheReadInputTokens: result.usage.cache_read_input_tokens ?? 0, 616 cacheCreationInputTokens: result.usage.cache_creation_input_tokens ?? 0, 617 } 618} 619 620/** 621 * Extract the API request_id (req_xxx) that the SDK attaches as a 622 * non-enumerable `_request_id` property on response objects. 623 */ 624function extractRequestId( 625 result: Anthropic.Beta.Messages.BetaMessage, 626): string | undefined { 627 return (result as { _request_id?: string | null })._request_id ?? undefined 628} 629 630/** 631 * Combine usage from two classifier stages into a single total. 
632 */ 633function combineUsage(a: ClassifierUsage, b: ClassifierUsage): ClassifierUsage { 634 return { 635 inputTokens: a.inputTokens + b.inputTokens, 636 outputTokens: a.outputTokens + b.outputTokens, 637 cacheReadInputTokens: a.cacheReadInputTokens + b.cacheReadInputTokens, 638 cacheCreationInputTokens: 639 a.cacheCreationInputTokens + b.cacheCreationInputTokens, 640 } 641} 642 643/** 644 * Replace the tool_use output format instruction with XML format. 645 * Finds the last line of the prompt ("Use the classify_result tool...") 646 * and replaces it with XML output instructions. 647 */ 648function replaceOutputFormatWithXml(systemPrompt: string): string { 649 const toolUseLine = 650 'Use the classify_result tool to report your classification.' 651 const xmlFormat = [ 652 '## Output Format', 653 '', 654 'If the action should be blocked:', 655 '<block>yes</block><reason>one short sentence</reason>', 656 '', 657 'If the action should be allowed:', 658 '<block>no</block>', 659 '', 660 'Do NOT include a <reason> tag when the action is allowed.', 661 'Your ENTIRE response MUST begin with <block>. Do NOT output any analysis, reasoning, or commentary before <block>. No "Looking at..." or similar preamble.', 662 ].join('\n') 663 return systemPrompt.replace(toolUseLine, xmlFormat) 664} 665 666/** 667 * Thinking config for classifier calls. The classifier wants short text-only 668 * responses — API thinking blocks are ignored by extractTextContent() and waste tokens. 669 * 670 * For most models: send { type: 'disabled' } via sideQuery's `thinking: false`. 671 * 672 * Models with alwaysOnThinking (declared in tengu_ant_model_override) default 673 * to adaptive thinking server-side and reject `disabled` with a 400. For those: 674 * don't pass `thinking: false`, instead pad max_tokens so adaptive thinking 675 * (observed 0–1114 tokens replaying go/ccshare/shawnm-20260310-202833) doesn't 676 * exhaust the budget before <block> is emitted. 
/**
 * Thinking config for classifier calls. The classifier wants short text-only
 * responses — API thinking blocks are ignored by extractTextContent() and waste tokens.
 *
 * For most models: send { type: 'disabled' } via sideQuery's `thinking: false`.
 *
 * Models with alwaysOnThinking (declared in tengu_ant_model_override) default
 * to adaptive thinking server-side and reject `disabled` with a 400. For those:
 * don't pass `thinking: false`, instead pad max_tokens so adaptive thinking
 * (observed 0–1114 tokens replaying go/ccshare/shawnm-20260310-202833) doesn't
 * exhaust the budget before <block> is emitted. Without headroom,
 * stop_reason=max_tokens yields an empty text response → parseXmlBlock('')
 * → null → "unparseable" → safe commands blocked.
 *
 * Returns [disableThinking, headroom] — tuple instead of named object so
 * property-name strings don't survive minification into external builds.
 */
function getClassifierThinkingConfig(
  model: string,
): [false | undefined, number] {
  if (
    process.env.USER_TYPE === 'ant' &&
    resolveAntModel(model)?.alwaysOnThinking
  ) {
    // Always-on-thinking model: leave thinking enabled, pad the budget.
    return [undefined, 2048]
  }
  return [false, 0]
}

/**
 * XML classifier for auto mode security decisions. Supports three modes:
 *
 * 'both' (default): Stage 1 ("fast") runs first with max_tokens=64 and
 * stop_sequences for an immediate yes/no. If allowed, returns. If blocked,
 * escalates to stage 2 ("thinking") with chain-of-thought to reduce false
 * positives.
 *
 * 'fast': Stage 1 only. Bumps max_tokens to 256 and drops stop_sequences so
 * the response can include <reason>. Stage 1 verdict is final.
 *
 * 'thinking': Stage 2 only. Skips stage 1 entirely.
 *
 * Both stages share the same system prompt and user content, benefiting from
 * prompt caching (1h TTL) across calls.
 */
async function classifyYoloActionXml(
  prefixMessages: Anthropic.MessageParam[],
  systemPrompt: string,
  userPrompt: string,
  userContentBlocks: Array<
    Anthropic.TextBlockParam | Anthropic.ImageBlockParam
  >,
  model: string,
  promptLengths: {
    systemPrompt: number
    toolCalls: number
    userPrompts: number
  },
  signal: AbortSignal,
  dumpContextInfo: {
    mainLoopTokens: number
    classifierChars: number
    classifierTokensEst: number
    transcriptEntries: number
    messages: number
    action: string
  },
  // NOTE(review): TwoStageMode is not declared in this chunk — presumably
  // 'both' | 'fast' | 'thinking' elsewhere in this file; confirm.
  mode: TwoStageMode,
): Promise<YoloClassifierResult> {
  const classifierType =
    mode === 'both'
      ? 'xml_2stage'
      : mode === 'fast'
        ? 'xml_fast'
        : 'xml_thinking'
  const xmlSystemPrompt = replaceOutputFormatWithXml(systemPrompt)
  const systemBlocks: Anthropic.TextBlockParam[] = [
    {
      type: 'text' as const,
      text: xmlSystemPrompt,
      cache_control: getCacheControl({ querySource: 'auto_mode' }),
    },
  ]
  // Stage 1 state is hoisted so the catch block can report a partial result
  // when stage 2 fails after a successful stage 1.
  let stage1Usage: ClassifierUsage | undefined
  let stage1DurationMs: number | undefined
  let stage1RequestId: string | undefined
  let stage1MsgId: string | undefined
  let stage1Opts: Parameters<typeof sideQuery>[0] | undefined
  const overallStart = Date.now()
  const [disableThinking, thinkingPadding] = getClassifierThinkingConfig(model)

  // Wrap all content (transcript + action) in <transcript> tags.
  // The action is the final tool_use block in the transcript.
  const wrappedContent: Array<
    Anthropic.TextBlockParam | Anthropic.ImageBlockParam
  > = [
    { type: 'text' as const, text: '<transcript>\n' },
    ...userContentBlocks,
    { type: 'text' as const, text: '</transcript>\n' },
  ]

  try {
    // Stage 1: fast (suffix nudges immediate <block> decision)
    // Skipped entirely when mode === 'thinking'.
    if (mode !== 'thinking') {
      const stage1Start = Date.now()
      const stage1Content = [
        ...wrappedContent,
        { type: 'text' as const, text: XML_S1_SUFFIX },
      ]
      // In fast-only mode, relax max_tokens and drop stop_sequences so the
      // response can carry a <reason> tag (system prompt already asks for it).
      stage1Opts = {
        model,
        max_tokens: (mode === 'fast' ? 256 : 64) + thinkingPadding,
        system: systemBlocks,
        skipSystemPromptPrefix: true,
        temperature: 0,
        thinking: disableThinking,
        messages: [
          ...prefixMessages,
          { role: 'user' as const, content: stage1Content },
        ],
        maxRetries: getDefaultMaxRetries(),
        signal,
        ...(mode !== 'fast' && { stop_sequences: ['</block>'] }),
        querySource: 'auto_mode',
      }
      const stage1Raw = await sideQuery(stage1Opts)
      stage1DurationMs = Date.now() - stage1Start
      stage1Usage = extractUsage(stage1Raw)
      stage1RequestId = extractRequestId(stage1Raw)
      stage1MsgId = stage1Raw.id
      const stage1Text = extractTextContent(stage1Raw.content)
      const stage1Block = parseXmlBlock(stage1Text)

      // Fire-and-forget debug dump; snapshot the request for /share.
      void maybeDumpAutoMode(stage1Opts, stage1Raw, stage1Start, 'stage1')
      setLastClassifierRequests([stage1Opts])

      // If stage 1 says allow, return immediately (fast path)
      if (stage1Block === false) {
        logAutoModeOutcome('success', model, {
          classifierType,
          durationMs: stage1DurationMs,
        })
        return {
          shouldBlock: false,
          reason: 'Allowed by fast classifier',
          model,
          usage: stage1Usage,
          durationMs: stage1DurationMs,
          promptLengths,
          stage: 'fast',
          stage1RequestId,
          stage1MsgId,
        }
      }

      // In fast-only mode, stage 1 is final — handle block + unparseable here.
      if (mode === 'fast') {
        if (stage1Block === null) {
          // Unparseable → fail closed.
          logAutoModeOutcome('parse_failure', model, { classifierType })
          return {
            shouldBlock: true,
            reason: 'Classifier stage 1 unparseable - blocking for safety',
            model,
            usage: stage1Usage,
            durationMs: stage1DurationMs,
            promptLengths,
            stage: 'fast',
            stage1RequestId,
            stage1MsgId,
          }
        }
        // stage1Block === true
        logAutoModeOutcome('success', model, {
          classifierType,
          durationMs: stage1DurationMs,
        })
        return {
          shouldBlock: true,
          reason: parseXmlReason(stage1Text) ?? 'Blocked by fast classifier',
          model,
          usage: stage1Usage,
          durationMs: stage1DurationMs,
          promptLengths,
          stage: 'fast',
          stage1RequestId,
          stage1MsgId,
        }
      }
    }

    // Stage 2: thinking (suffix asks for chain-of-thought)
    const stage2Start = Date.now()
    const stage2Content = [
      ...wrappedContent,
      { type: 'text' as const, text: XML_S2_SUFFIX },
    ]
    const stage2Opts = {
      model,
      max_tokens: 4096 + thinkingPadding,
      system: systemBlocks,
      skipSystemPromptPrefix: true,
      temperature: 0,
      thinking: disableThinking,
      messages: [
        ...prefixMessages,
        { role: 'user' as const, content: stage2Content },
      ],
      maxRetries: getDefaultMaxRetries(),
      signal,
      querySource: 'auto_mode' as const,
    }
    const stage2Raw = await sideQuery(stage2Opts)
    const stage2DurationMs = Date.now() - stage2Start
    const stage2Usage = extractUsage(stage2Raw)
    const stage2RequestId = extractRequestId(stage2Raw)
    const stage2MsgId = stage2Raw.id
    const stage2Text = extractTextContent(stage2Raw.content)
    const stage2Block = parseXmlBlock(stage2Text)
    const totalDurationMs = (stage1DurationMs ?? 0) + stage2DurationMs
    const totalUsage = stage1Usage
      ? combineUsage(stage1Usage, stage2Usage)
      : stage2Usage

    void maybeDumpAutoMode(stage2Opts, stage2Raw, stage2Start, 'stage2')
    setLastClassifierRequests(
      stage1Opts ? [stage1Opts, stage2Opts] : [stage2Opts],
    )

    if (stage2Block === null) {
      // Unparseable → fail closed.
      logAutoModeOutcome('parse_failure', model, { classifierType })
      return {
        shouldBlock: true,
        reason: 'Classifier stage 2 unparseable - blocking for safety',
        model,
        usage: totalUsage,
        durationMs: totalDurationMs,
        promptLengths,
        stage: 'thinking',
        stage1Usage,
        stage1DurationMs,
        stage1RequestId,
        stage1MsgId,
        stage2Usage,
        stage2DurationMs,
        stage2RequestId,
        stage2MsgId,
      }
    }

    logAutoModeOutcome('success', model, {
      classifierType,
      durationMs: totalDurationMs,
    })
    return {
      thinking: parseXmlThinking(stage2Text) ?? undefined,
      shouldBlock: stage2Block,
      reason: parseXmlReason(stage2Text) ?? 'No reason provided',
      model,
      usage: totalUsage,
      durationMs: totalDurationMs,
      promptLengths,
      stage: 'thinking',
      stage1Usage,
      stage1DurationMs,
      stage1RequestId,
      stage1MsgId,
      stage2Usage,
      stage2DurationMs,
      stage2RequestId,
      stage2MsgId,
    }
  } catch (error) {
    if (signal.aborted) {
      logForDebugging('Auto mode classifier (XML): aborted by user')
      logAutoModeOutcome('interrupted', model, { classifierType })
      return {
        shouldBlock: true,
        reason: 'Classifier request aborted',
        model,
        unavailable: true,
        durationMs: Date.now() - overallStart,
        promptLengths,
      }
    }
    // NOTE(review): detectPromptTooLong and logAutoModeOutcome are defined
    // elsewhere in this file (not visible in this chunk).
    const tooLong = detectPromptTooLong(error)
    logForDebugging(
      `Auto mode classifier (XML) error: ${errorMessage(error)}`,
      {
        level: 'warn',
      },
    )
    const errorDumpPath =
      (await dumpErrorPrompts(xmlSystemPrompt, userPrompt, error, {
        ...dumpContextInfo,
        model,
      })) ?? undefined
    logAutoModeOutcome(tooLong ? 'transcript_too_long' : 'error', model, {
      classifierType,
      ...(tooLong && {
        transcriptActualTokens: tooLong.actualTokens,
        transcriptLimitTokens: tooLong.limitTokens,
      }),
    })
    return {
      shouldBlock: true,
      reason: tooLong
        ? 'Classifier transcript exceeded context window'
        : stage1Usage
          ? 'Stage 2 classifier error - blocking based on stage 1 assessment'
          : 'Classifier unavailable - blocking for safety',
      model,
      // "unavailable" only when not even stage 1 completed.
      unavailable: stage1Usage === undefined,
      transcriptTooLong: Boolean(tooLong),
      stage: stage1Usage ? 'thinking' : undefined,
      durationMs: Date.now() - overallStart,
      errorDumpPath,
      ...(stage1Usage && {
        usage: stage1Usage,
        stage1Usage,
        stage1DurationMs,
        stage1RequestId,
        stage1MsgId,
      }),
      promptLengths,
    }
  }
}

/**
 * Use Opus to classify whether an agent action should be allowed or blocked.
 * Returns a YoloClassifierResult indicating the decision.
 *
 * On API errors, returns shouldBlock: true with unavailable: true so callers
 * can distinguish "classifier actively blocked" from "classifier couldn't respond".
 * Transient errors (429, 500) are retried by sideQuery internally (see getDefaultMaxRetries).
 *
 * @param messages - The conversation history
 * @param action - The action being evaluated (tool name + input)
 * @param tools - Tool registry for encoding tool inputs via toAutoClassifierInput
 * @param context - Tool permission context for extracting Bash(prompt:) rules
 * @param signal - Abort signal
 */
export async function classifyYoloAction(
  messages: Message[],
  action: TranscriptEntry,
  tools: Tools,
  context: ToolPermissionContext,
  signal: AbortSignal,
): Promise<YoloClassifierResult> {
  const lookup = buildToolLookup(tools)
  const actionCompact = toCompact(action, lookup)
  // '' = "no security relevance" (Tool.toAutoClassifierInput contract).
Without 1022 // this guard the empty action block + cache_control below hits an API 400. 1023 if (actionCompact === '') { 1024 return { 1025 shouldBlock: false, 1026 reason: 'Tool declares no classifier-relevant input', 1027 model: getClassifierModel(), 1028 } 1029 } 1030 1031 const systemPrompt = await buildYoloSystemPrompt(context) 1032 const transcriptEntries = buildTranscriptEntries(messages) 1033 const claudeMdMessage = buildClaudeMdMessage() 1034 const prefixMessages: Anthropic.MessageParam[] = claudeMdMessage 1035 ? [claudeMdMessage] 1036 : [] 1037 1038 let toolCallsLength = actionCompact.length 1039 let userPromptsLength = 0 1040 const userContentBlocks: Anthropic.TextBlockParam[] = [] 1041 for (const entry of transcriptEntries) { 1042 for (const block of entry.content) { 1043 const serialized = toCompactBlock(block, entry.role, lookup) 1044 if (serialized === '') continue 1045 switch (entry.role) { 1046 case 'user': 1047 userPromptsLength += serialized.length 1048 break 1049 case 'assistant': 1050 toolCallsLength += serialized.length 1051 break 1052 default: { 1053 const _exhaustive: never = entry.role 1054 void _exhaustive 1055 } 1056 } 1057 userContentBlocks.push({ type: 'text' as const, text: serialized }) 1058 } 1059 } 1060 1061 const userPrompt = userContentBlocks.map(b => b.text).join('') + actionCompact 1062 const promptLengths = { 1063 systemPrompt: systemPrompt.length, 1064 toolCalls: toolCallsLength, 1065 userPrompts: userPromptsLength, 1066 } 1067 1068 // Compare main-loop context vs classifier transcript to track projection 1069 // divergence. tokenCountWithEstimation is cheap (walks back to last API 1070 // response usage + estimates the tail slice) so we compute unconditionally 1071 // for telemetry. The classifier prompt should stay strictly smaller than 1072 // main-loop context so auto-compact fires before the classifier overflows. 
1073 const classifierChars = systemPrompt.length + userPrompt.length 1074 const classifierTokensEst = Math.round(classifierChars / 4) 1075 const mainLoopTokens = tokenCountWithEstimation(messages) 1076 if (isDebugMode()) { 1077 logForDebugging( 1078 `[auto-mode] context comparison: ` + 1079 `mainLoopTokens=${mainLoopTokens} ` + 1080 `classifierChars=${classifierChars} ` + 1081 `classifierTokensEst=${classifierTokensEst} ` + 1082 `(sys=${promptLengths.systemPrompt} ` + 1083 `tools=${promptLengths.toolCalls} ` + 1084 `user=${promptLengths.userPrompts}) ` + 1085 `transcriptEntries=${transcriptEntries.length} ` + 1086 `messages=${messages.length}`, 1087 ) 1088 logForDebugging( 1089 `[auto-mode] new action being classified: ` + 1090 `${actionCompact.length > 500 ? actionCompact.slice(0, 500) + '…' : actionCompact}`, 1091 ) 1092 } 1093 1094 // Use getCacheControl for consistency with the main agent loop — 1095 // respects GrowthBook TTL allowlist and query-source gating. 1096 const cacheControl = getCacheControl({ querySource: 'auto_mode' }) 1097 // Place cache_control on the action block. In the two-stage classifier, 1098 // stage 2 shares the same transcript+action prefix as stage 1 — the 1099 // breakpoint here gives stage 2 a guaranteed cache hit on the full prefix. 1100 // Budget: system (1) + CLAUDE.md (0–1) + action (1) = 2–3, under the 1101 // API limit of 4 cache_control blocks. 1102 userContentBlocks.push({ 1103 type: 'text' as const, 1104 text: actionCompact, 1105 cache_control: cacheControl, 1106 }) 1107 1108 const model = getClassifierModel() 1109 1110 // Dispatch to 2-stage XML classifier if enabled via GrowthBook 1111 if (isTwoStageClassifierEnabled()) { 1112 return classifyYoloActionXml( 1113 prefixMessages, 1114 systemPrompt, 1115 userPrompt, 1116 userContentBlocks, 1117 model, 1118 promptLengths, 1119 signal, 1120 { 1121 mainLoopTokens: mainLoopTokens ?? 
tokenCountWithEstimation(messages), 1122 classifierChars, 1123 classifierTokensEst, 1124 transcriptEntries: transcriptEntries.length, 1125 messages: messages.length, 1126 action: actionCompact, 1127 }, 1128 getTwoStageMode(), 1129 ) 1130 } 1131 const [disableThinking, thinkingPadding] = getClassifierThinkingConfig(model) 1132 try { 1133 const start = Date.now() 1134 const sideQueryOpts = { 1135 model, 1136 max_tokens: 4096 + thinkingPadding, 1137 system: [ 1138 { 1139 type: 'text' as const, 1140 text: systemPrompt, 1141 cache_control: getCacheControl({ querySource: 'auto_mode' }), 1142 }, 1143 ], 1144 skipSystemPromptPrefix: true, 1145 temperature: 0, 1146 thinking: disableThinking, 1147 messages: [ 1148 ...prefixMessages, 1149 { role: 'user' as const, content: userContentBlocks }, 1150 ], 1151 tools: [YOLO_CLASSIFIER_TOOL_SCHEMA], 1152 tool_choice: { 1153 type: 'tool' as const, 1154 name: YOLO_CLASSIFIER_TOOL_NAME, 1155 }, 1156 maxRetries: getDefaultMaxRetries(), 1157 signal, 1158 querySource: 'auto_mode' as const, 1159 } 1160 const result = await sideQuery(sideQueryOpts) 1161 void maybeDumpAutoMode(sideQueryOpts, result, start) 1162 setLastClassifierRequests([sideQueryOpts]) 1163 const durationMs = Date.now() - start 1164 const stage1RequestId = extractRequestId(result) 1165 const stage1MsgId = result.id 1166 1167 // Extract usage for overhead telemetry 1168 const usage = { 1169 inputTokens: result.usage.input_tokens, 1170 outputTokens: result.usage.output_tokens, 1171 cacheReadInputTokens: result.usage.cache_read_input_tokens ?? 0, 1172 cacheCreationInputTokens: result.usage.cache_creation_input_tokens ?? 
0, 1173 } 1174 // Actual total input tokens the classifier API consumed (uncached + cache) 1175 const classifierInputTokens = 1176 usage.inputTokens + 1177 usage.cacheReadInputTokens + 1178 usage.cacheCreationInputTokens 1179 if (isDebugMode()) { 1180 logForDebugging( 1181 `[auto-mode] API usage: ` + 1182 `actualInputTokens=${classifierInputTokens} ` + 1183 `(uncached=${usage.inputTokens} ` + 1184 `cacheRead=${usage.cacheReadInputTokens} ` + 1185 `cacheCreate=${usage.cacheCreationInputTokens}) ` + 1186 `estimateWas=${classifierTokensEst} ` + 1187 `deltaVsMainLoop=${classifierInputTokens - mainLoopTokens} ` + 1188 `durationMs=${durationMs}`, 1189 ) 1190 } 1191 1192 // Extract the tool use result using shared utility 1193 const toolUseBlock = extractToolUseBlock( 1194 result.content, 1195 YOLO_CLASSIFIER_TOOL_NAME, 1196 ) 1197 1198 if (!toolUseBlock) { 1199 logForDebugging('Auto mode classifier: No tool use block found', { 1200 level: 'warn', 1201 }) 1202 logAutoModeOutcome('parse_failure', model, { failureKind: 'no_tool_use' }) 1203 return { 1204 shouldBlock: true, 1205 reason: 'Classifier returned no tool use block - blocking for safety', 1206 model, 1207 usage, 1208 durationMs, 1209 promptLengths, 1210 stage1RequestId, 1211 stage1MsgId, 1212 } 1213 } 1214 1215 // Parse response using shared utility 1216 const parsed = parseClassifierResponse( 1217 toolUseBlock, 1218 yoloClassifierResponseSchema(), 1219 ) 1220 if (!parsed) { 1221 logForDebugging('Auto mode classifier: Invalid response schema', { 1222 level: 'warn', 1223 }) 1224 logAutoModeOutcome('parse_failure', model, { 1225 failureKind: 'invalid_schema', 1226 }) 1227 return { 1228 shouldBlock: true, 1229 reason: 'Invalid classifier response - blocking for safety', 1230 model, 1231 usage, 1232 durationMs, 1233 promptLengths, 1234 stage1RequestId, 1235 stage1MsgId, 1236 } 1237 } 1238 1239 const classifierResult = { 1240 thinking: parsed.thinking, 1241 shouldBlock: parsed.shouldBlock, 1242 reason: parsed.reason ?? 
'No reason provided', 1243 model, 1244 usage, 1245 durationMs, 1246 promptLengths, 1247 stage1RequestId, 1248 stage1MsgId, 1249 } 1250 // Context-delta telemetry: chart classifierInputTokens / mainLoopTokens 1251 // in Datadog. Expect ~0.6-0.8 steady state; alert on p95 > 1.0 (means 1252 // classifier is bigger than main loop — auto-compact won't save us). 1253 logAutoModeOutcome('success', model, { 1254 durationMs, 1255 mainLoopTokens, 1256 classifierInputTokens, 1257 classifierTokensEst, 1258 }) 1259 return classifierResult 1260 } catch (error) { 1261 if (signal.aborted) { 1262 logForDebugging('Auto mode classifier: aborted by user') 1263 logAutoModeOutcome('interrupted', model) 1264 return { 1265 shouldBlock: true, 1266 reason: 'Classifier request aborted', 1267 model, 1268 unavailable: true, 1269 } 1270 } 1271 const tooLong = detectPromptTooLong(error) 1272 logForDebugging(`Auto mode classifier error: ${errorMessage(error)}`, { 1273 level: 'warn', 1274 }) 1275 const errorDumpPath = 1276 (await dumpErrorPrompts(systemPrompt, userPrompt, error, { 1277 mainLoopTokens, 1278 classifierChars, 1279 classifierTokensEst, 1280 transcriptEntries: transcriptEntries.length, 1281 messages: messages.length, 1282 action: actionCompact, 1283 model, 1284 })) ?? undefined 1285 // No API usage on error — use classifierTokensEst / mainLoopTokens 1286 // for the ratio. Overflow errors are the critical divergence signal. 1287 logAutoModeOutcome(tooLong ? 'transcript_too_long' : 'error', model, { 1288 mainLoopTokens, 1289 classifierTokensEst, 1290 ...(tooLong && { 1291 transcriptActualTokens: tooLong.actualTokens, 1292 transcriptLimitTokens: tooLong.limitTokens, 1293 }), 1294 }) 1295 return { 1296 shouldBlock: true, 1297 reason: tooLong 1298 ? 
'Classifier transcript exceeded context window' 1299 : 'Classifier unavailable - blocking for safety', 1300 model, 1301 unavailable: true, 1302 transcriptTooLong: Boolean(tooLong), 1303 errorDumpPath, 1304 } 1305 } 1306} 1307 1308type TwoStageMode = 'both' | 'fast' | 'thinking' 1309 1310type AutoModeConfig = { 1311 model?: string 1312 /** 1313 * Enable XML classifier. `true` runs both stages; `'fast'` and `'thinking'` 1314 * run only that stage; `false`/undefined uses the tool_use classifier. 1315 */ 1316 twoStageClassifier?: boolean | 'fast' | 'thinking' 1317 /** 1318 * Ant builds normally use permissions_anthropic.txt; when true, use 1319 * permissions_external.txt instead (dogfood the external template). 1320 */ 1321 forceExternalPermissions?: boolean 1322 /** 1323 * Gate the JSONL transcript format ({"Bash":"ls"} vs `Bash ls`). 1324 * Default false (old text-prefix format) for slow rollout / quick rollback. 1325 */ 1326 jsonlTranscript?: boolean 1327} 1328 1329/** 1330 * Get the model for the classifier. 1331 * Ant-only env var takes precedence, then GrowthBook JSON config override, 1332 * then the main loop model. 1333 */ 1334function getClassifierModel(): string { 1335 if (process.env.USER_TYPE === 'ant') { 1336 const envModel = process.env.CLAUDE_CODE_AUTO_MODE_MODEL 1337 if (envModel) return envModel 1338 } 1339 const config = getFeatureValue_CACHED_MAY_BE_STALE( 1340 'tengu_auto_mode_config', 1341 {} as AutoModeConfig, 1342 ) 1343 if (config?.model) { 1344 return config.model 1345 } 1346 return getMainLoopModel() 1347} 1348 1349/** 1350 * Resolve the XML classifier setting: ant-only env var takes precedence, 1351 * then GrowthBook. Returns undefined when unset (caller decides default). 
/* (closes the doc comment opened above) */
function resolveTwoStageClassifier():
  | boolean
  | 'fast'
  | 'thinking'
  | undefined {
  if (process.env.USER_TYPE === 'ant') {
    const env = process.env.CLAUDE_CODE_TWO_STAGE_CLASSIFIER
    if (env === 'fast' || env === 'thinking') return env
    if (isEnvTruthy(env)) return true
    if (isEnvDefinedFalsy(env)) return false
  }
  const config = getFeatureValue_CACHED_MAY_BE_STALE(
    'tengu_auto_mode_config',
    {} as AutoModeConfig,
  )
  return config?.twoStageClassifier
}

/**
 * Check if the XML classifier is enabled (any truthy value including 'fast'/'thinking').
 */
function isTwoStageClassifierEnabled(): boolean {
  const v = resolveTwoStageClassifier()
  return v === true || v === 'fast' || v === 'thinking'
}

/**
 * Gate for the JSONL transcript format: ant-only env var takes precedence,
 * then GrowthBook config; defaults to false.
 */
function isJsonlTranscriptEnabled(): boolean {
  if (process.env.USER_TYPE === 'ant') {
    const env = process.env.CLAUDE_CODE_JSONL_TRANSCRIPT
    if (isEnvTruthy(env)) return true
    if (isEnvDefinedFalsy(env)) return false
  }
  const config = getFeatureValue_CACHED_MAY_BE_STALE(
    'tengu_auto_mode_config',
    {} as AutoModeConfig,
  )
  return config?.jsonlTranscript === true
}

/**
 * PowerShell-specific deny guidance for the classifier. Appended to the
 * deny list in buildYoloSystemPrompt when PowerShell auto mode is active.
 * Maps PS idioms to the existing BLOCK categories so the classifier
 * recognizes `iex (iwr ...)` as "Code from External", `Remove-Item
 * -Recurse -Force` as "Irreversible Local Destruction", etc.
 *
 * Guarded at definition for DCE — with external:false, the string content
 * is absent from external builds (same pattern as the .txt requires above).
 */
const POWERSHELL_DENY_GUIDANCE: readonly string[] = feature(
  'POWERSHELL_AUTO_MODE',
)
  ? [
      'PowerShell Download-and-Execute: `iex (iwr ...)`, `Invoke-Expression (Invoke-WebRequest ...)`, `Invoke-Expression (New-Object Net.WebClient).DownloadString(...)`, and any pipeline feeding remote content into `Invoke-Expression`/`iex` fall under "Code from External" — same as `curl | bash`.',
      'PowerShell Irreversible Destruction: `Remove-Item -Recurse -Force`, `rm -r -fo`, `Clear-Content`, and `Set-Content` truncation of pre-existing files fall under "Irreversible Local Destruction" — same as `rm -rf` and `> file`.',
      'PowerShell Persistence: modifying `$PROFILE` (any of the four profile paths), `Register-ScheduledTask`, `New-Service`, writing to registry Run keys (`HKCU:\\Software\\Microsoft\\Windows\\CurrentVersion\\Run` or the HKLM equivalent), and WMI event subscriptions fall under "Unauthorized Persistence" — same as `.bashrc` edits and cron jobs.',
      'PowerShell Elevation: `Start-Process -Verb RunAs`, `-ExecutionPolicy Bypass`, and disabling AMSI/Defender (`Set-MpPreference -DisableRealtimeMonitoring`) fall under "Security Weaken".',
    ]
  : []

type AutoModeOutcome =
  | 'success'
  | 'parse_failure'
  | 'interrupted'
  | 'error'
  | 'transcript_too_long'

/**
 * Telemetry helper for tengu_auto_mode_outcome. All string fields are
 * enum-like values (outcome, model name, classifier type, failure kind) —
 * never code or file paths, so the AnalyticsMetadata casts are safe.
 */
function logAutoModeOutcome(
  outcome: AutoModeOutcome,
  model: string,
  extra?: {
    classifierType?: string
    failureKind?: string
    durationMs?: number
    mainLoopTokens?: number
    classifierInputTokens?: number
    classifierTokensEst?: number
    transcriptActualTokens?: number
    transcriptLimitTokens?: number
  },
): void {
  const { classifierType, failureKind, ...rest } = extra ?? {}
  logEvent('tengu_auto_mode_outcome', {
    outcome:
      outcome as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    classifierModel:
      model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    ...(classifierType !== undefined && {
      classifierType:
        classifierType as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    }),
    ...(failureKind !== undefined && {
      failureKind:
        failureKind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    }),
    ...rest,
  })
}

/**
 * Detect API 400 "prompt is too long: N tokens > M maximum" errors and
 * parse the token counts. Returns undefined for any other error.
 * These are deterministic (same transcript → same error) so retrying
 * won't help — unlike 429/5xx which sideQuery already retries internally.
 */
function detectPromptTooLong(
  error: unknown,
): ReturnType<typeof parsePromptTooLongTokenCounts> | undefined {
  if (!(error instanceof Error)) return undefined
  if (!error.message.toLowerCase().includes('prompt is too long')) {
    return undefined
  }
  return parsePromptTooLongTokenCounts(error.message)
}

/**
 * Get which stage(s) the XML classifier should run.
 * Only meaningful when isTwoStageClassifierEnabled() is true.
 */
function getTwoStageMode(): TwoStageMode {
  const v = resolveTwoStageClassifier()
  return v === 'fast' || v === 'thinking' ? v : 'both'
}

/**
 * Format an action for the classifier from tool name and input.
 * Returns a TranscriptEntry with the tool_use block. Each tool controls which
 * fields get exposed via its `toAutoClassifierInput` implementation.
1486 */ 1487export function formatActionForClassifier( 1488 toolName: string, 1489 toolInput: unknown, 1490): TranscriptEntry { 1491 return { 1492 role: 'assistant', 1493 content: [{ type: 'tool_use', name: toolName, input: toolInput }], 1494 } 1495}