the claude code sourcemaps leaked march 31
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 319 lines 9.3 kB view raw
/**
 * Input Tokenizer - escape sequence boundary detection
 *
 * Cuts terminal input into two kinds of tokens: plain text chunks and raw
 * escape sequences. Unlike the Parser, which gives sequences a semantic
 * meaning, this module only locates where each sequence begins and ends so
 * that keyboard input parsing can operate on whole sequences.
 */

import { C0, ESC_TYPE, isEscFinal } from './ansi.js'
import { isCSIFinal, isCSIIntermediate, isCSIParam } from './csi.js'

/** One unit of tokenizer output: literal text, or one raw escape sequence. */
export type Token =
  | { type: 'text'; value: string }
  | { type: 'sequence'; value: string }

/** Scanner position relative to an escape sequence ('ground' = outside one). */
type State =
  | 'ground'
  | 'escape'
  | 'escapeIntermediate'
  | 'csi'
  | 'ss3'
  | 'osc'
  | 'dcs'
  | 'apc'

export type Tokenizer = {
  /** Feed input and get resulting tokens */
  feed(input: string): Token[]
  /** Flush any buffered incomplete sequences */
  flush(): Token[]
  /** Reset tokenizer state */
  reset(): void
  /** Get any buffered incomplete sequence */
  buffer(): string
}

type TokenizerOptions = {
  /**
   * Treat `CSI M` as an X10 mouse event prefix and consume 3 payload bytes.
   * Only enable for stdin input — `\x1b[M` is also CSI DL (Delete Lines) in
   * output streams, and enabling this there swallows display text. Default false.
   */
  x10Mouse?: boolean
}

/**
 * Build a streaming tokenizer for terminal input.
 *
 * A sequence that is still incomplete when a `feed` call runs out of input is
 * held back and prepended to the input of the next call, so sequences split
 * across chunks come out as a single token.
 *
 * Usage:
 * ```typescript
 * const tokenizer = createTokenizer()
 * const tokens1 = tokenizer.feed('hello\x1b[')
 * const tokens2 = tokenizer.feed('A') // completes the escape sequence
 * const remaining = tokenizer.flush() // force output incomplete sequences
 * ```
 *
 * @param options - Optional behavior switches; see `TokenizerOptions`.
 * @returns A tokenizer holding its own carry-over state between calls.
 */
export function createTokenizer(options?: TokenizerOptions): Tokenizer {
  const x10Mouse = options?.x10Mouse ?? false
  let carriedState: State = 'ground'
  let carriedBuffer = ''

  // Shared by feed() and flush(): run one pass, remember where it stopped.
  const run = (input: string, force: boolean): Token[] => {
    const { tokens, state } = tokenize(
      input,
      carriedState,
      carriedBuffer,
      force,
      x10Mouse,
    )
    carriedState = state.state
    carriedBuffer = state.buffer
    return tokens
  }

  return {
    feed: (input: string): Token[] => run(input, false),

    flush: (): Token[] => run('', true),

    reset(): void {
      carriedState = 'ground'
      carriedBuffer = ''
    },

    buffer: (): string => carriedBuffer,
  }
}

/** Carry-over between tokenize() calls: scanner state plus unconsumed tail. */
type InternalState = {
  state: State
  buffer: string
}

/**
 * Map the byte following ESC to the state that keeps collecting the sequence,
 * or `undefined` when that byte does not open a multi-byte sequence.
 * The tests run in a fixed order: CSI, OSC, DCS, APC, SS3, then intermediates.
 */
function stateAfterEsc(code: number): State | undefined {
  if (code === ESC_TYPE.CSI) return 'csi'
  if (code === ESC_TYPE.OSC) return 'osc'
  if (code === ESC_TYPE.DCS) return 'dcs'
  if (code === ESC_TYPE.APC) return 'apc'
  // 'O' - SS3
  if (code === 0x4f) return 'ss3'
  // Intermediate byte (e.g., ESC ( for charset) - keep buffering
  if (isCSIIntermediate(code)) return 'escapeIntermediate'
  return undefined
}

/**
 * Run one tokenizing pass over `initialBuffer + input`.
 *
 * Scanning always starts at index 0 of the combined string, in
 * `initialState`. A carried-over partial sequence is therefore re-read from
 * its leading ESC rather than resumed mid-way.
 *
 * @param input - Newly received characters.
 * @param initialState - State the previous pass ended in.
 * @param initialBuffer - Incomplete sequence left over from the previous pass.
 * @param flush - When true, an incomplete trailing sequence is emitted as a
 *   'sequence' token instead of being buffered, and the state returns to ground.
 * @param x10Mouse - Enables the X10 mouse special case in the CSI state.
 * @returns The tokens produced plus the state/buffer to carry forward.
 */
function tokenize(
  input: string,
  initialState: State,
  initialBuffer: string,
  flush: boolean,
  x10Mouse: boolean,
): { tokens: Token[]; state: InternalState } {
  const tokens: Token[] = []
  const data = initialBuffer + input
  const end = data.length

  let state: State = initialState
  let pos = 0
  let textFrom = 0
  let seqFrom = 0

  // Emit the pending text run [textFrom, pos) if it is non-empty.
  const flushText = (): void => {
    const text = pos > textFrom ? data.slice(textFrom, pos) : ''
    if (text) {
      tokens.push({ type: 'text', value: text })
    }
    textFrom = pos
  }

  // Emit [seqFrom, pos) as a sequence token and return to ground.
  const finishSequence = (): void => {
    const seq = data.slice(seqFrom, pos)
    if (seq) {
      tokens.push({ type: 'sequence', value: seq })
    }
    state = 'ground'
    textFrom = pos
  }

  // Give up on the current sequence: everything from its ESC becomes text.
  // `pos` is left alone so ground re-examines the offending character.
  const abortToText = (): void => {
    state = 'ground'
    textFrom = seqFrom
  }

  // An X10 payload slot is acceptable when it is not yet available or when
  // it holds a non-control character.
  const payloadSlotOk = (index: number): boolean =>
    index >= end || data.charCodeAt(index) >= 0x20

  while (pos < end) {
    const code = data.charCodeAt(pos)

    switch (state) {
      case 'ground': {
        if (code === C0.ESC) {
          flushText()
          seqFrom = pos
          state = 'escape'
        }
        pos++
        break
      }

      case 'escape': {
        const follow = stateAfterEsc(code)
        if (follow !== undefined) {
          state = follow
          pos++
          break
        }
        if (isEscFinal(code)) {
          // Two-character escape sequence
          pos++
          finishSequence()
          break
        }
        if (code === C0.ESC) {
          // Double escape - emit the first ESC, start a new sequence here
          finishSequence()
          seqFrom = pos
          state = 'escape'
          pos++
          break
        }
        // Invalid - treat ESC as text
        abortToText()
        break
      }

      case 'escapeIntermediate': {
        // After intermediate byte(s), wait for the final byte
        if (isCSIIntermediate(code)) {
          // More intermediate bytes
          pos++
        } else if (isEscFinal(code)) {
          // Final byte - complete the sequence
          pos++
          finishSequence()
        } else {
          // Invalid - treat as text
          abortToText()
        }
        break
      }

      case 'csi': {
        // X10 mouse: CSI M + 3 raw payload bytes (Cb+32, Cx+32, Cy+32).
        // M directly after [ (offset 2) means there were no params — SGR
        // mouse (CSI < … M) carries a `<` param byte first and only reaches
        // M at an offset > 2. Terminals that ignore DECSET 1006 but honor
        // 1000/1002 emit this legacy encoding; without this branch the
        // 3 payload bytes leak through as text (`` `rK `` / `arK` garbage in
        // the prompt).
        //
        // Gated on x10Mouse — `\x1b[M` is also CSI DL (Delete Lines), and
        // blindly consuming 3 chars corrupts output rendering (Parser/Ansi)
        // and fragments bracketed-paste PASTE_END. Only stdin enables this.
        // The ≥0x20 test on every payload slot is belt-and-suspenders: X10
        // guarantees Cb≥32, Cx≥33, Cy≥33, so a control byte (ESC=0x1B) in
        // any slot means this is CSI DL next to another sequence, not a
        // mouse event. Checking all three slots keeps PASTE_END's ESC from
        // being consumed when paste content ends in `\x1b[M`+0-2 chars.
        //
        // Known limitation: this counts JS string chars, but X10 is byte-
        // oriented and stdin uses utf8 encoding (App.tsx). At col 162-191 ×
        // row 96-159 the two coord bytes (0xC2-0xDF, 0x80-0xBF) form a valid
        // UTF-8 2-byte sequence and collapse to one char — the length check
        // fails and the event buffers until the next keypress absorbs it.
        // Fixing this requires latin1 stdin; X10's 223-coord cap is exactly
        // why SGR was invented, and no-SGR terminals at 162+ cols are rare.
        const looksLikeX10Mouse =
          x10Mouse &&
          code === 0x4d /* M */ &&
          pos - seqFrom === 2 &&
          payloadSlotOk(pos + 1) &&
          payloadSlotOk(pos + 2) &&
          payloadSlotOk(pos + 3)

        if (looksLikeX10Mouse) {
          if (pos + 4 <= end) {
            pos += 4
            finishSequence()
          } else {
            // Incomplete — leave the loop; end-of-input buffers from seqFrom.
            // Re-entry re-tokenizes from ground via the invalid-CSI path.
            pos = end
          }
          break
        }

        if (isCSIFinal(code)) {
          pos++
          finishSequence()
        } else if (isCSIParam(code) || isCSIIntermediate(code)) {
          pos++
        } else {
          // Invalid CSI - abort, treat as text
          abortToText()
        }
        break
      }

      case 'ss3': {
        // SS3 sequences: ESC O followed by a single final byte
        const isFinal = code >= 0x40 && code <= 0x7e
        if (isFinal) {
          pos++
          finishSequence()
        } else {
          // Invalid - treat as text
          abortToText()
        }
        break
      }

      case 'osc':
      case 'dcs':
      case 'apc': {
        // String-type sequences end at BEL (1 char) or at ESC followed by
        // ST (2 chars); every other character belongs to the payload.
        let terminatorLength = 0
        if (code === C0.BEL) {
          terminatorLength = 1
        } else if (
          code === C0.ESC &&
          pos + 1 < end &&
          data.charCodeAt(pos + 1) === ESC_TYPE.ST
        ) {
          terminatorLength = 2
        }

        if (terminatorLength > 0) {
          pos += terminatorLength
          finishSequence()
        } else {
          pos++
        }
        break
      }
    }
  }

  // Handle end of input
  let buffer = ''
  if (state === 'ground') {
    flushText()
  } else if (flush) {
    // Force output of the incomplete sequence
    const remaining = data.slice(seqFrom)
    if (remaining) {
      tokens.push({ type: 'sequence', value: remaining })
    }
    state = 'ground'
  } else {
    // Keep the incomplete sequence for the next call
    buffer = data.slice(seqFrom)
  }

  return { tokens, state: { state, buffer } }
}