source dump of claude code
23
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 530 lines 17 kB view raw
import axios, { type AxiosResponse } from 'axios'
import { LRUCache } from 'lru-cache'
import {
  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  logEvent,
} from '../../services/analytics/index.js'
import { queryHaiku } from '../../services/api/claude.js'
import { AbortError } from '../../utils/errors.js'
import { getWebFetchUserAgent } from '../../utils/http.js'
import { logError } from '../../utils/log.js'
import {
  isBinaryContentType,
  persistBinaryContent,
} from '../../utils/mcpOutputStorage.js'
import { getSettings_DEPRECATED } from '../../utils/settings/settings.js'
import { asSystemPrompt } from '../../utils/systemPromptType.js'
import { isPreapprovedHost } from './preapproved.js'
import { makeSecondaryModelPrompt } from './prompt.js'

// Custom error classes for domain blocking

// Thrown when the api.anthropic.com domain_info preflight answers
// can_fetch=false for the target hostname.
class DomainBlockedError extends Error {
  constructor(domain: string) {
    super(`Claude Code is unable to fetch from ${domain}`)
    this.name = 'DomainBlockedError'
  }
}

// Thrown when the preflight itself could not complete (network error or a
// non-200 response), so the domain's safety could not be determined at all.
class DomainCheckFailedError extends Error {
  constructor(domain: string) {
    super(
      `Unable to verify if domain ${domain} is safe to fetch. 
This may be due to network restrictions or enterprise security policies blocking claude.ai.`,
    )
    this.name = 'DomainCheckFailedError'
  }
}

// Thrown when a network egress proxy (detected via its 403 +
// "X-Proxy-Error: blocked-by-allowlist" response) refused the request.
// The message is JSON so downstream consumers can parse error_type/domain.
class EgressBlockedError extends Error {
  constructor(public readonly domain: string) {
    super(
      JSON.stringify({
        error_type: 'EGRESS_BLOCKED',
        domain,
        message: `Access to ${domain} is blocked by the network egress proxy.`,
      }),
    )
    this.name = 'EgressBlockedError'
  }
}

// Cache for storing fetched URL content
type CacheEntry = {
  bytes: number // raw response size in bytes (pre-markdown-conversion)
  code: number // HTTP status code
  codeText: string // HTTP status text
  content: string // markdown (for HTML) or raw decoded text
  contentType: string // Content-Type response header ('' if absent)
  persistedPath?: string // on-disk path for binary payloads, if saved
  persistedSize?: number // size of the persisted binary file
}

// Cache with 15-minute TTL and 50MB size limit
// LRUCache handles automatic expiration and eviction
const CACHE_TTL_MS = 15 * 60 * 1000 // 15 minutes
const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 // 50MB

const URL_CACHE = new LRUCache<string, CacheEntry>({
  maxSize: MAX_CACHE_SIZE_BYTES,
  ttl: CACHE_TTL_MS,
})

// Separate cache for preflight domain checks. URL_CACHE is URL-keyed, so
// fetching two paths on the same domain triggers two identical preflight
// HTTP round-trips to api.anthropic.com. This hostname-keyed cache avoids
// that. Only 'allowed' is cached — blocked/failed re-check on next attempt.
const DOMAIN_CHECK_CACHE = new LRUCache<string, true>({
  max: 128,
  ttl: 5 * 60 * 1000, // 5 minutes — shorter than URL_CACHE TTL
})

/** Drops both the per-URL content cache and the per-hostname preflight cache. */
export function clearWebFetchCache(): void {
  URL_CACHE.clear()
  DOMAIN_CHECK_CACHE.clear()
}

// Lazy singleton — defers the turndown → @mixmark-io/domino import (~1.4MB
// retained heap) until the first HTML fetch, and reuses one instance across
// calls (construction builds 15 rule objects; .turndown() is stateless).
// @types/turndown ships only `export =` (no .d.mts), so TS types the import
// as the class itself while Bun wraps CJS in { default } — hence the cast.
type TurndownCtor = typeof import('turndown')
let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined
function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
  // ??= makes this idempotent: concurrent first callers share one promise.
  return (turndownServicePromise ??= import('turndown').then(m => {
    const Turndown = (m as unknown as { default: TurndownCtor }).default
    return new Turndown()
  }))
}

// PSR requested limiting the length of URLs to 250 to lower the potential
// for a data exfiltration. However, this is too restrictive for some customers'
// legitimate use cases, such as JWT-signed URLs (e.g., cloud service signed URLs)
// that can be much longer. We already require user approval for each domain,
// which provides a primary security boundary. In addition, Claude Code has
// other data exfil channels, and this one does not seem relatively high risk,
// so I'm removing that length restriction. -ab
const MAX_URL_LENGTH = 2000

// Per PSR:
// "Implement resource consumption controls because setting limits on CPU,
// memory, and network usage for the Web Fetch tool can prevent a single
// request or user from overwhelming the system."
const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 * 1024

// Timeout for the main HTTP fetch request (60 seconds).
// Prevents hanging indefinitely on slow/unresponsive servers.
const FETCH_TIMEOUT_MS = 60_000

// Timeout for the domain blocklist preflight check (10 seconds).
const DOMAIN_CHECK_TIMEOUT_MS = 10_000

// Cap same-host redirect hops. Without this a malicious server can return
// a redirect loop (/a → /b → /a …) and the per-request FETCH_TIMEOUT_MS
// resets on every hop, hanging the tool until user interrupt. 10 matches
// common client defaults (axios=5, follow-redirects=21, Chrome=20).
const MAX_REDIRECTS = 10

// Truncate to not spend too many tokens
export const MAX_MARKDOWN_LENGTH = 100_000

/**
 * True when the URL's hostname/path are on the built-in preapproved list
 * (see ./preapproved.js). Unparseable URLs are never preapproved.
 */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const parsedUrl = new URL(url)
    return isPreapprovedHost(parsedUrl.hostname, parsedUrl.pathname)
  } catch {
    return false
  }
}

/**
 * Cheap syntactic screen run before any network activity. Rejects over-long
 * URLs, URLs that fail to parse, URLs embedding credentials, and dot-less
 * hostnames (e.g. `localhost`, bare intranet names). Not a full security
 * boundary on its own — the domain preflight and user approval are the
 * primary controls.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) {
    return false
  }

  let parsed
  try {
    parsed = new URL(url)
  } catch {
    return false
  }

  // We don't need to check protocol here, as we'll upgrade http to https when making the request

  // As long as we aren't supporting aiming to cookies or internal domains,
  // we should block URLs with usernames/passwords too, even though these
  // seem exceedingly unlikely.
  if (parsed.username || parsed.password) {
    return false
  }

  // Initial filter that this isn't a privileged, company-internal URL
  // by checking that the hostname is publicly resolvable
  const hostname = parsed.hostname
  const parts = hostname.split('.')
  if (parts.length < 2) {
    return false
  }

  return true
}

type DomainCheckResult =
  | { status: 'allowed' }
  | { status: 'blocked' }
  | { status: 'check_failed'; error: Error }

/**
 * Preflight against Anthropic's domain_info endpoint. Only the 'allowed'
 * verdict is cached (in DOMAIN_CHECK_CACHE); 'blocked' and 'check_failed'
 * are re-checked on the next attempt.
 */
export async function checkDomainBlocklist(
  domain: string,
): Promise<DomainCheckResult> {
  if (DOMAIN_CHECK_CACHE.has(domain)) {
    return { status: 'allowed' }
  }
  try {
    const response = await axios.get(
      `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`,
      { timeout: DOMAIN_CHECK_TIMEOUT_MS },
    )
    if (response.status === 200) {
      // Strict === true: any other can_fetch value is treated as blocked.
      if (response.data.can_fetch === true) {
        DOMAIN_CHECK_CACHE.set(domain, true)
        return { status: 'allowed' }
      }
      return { status: 'blocked' }
    }
    // Non-200 status but didn't throw
    return {
      status: 'check_failed',
      error: new Error(`Domain check returned status ${response.status}`),
    }
  } catch (e) {
    logError(e)
    return { status: 'check_failed', error: e as Error }
  }
}

/**
 * Check if a redirect is safe to follow
 * Allows redirects that:
 * - Add or remove "www." in the hostname
 * - Keep the origin the same but change path/query params
 * - Or both of the above
 */
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  try {
    const parsedOriginal = new URL(originalUrl)
    const parsedRedirect = new URL(redirectUrl)

    // Protocol must match exactly (an https -> http downgrade is rejected).
    if (parsedRedirect.protocol !== parsedOriginal.protocol) {
      return false
    }

    if (parsedRedirect.port !== parsedOriginal.port) {
      return false
    }

    // Never follow a redirect that introduces embedded credentials.
    if (parsedRedirect.username || parsedRedirect.password) {
      return false
    }

    // Now check hostname conditions
    // 1. Adding www. is allowed: example.com -> www.example.com
    // 2. Removing www. is allowed: www.example.com -> example.com
    // 3. Same host (with or without www.) is allowed: paths can change
    const stripWww = (hostname: string) => hostname.replace(/^www\./, '')
    const originalHostWithoutWww = stripWww(parsedOriginal.hostname)
    const redirectHostWithoutWww = stripWww(parsedRedirect.hostname)
    return originalHostWithoutWww === redirectHostWithoutWww
  } catch (_error) {
    return false
  }
}

/**
 * Helper function to handle fetching URLs with custom redirect handling
 * Recursively follows redirects if they pass the redirectChecker function
 *
 * Per PSR:
 * "Do not automatically follow redirects because following redirects could
 * allow for an attacker to exploit an open redirect vulnerability in a
 * trusted domain to force a user to make a request to a malicious domain
 * unknowingly"
 */
type RedirectInfo = {
  type: 'redirect'
  originalUrl: string
  redirectUrl: string
  statusCode: number
}

export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0,
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  if (depth > MAX_REDIRECTS) {
    throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
  }
  try {
    // maxRedirects: 0 makes axios surface each redirect as an error so we
    // can vet it ourselves below instead of following automatically.
    return await axios.get(url, {
      signal,
      timeout: FETCH_TIMEOUT_MS,
      maxRedirects: 0,
      responseType: 'arraybuffer',
      maxContentLength: MAX_HTTP_CONTENT_LENGTH,
      headers: {
        Accept: 'text/markdown, text/html, */*',
        'User-Agent': getWebFetchUserAgent(),
      },
    })
  } catch (error) {
    // NOTE(review): 303 (See Other) is not in this list, so a 303 response
    // propagates as a plain error rather than a RedirectInfo — confirm
    // whether that omission is intentional.
    if (
      axios.isAxiosError(error) &&
      error.response &&
      [301, 302, 307, 308].includes(error.response.status)
    ) {
      const redirectLocation = error.response.headers.location
      if (!redirectLocation) {
        throw new Error('Redirect missing Location header')
      }

      // Resolve relative URLs against the original URL
      const redirectUrl = new URL(redirectLocation, url).toString()

      if (redirectChecker(url, redirectUrl)) {
        // Recursively follow the permitted redirect
        return getWithPermittedRedirects(
          redirectUrl,
          signal,
          redirectChecker,
          depth + 1,
        )
      } else {
        // Return redirect information to the caller
        return {
          type: 'redirect',
          originalUrl: url,
          redirectUrl,
          statusCode: error.response.status,
        }
      }
    }

    // Detect egress proxy blocks: the proxy returns 403 with
    // X-Proxy-Error: blocked-by-allowlist when egress is restricted
    if (
      axios.isAxiosError(error) &&
      error.response?.status === 403 &&
      error.response.headers['x-proxy-error'] === 'blocked-by-allowlist'
    ) {
      const hostname = new URL(url).hostname
      throw new EgressBlockedError(hostname)
    }

    throw error
  }
}

// Type guard distinguishing the RedirectInfo arm of the union returned by
// getWithPermittedRedirects from a real AxiosResponse.
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  return 'type' in response && response.type === 'redirect'
}

export type FetchedContent = {
  content: string
  bytes: number
  code: number
  codeText: string
  contentType: string
  persistedPath?: string
  persistedSize?: number
}

/**
 * Fetches a URL (upgrading http to https), runs the domain-blocklist
 * preflight, follows only same-host/www redirects, converts HTML to
 * markdown, persists binary payloads to disk, and caches the result
 * under the ORIGINAL url for CACHE_TTL_MS.
 *
 * Returns a RedirectInfo instead of content when the server redirected
 * somewhere the redirect policy does not permit following.
 *
 * @throws DomainBlockedError when the preflight says the domain is blocked
 * @throws DomainCheckFailedError when the preflight could not complete
 * @throws EgressBlockedError when a network egress proxy refused the request
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Check cache (LRUCache handles TTL automatically)
  const cachedEntry = URL_CACHE.get(url)
  if (cachedEntry) {
    return {
      bytes: cachedEntry.bytes,
      code: cachedEntry.code,
      codeText: cachedEntry.codeText,
      content: cachedEntry.content,
      contentType: cachedEntry.contentType,
      persistedPath: cachedEntry.persistedPath,
      persistedSize: cachedEntry.persistedSize,
    }
  }

  let parsedUrl: URL
  let upgradedUrl = url

  try {
    parsedUrl = new URL(url)

    // Upgrade http to https if needed
    if (parsedUrl.protocol === 'http:') {
      parsedUrl.protocol = 'https:'
      upgradedUrl = parsedUrl.toString()
    }

    const hostname = parsedUrl.hostname

    // Check if the user has opted to skip the blocklist check
    // This is for enterprise customers with restrictive security policies
    // that prevent outbound connections to claude.ai
    const settings = getSettings_DEPRECATED()
    if (!settings.skipWebFetchPreflight) {
      const checkResult = await checkDomainBlocklist(hostname)
      switch (checkResult.status) {
        case 'allowed':
          // Continue with the fetch
          break
        case 'blocked':
          throw new DomainBlockedError(hostname)
        case 'check_failed':
          throw new DomainCheckFailedError(hostname)
      }
    }

    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    if (
      e instanceof DomainBlockedError ||
      e instanceof DomainCheckFailedError
    ) {
      // Expected user-facing failures - re-throw without logging as internal error
      throw e
    }
    // Any other preflight failure (e.g. analytics) is logged and swallowed:
    // the fetch below proceeds with upgradedUrl.
    logError(e)
  }

  const response = await getWithPermittedRedirects(
    upgradedUrl,
    abortController.signal,
    isPermittedRedirect,
  )

  // Check if we got a redirect response
  if (isRedirectInfo(response)) {
    return response
  }

  const rawBuffer = Buffer.from(response.data)
  // Release the axios-held ArrayBuffer copy; rawBuffer owns the bytes now.
  // This lets GC reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown
  // builds its DOM tree (which can be 3-5x the HTML size).
  ;(response as { data: unknown }).data = null
  const contentType = response.headers['content-type'] ?? ''

  // Binary content: save raw bytes to disk with a proper extension so Claude
  // can inspect the file later. We still fall through to the utf-8 decode +
  // Haiku path below — for PDFs in particular the decoded string has enough
  // ASCII structure (/Title, text streams) that Haiku can summarize it, and
  // the saved file is a supplement rather than a replacement.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const result = await persistBinaryContent(rawBuffer, contentType, persistId)
    // Persistence failure is non-fatal: we just omit the path/size fields.
    if (!('error' in result)) {
      persistedPath = result.filepath
      persistedSize = result.size
    }
  }

  const bytes = rawBuffer.length
  const htmlContent = rawBuffer.toString('utf-8')

  let markdownContent: string
  let contentBytes: number
  if (contentType.includes('text/html')) {
    markdownContent = (await getTurndownService()).turndown(htmlContent)
    contentBytes = Buffer.byteLength(markdownContent)
  } else {
    // It's not HTML - just use it raw. The decoded string's UTF-8 byte
    // length equals rawBuffer.length (modulo U+FFFD replacement on invalid
    // bytes — negligible for cache eviction accounting), so skip the O(n)
    // Buffer.byteLength scan.
    markdownContent = htmlContent
    contentBytes = bytes
  }

  // Store the fetched content in cache. Note that it's stored under
  // the original URL, not the upgraded or redirected URL.
  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // lru-cache requires positive integers; clamp to 1 for empty responses.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })
  return entry
}

/**
 * Runs the user's prompt over fetched markdown via the secondary (Haiku)
 * model, truncating the content to MAX_MARKDOWN_LENGTH first.
 *
 * @throws AbortError if the signal fired before/while the model responded,
 *   so the tool call surfaces as is_error upstream.
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  // Truncate content to avoid "Prompt is too long" errors from the secondary model
  const truncatedContent =
    markdownContent.length > MAX_MARKDOWN_LENGTH
      ? markdownContent.slice(0, MAX_MARKDOWN_LENGTH) +
        '\n\n[Content truncated due to length...]'
      : markdownContent

  const modelPrompt = makeSecondaryModelPrompt(
    truncatedContent,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt: modelPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })

  // We need to bubble this up, so that the tool call throws, causing us to return
  // an is_error tool_use block to the server, and render a red dot in the UI.
  if (signal.aborted) {
    throw new AbortError()
  }

  // Only the first content block is consulted; non-text first blocks fall
  // through to the fallback string.
  const { content } = assistantMessage.message
  if (content.length > 0) {
    const contentBlock = content[0]
    if ('text' in contentBlock!) {
      return contentBlock.text
    }
  }
  return 'No response from model'
}