fix(security)(CVG-57): bound PDF/DOCX parser execution time via withTimeout

+7 -1

packages/file-upload/src/extractors/docx.extractor.ts

··· 1 1 import * as mammoth from 'mammoth'; 2 2 import type { TextExtractor, TextExtractionResult } from '../types'; 3 + import { withTimeout } from './with-timeout'; 3 4 4 5 /** 5 - * Text extractor for DOCX files 6 + * Text extractor for DOCX files. Wrapped in withTimeout against malformed 7 + * docx hangs (CVG-57; see CVG-60 for decompressed-size / zip-bomb cap). 6 8 */ 7 9 export class DOCXExtractor implements TextExtractor { 8 10 readonly name = 'docx'; ··· 15 17 } 16 18 17 19 async extract(buffer: Buffer): Promise<TextExtractionResult> { 20 + return withTimeout(this.run(buffer), 'docx'); 21 + } 22 + 23 + private async run(buffer: Buffer): Promise<TextExtractionResult> { 18 24 try { 19 25 const result = await mammoth.extractRawText({ buffer }); 20 26 return { success: true, text: result.value };

+8

packages/file-upload/src/extractors/pdf/strategies/pdf-parse.strategy.ts

··· 1 1 import type { TextExtractionResult } from '../../../types'; 2 2 import type { PdfExtractionStrategy } from '../types'; 3 + import { withTimeout } from '../../with-timeout'; 3 4 4 5 // eslint-disable-next-line @typescript-eslint/no-require-imports 5 6 const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>; ··· 7 8 /** 8 9 * Fast text extraction using pdf-parse. 9 10 * Works well for text-based PDFs but fails on design-heavy / vector-path PDFs. 11 + * 12 + * Wrapped in withTimeout: a malformed PDF can hang pdf-parse indefinitely, 13 + * which would cascade into request timeouts and worker starvation. CVG-57. 10 14 */ 11 15 export class PdfParseStrategy implements PdfExtractionStrategy { 12 16 readonly name = 'pdf-parse'; 13 17 readonly priority = 0; 14 18 15 19 async extract(buffer: Buffer): Promise<TextExtractionResult> { 20 + return withTimeout(this.run(buffer), 'pdf-parse'); 21 + } 22 + 23 + private async run(buffer: Buffer): Promise<TextExtractionResult> { 16 24 try { 17 25 const data = await pdfParse(buffer); 18 26 return { success: true, text: data.text };

+7

packages/file-upload/src/extractors/pdf/strategies/pdf2json.strategy.ts

··· 1 1 import type { TextExtractionResult } from '../../../types'; 2 2 import type { PdfExtractionStrategy } from '../types'; 3 + import { withTimeout } from '../../with-timeout'; 3 4 4 5 /** 5 6 * Text extraction using pdf2json. 6 7 * Better at parsing layout-rich PDFs and some vector-rendered text. 8 + * 9 + * Wrapped in withTimeout against malformed-PDF hangs (CVG-57). 7 10 */ 8 11 export class Pdf2JsonStrategy implements PdfExtractionStrategy { 9 12 readonly name = 'pdf2json'; 10 13 readonly priority = 1; 11 14 12 15 async extract(buffer: Buffer): Promise<TextExtractionResult> { 16 + return withTimeout(this.run(buffer), 'pdf2json'); 17 + } 18 + 19 + private async run(buffer: Buffer): Promise<TextExtractionResult> { 13 20 try { 14 21 const { default: PDFParser } = await import('pdf2json'); 15 22 const text = await this.parseBuffer(PDFParser, buffer);

+10

packages/file-upload/src/extractors/pdf/strategies/tesseract-ocr.strategy.ts

··· 1 1 import type { TextExtractionResult } from '../../../types'; 2 2 import type { PdfExtractionStrategy } from '../types'; 3 + import { withTimeout } from '../../with-timeout'; 4 + 5 + const OCR_TIMEOUT_MS = 60_000; 3 6 4 7 /** 5 8 * OCR fallback for image-based or vector-path PDFs. 6 9 * Renders each page to a PNG via pdfjs-dist (legacy/Node build) + canvas, 7 10 * then OCRs with tesseract.js. Heavy deps are lazy-loaded on first use. 11 + * 12 + * Longer timeout than text extraction (CVG-57): tesseract is genuinely slow 13 + * on multi-page scans. Still bounded to avoid worker starvation. 8 14 */ 9 15 export class TesseractOcrStrategy implements PdfExtractionStrategy { 10 16 readonly name = 'tesseract-ocr'; 11 17 readonly priority = 2; 12 18 13 19 async extract(buffer: Buffer): Promise<TextExtractionResult> { 20 + return withTimeout(this.run(buffer), 'tesseract-ocr', OCR_TIMEOUT_MS); 21 + } 22 + 23 + private async run(buffer: Buffer): Promise<TextExtractionResult> { 14 24 try { 15 25 const pageImages = await this.renderPagesToImages(buffer); 16 26

+42

packages/file-upload/src/extractors/with-timeout.ts

/**
 * Race a parsing promise against a hard timeout. Returns a failure result
 * when the timeout fires.
 *
 * Caveats:
 * - The underlying parse is NOT cancelled. It keeps running in the background
 *   and keeps holding its memory until it settles on its own; only the
 *   request path stops waiting on it.
 * - The timer runs on the same event loop as the parser. A parser that spins
 *   in a synchronous, CPU-bound loop blocks the loop, so the timer cannot
 *   fire until that loop yields. This helper only bounds hangs that await
 *   asynchronous work. NOTE(review): confirm which failure mode the wrapped
 *   parsers actually exhibit.
 *
 * For true OOM / CPU protection a `node:worker_threads` worker with
 * `resourceLimits.maxOldGenerationSizeMb` is needed. Per the original note,
 * input is capped at 10MB by validators upstream (not visible from this
 * file).
 *
 * Used by extractors that parse user-supplied buffers.
 */
import type { TextExtractionResult } from "../types";

/** Default upper bound, in milliseconds, for a single text-extraction parse. */
export const DEFAULT_PARSE_TIMEOUT_MS = 15_000;

/**
 * Largest delay Node's timers honour (2^31 - 1 ms). Anything larger, as well
 * as NaN, is silently coerced to 1ms by `setTimeout`, which would turn a
 * "very long" timeout into an immediate one.
 */
const MAX_TIMER_DELAY_MS = 2_147_483_647;

/**
 * Map a caller-supplied timeout onto a delay `setTimeout` will respect:
 * NaN falls back to the default, negatives become 0, oversized or infinite
 * values are capped at the timer maximum.
 */
const normalizeTimeout = (ms: number): number => {
  if (Number.isNaN(ms)) {
    return DEFAULT_PARSE_TIMEOUT_MS;
  }
  return Math.min(Math.max(ms, 0), MAX_TIMER_DELAY_MS);
};

/** Build the failure result reported when `taskName` runs past `ms`. */
const timeoutResult = (
  taskName: string,
  ms: number,
): TextExtractionResult => ({
  success: false,
  error: `${taskName} exceeded ${ms}ms timeout`,
});

/**
 * Wait for `task`, but give up after `timeoutMs` milliseconds.
 *
 * @param task - Extraction promise that has already been started.
 * @param taskName - Label used in the timeout error message.
 * @param timeoutMs - Maximum time to wait; defaults to
 *   {@link DEFAULT_PARSE_TIMEOUT_MS}. Normalized into the range Node timers
 *   support before use.
 * @returns The task's own result if it settles first, otherwise a
 *   `success: false` result describing the timeout.
 * @throws Whatever `task` rejects with, if it rejects before the timeout.
 */
export const withTimeout = async (
  task: Promise<TextExtractionResult>,
  taskName: string,
  timeoutMs: number = DEFAULT_PARSE_TIMEOUT_MS,
): Promise<TextExtractionResult> => {
  const delayMs = normalizeTimeout(timeoutMs);
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<TextExtractionResult>((resolve) => {
    timer = setTimeout(() => resolve(timeoutResult(taskName, delayMs)), delayMs);
  });
  try {
    // Promise.race subscribes to `task`, so a rejection that arrives after
    // the timeout has already won is still handled (no unhandled rejection).
    return await Promise.race([task, timeout]);
  } finally {
    // Always release the timer so a fast task does not leave a pending
    // handle that keeps the process alive.
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
};

Configure Feed

Configure Feed