refactor(file-upload): replace monolithic PDF extractor with strategy pattern

mokkenstorm.dev / cv-manager

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

because I got bored of customising my CV for every job

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

refactor(file-upload): replace monolithic PDF extractor with strategy pattern

Niels Mokkenstorm 2 months ago 08724976 8b59e1df

+626 -32

19 changed files

expand all collapse all

apps

server

eng.traineddata

test

assets

.gitkeep

pdf-extraction.integration.ts

packages

file-upload

eng.traineddata

jest.config.ts

package.json

src

__tests__

fixtures

.gitkeep

pdf-extraction.integration.test.ts

extractors

pdf

__tests__

pdf-extractor.spec.ts

strategies.spec.ts

pdf.extractor.ts

strategies

pdf-parse.strategy.ts

pdf2json.strategy.ts

tesseract-ocr.strategy.ts

types.ts

pdf.extractor.ts

file-extraction.module.ts

index.ts

types.ts

apps/server/eng.traineddata

reviewed

This is a binary file and will not be displayed.

apps/server/test/assets/.gitkeep

reviewed

This is a binary file and will not be displayed.

+93

apps/server/test/pdf-extraction.integration.ts

reviewed

··· 1 1 + import { readdirSync, readFileSync } from "node:fs"; 2 2 + import { join } from "node:path"; 3 3 + import { PDFExtractor } from "@cv/file-upload"; 4 4 + import type { PdfExtractionStrategy } from "@cv/file-upload"; 5 5 + import { describe, expect, it } from "vitest"; 6 6 + 7 7 + /** 8 8 + * Lazy-import strategies to avoid pulling heavy deps at module level. 9 9 + * Each factory returns a fresh strategy instance. 10 10 + */ 11 11 + const strategyFactories: Record<string, () => Promise<PdfExtractionStrategy>> = { 12 12 + "pdf-parse": async () => { 13 13 + const { PdfParseStrategy } = await import( 14 14 + "@cv/file-upload/src/extractors/pdf/strategies/pdf-parse.strategy" 15 15 + ); 16 16 + return new PdfParseStrategy(); 17 17 + }, 18 18 + pdf2json: async () => { 19 19 + const { Pdf2JsonStrategy } = await import( 20 20 + "@cv/file-upload/src/extractors/pdf/strategies/pdf2json.strategy" 21 21 + ); 22 22 + return new Pdf2JsonStrategy(); 23 23 + }, 24 24 + "tesseract-ocr": async () => { 25 25 + const { TesseractOcrStrategy } = await import( 26 26 + "@cv/file-upload/src/extractors/pdf/strategies/tesseract-ocr.strategy" 27 27 + ); 28 28 + return new TesseractOcrStrategy(); 29 29 + }, 30 30 + }; 31 31 + 32 32 + const strategyNames = Object.keys(strategyFactories); 33 33 + 34 34 + const ASSETS_DIR = join(__dirname, "assets"); 35 35 + 36 36 + const pdfFixtures = readdirSync(ASSETS_DIR).filter((f) => 37 37 + f.toLowerCase().endsWith(".pdf"), 38 38 + ); 39 39 + 40 40 + describe("PDFExtractor (real-world CVs)", () => { 41 41 + if (pdfFixtures.length === 0) { 42 42 + it.skip("no PDF fixtures — drop real CVs into apps/server/test/assets/", () => {}); 43 43 + return; 44 44 + } 45 45 + 46 46 + describe("composite extractor", () => { 47 47 + const extractor = new PDFExtractor(); 48 48 + 49 49 + it.each(pdfFixtures)("extracts non-empty text from %s", async (filename) => { 50 50 + const buffer = readFileSync(join(ASSETS_DIR, filename)); 51 51 + 52 52 + expect(buffer.slice(0, 
5).toString()).toBe("%PDF-"); 53 53 + 54 54 + const result = await extractor.extract(buffer); 55 55 + 56 56 + console.log( 57 57 + `[composite][${filename}] strategy=${extractor.lastUsedStrategy} | ${buffer.length} bytes -> success=${result.success}`, 58 58 + ); 59 59 + 60 60 + if (result.success) { 61 61 + const trimmed = result.text.trim(); 62 62 + console.log(`[composite][${filename}] ${trimmed.length} chars extracted`); 63 63 + console.log(`[composite][${filename}] first 300 chars: ${trimmed.slice(0, 300)}`); 64 64 + expect(trimmed.length).toBeGreaterThan(50); 65 65 + } else { 66 66 + console.log(`[composite][${filename}] extraction failed: ${result.error}`); 67 67 + expect.fail(`Extraction failed for ${filename}: ${result.error}`); 68 68 + } 69 69 + }); 70 70 + }); 71 71 + 72 72 + describe.each(strategyNames)("strategy: %s", (strategyName) => { 73 73 + it.each(pdfFixtures)(`extracts text from %s`, async (filename) => { 74 74 + const strategy = await strategyFactories[strategyName](); 75 75 + const buffer = readFileSync(join(ASSETS_DIR, filename)); 76 76 + 77 77 + const result = await strategy.extract(buffer); 78 78 + 79 79 + if (result.success) { 80 80 + const trimmed = result.text.trim(); 81 81 + console.log( 82 82 + `[${strategyName}][${filename}] ${trimmed.length} chars | first 200: ${trimmed.slice(0, 200)}`, 83 83 + ); 84 84 + } else { 85 85 + console.log(`[${strategyName}][${filename}] failed: ${result.error}`); 86 86 + } 87 87 + 88 88 + // Log result but don't fail — individual strategies may legitimately fail 89 89 + // on certain PDFs (that's why we have the composite) 90 90 + expect(result).toBeDefined(); 91 91 + }); 92 92 + }); 93 93 + });

packages/file-upload/eng.traineddata

reviewed

This is a binary file and will not be displayed.

+23

packages/file-upload/jest.config.ts

reviewed

··· 1 1 + import type { Config } from "jest"; 2 2 + 3 3 + const config: Config = { 4 4 + preset: "ts-jest", 5 5 + testEnvironment: "node", 6 6 + rootDir: ".", 7 7 + moduleFileExtensions: ["ts", "tsx", "js", "json"], 8 8 + testMatch: ["<rootDir>/src/**/*.spec.ts", "<rootDir>/src/**/*.test.ts"], 9 9 + transform: { 10 10 + "^.+\\.(t|j)sx?$": [ 11 11 + "ts-jest", 12 12 + { 13 13 + diagnostics: false, 14 14 + tsconfig: { 15 15 + sourceMap: true, 16 16 + inlineSourceMap: true, 17 17 + }, 18 18 + }, 19 19 + ], 20 20 + }, 21 21 + }; 22 22 + 23 23 + export default config;

+5 -1

packages/file-upload/package.json

reviewed

··· 10 10 }, 11 11 "dependencies": { 12 12 "@nestjs/common": "^11.1.3", 13 13 + "canvas": "^3.1.0", 13 14 "mammoth": "^1.6.0", 14 15 "pdf-parse": "^1.1.1", 15 15 - "zod": "^3.23.8" 16 16 + "pdf2json": "^3.1.4", 17 17 + "pdfjs-dist": "^3.11.174", 18 18 + "tesseract.js": "^5.1.1", 19 19 + "zod": "^4.3.6" 16 20 }, 17 21 "peerDependencies": { 18 22 "@nestjs/common": "^11.0.0"

packages/file-upload/src/__tests__/fixtures/.gitkeep

reviewed

This is a binary file and will not be displayed.

+39

packages/file-upload/src/__tests__/pdf-extraction.integration.test.ts

reviewed

··· 1 1 + import { readdirSync, readFileSync } from "node:fs"; 2 2 + import { join } from "node:path"; 3 3 + import { PDFExtractor } from "../extractors/pdf/pdf.extractor"; 4 4 + 5 5 + const FIXTURES_DIR = join(__dirname, "fixtures"); 6 6 + 7 7 + const pdfFixtures = readdirSync(FIXTURES_DIR).filter((f) => 8 8 + f.toLowerCase().endsWith(".pdf"), 9 9 + ); 10 10 + 11 11 + const extractor = new PDFExtractor(); 12 12 + 13 13 + describe("PDFExtractor (real-world CVs)", () => { 14 14 + if (pdfFixtures.length === 0) { 15 15 + it.skip("no PDF fixtures found — drop real CVs into packages/file-upload/src/__tests__/fixtures/", () => {}); 16 16 + return; 17 17 + } 18 18 + 19 19 + it.each(pdfFixtures)("extracts non-empty text from %s", async (filename) => { 20 20 + const buffer = readFileSync(join(FIXTURES_DIR, filename)); 21 21 + 22 22 + expect(Buffer.isBuffer(buffer)).toBe(true); 23 23 + expect(buffer.slice(0, 5).toString()).toBe("%PDF-"); 24 24 + 25 25 + const result = await extractor.extract(buffer); 26 26 + 27 27 + expect(result.success).toBe(true); 28 28 + 29 29 + if (!result.success) return; 30 30 + 31 31 + const trimmed = result.text.trim(); 32 32 + console.log( 33 33 + `[${filename}] strategy=${extractor.lastUsedStrategy} | ${buffer.length} bytes -> ${trimmed.length} chars`, 34 34 + ); 35 35 + console.log(`[${filename}] first 200 chars: ${trimmed.slice(0, 200)}`); 36 36 + 37 37 + expect(trimmed.length).toBeGreaterThan(0); 38 38 + }); 39 39 + });

-28

packages/file-upload/src/extractors/pdf.extractor.ts

reviewed

··· 1 1 - import type { TextExtractor, TextExtractionResult } from '../types'; 2 2 - 3 3 - // eslint-disable-next-line @typescript-eslint/no-require-imports 4 4 - const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>; 5 5 - 6 6 - /** 7 7 - * Text extractor for PDF files 8 8 - */ 9 9 - export class PDFExtractor implements TextExtractor { 10 10 - readonly name = 'pdf'; 11 11 - readonly supportedMimeTypes = ['application/pdf'] as const; 12 12 - 13 13 - canHandle(mimeType: string): boolean { 14 14 - return this.supportedMimeTypes.includes(mimeType as typeof this.supportedMimeTypes[number]); 15 15 - } 16 16 - 17 17 - async extract(buffer: Buffer): Promise<TextExtractionResult> { 18 18 - try { 19 19 - const data = await pdfParse(buffer); 20 20 - return { success: true, text: data.text }; 21 21 - } catch (error) { 22 22 - return { 23 23 - success: false, 24 24 - error: `PDF extraction failed: ${error instanceof Error ? error.message : 'Unknown error'}`, 25 25 - }; 26 26 - } 27 27 - } 28 28 - }

+137

packages/file-upload/src/extractors/pdf/__tests__/pdf-extractor.spec.ts

reviewed

··· 1 1 + import type { TextExtractionResult } from '../../../types'; 2 2 + import type { PdfExtractionStrategy } from '../types'; 3 3 + import { PDFExtractor } from '../pdf.extractor'; 4 4 + 5 5 + const makeStrategy = ( 6 6 + name: string, 7 7 + priority: number, 8 8 + result: TextExtractionResult, 9 9 + ): PdfExtractionStrategy & { extract: jest.Mock } => ({ 10 10 + name, 11 11 + priority, 12 12 + extract: jest.fn().mockResolvedValue(result), 13 13 + }); 14 14 + 15 15 + const buf = Buffer.from('fake-pdf'); 16 16 + 17 17 + describe('PDFExtractor (composite)', () => { 18 18 + it('picks the first strategy that meets the threshold', async () => { 19 19 + const stratA = makeStrategy('a', 0, { success: true, text: 'x'.repeat(100) }); 20 20 + const stratB = makeStrategy('b', 1, { success: true, text: 'y'.repeat(200) }); 21 21 + 22 22 + const extractor = new PDFExtractor({ 23 23 + strategies: [stratA, stratB], 24 24 + minCharThreshold: 50, 25 25 + }); 26 26 + 27 27 + const result = await extractor.extract(buf); 28 28 + 29 29 + expect(result).toEqual({ success: true, text: 'x'.repeat(100) }); 30 30 + expect(stratA.extract).toHaveBeenCalledWith(buf); 31 31 + expect(stratB.extract).not.toHaveBeenCalled(); 32 32 + expect(extractor.lastUsedStrategy).toBe('a'); 33 33 + }); 34 34 + 35 35 + it('skips a strategy below threshold and picks the next one that meets it', async () => { 36 36 + const stratA = makeStrategy('a', 0, { success: true, text: 'x'.repeat(10) }); 37 37 + const stratB = makeStrategy('b', 1, { success: true, text: 'y'.repeat(200) }); 38 38 + 39 39 + const extractor = new PDFExtractor({ 40 40 + strategies: [stratA, stratB], 41 41 + minCharThreshold: 50, 42 42 + }); 43 43 + 44 44 + const result = await extractor.extract(buf); 45 45 + 46 46 + expect(result).toEqual({ success: true, text: 'y'.repeat(200) }); 47 47 + expect(extractor.lastUsedStrategy).toBe('b'); 48 48 + }); 49 49 + 50 50 + it('falls back to the longest non-empty result when nothing meets threshold', 
async () => { 51 51 + const stratA = makeStrategy('short', 0, { success: true, text: 'x'.repeat(10) }); 52 52 + const stratB = makeStrategy('longer', 1, { success: true, text: 'y'.repeat(30) }); 53 53 + const stratC = makeStrategy('medium', 2, { success: true, text: 'z'.repeat(20) }); 54 54 + 55 55 + const extractor = new PDFExtractor({ 56 56 + strategies: [stratA, stratB, stratC], 57 57 + minCharThreshold: 100, 58 58 + }); 59 59 + 60 60 + const result = await extractor.extract(buf); 61 61 + 62 62 + expect(result).toEqual({ success: true, text: 'y'.repeat(30) }); 63 63 + expect(extractor.lastUsedStrategy).toBe('longer'); 64 64 + }); 65 65 + 66 66 + it('returns failure when all strategies fail', async () => { 67 67 + const stratA = makeStrategy('a', 0, { success: false, error: 'boom' }); 68 68 + const stratB = makeStrategy('b', 1, { success: false, error: 'bang' }); 69 69 + 70 70 + const extractor = new PDFExtractor({ strategies: [stratA, stratB] }); 71 71 + 72 72 + const result = await extractor.extract(buf); 73 73 + 74 74 + expect(result).toEqual({ success: false, error: 'All PDF extraction strategies failed' }); 75 75 + expect(extractor.lastUsedStrategy).toBeUndefined(); 76 76 + }); 77 77 + 78 78 + it('returns failure with empty strategies array', async () => { 79 79 + const extractor = new PDFExtractor({ strategies: [] }); 80 80 + 81 81 + const result = await extractor.extract(buf); 82 82 + 83 83 + expect(result).toEqual({ success: false, error: 'No PDF extraction strategies configured' }); 84 84 + }); 85 85 + 86 86 + it('respects custom threshold', async () => { 87 87 + const strat = makeStrategy('a', 0, { success: true, text: 'x'.repeat(5) }); 88 88 + 89 89 + const extractor = new PDFExtractor({ 90 90 + strategies: [strat], 91 91 + minCharThreshold: 3, 92 92 + }); 93 93 + 94 94 + const result = await extractor.extract(buf); 95 95 + 96 96 + expect(result).toEqual({ success: true, text: 'x'.repeat(5) }); 97 97 + expect(extractor.lastUsedStrategy).toBe('a'); 98 
98 + }); 99 99 + 100 100 + it('sorts strategies by priority regardless of insertion order', async () => { 101 101 + const stratHigh = makeStrategy('high', 10, { success: true, text: 'x'.repeat(100) }); 102 102 + const stratLow = makeStrategy('low', 0, { success: true, text: 'y'.repeat(100) }); 103 103 + 104 104 + const extractor = new PDFExtractor({ 105 105 + strategies: [stratHigh, stratLow], 106 106 + minCharThreshold: 50, 107 107 + }); 108 108 + 109 109 + const result = await extractor.extract(buf); 110 110 + 111 111 + expect(result).toEqual({ success: true, text: 'y'.repeat(100) }); 112 112 + expect(extractor.lastUsedStrategy).toBe('low'); 113 113 + expect(stratHigh.extract).not.toHaveBeenCalled(); 114 114 + }); 115 115 + 116 116 + it('skips failed strategies and continues to the next', async () => { 117 117 + const stratFail = makeStrategy('fail', 0, { success: false, error: 'oops' }); 118 118 + const stratOk = makeStrategy('ok', 1, { success: true, text: 'x'.repeat(100) }); 119 119 + 120 120 + const extractor = new PDFExtractor({ 121 121 + strategies: [stratFail, stratOk], 122 122 + minCharThreshold: 50, 123 123 + }); 124 124 + 125 125 + const result = await extractor.extract(buf); 126 126 + 127 127 + expect(result).toEqual({ success: true, text: 'x'.repeat(100) }); 128 128 + expect(extractor.lastUsedStrategy).toBe('ok'); 129 129 + }); 130 130 + 131 131 + it('reports canHandle correctly', () => { 132 132 + const extractor = new PDFExtractor({ strategies: [] }); 133 133 + 134 134 + expect(extractor.canHandle('application/pdf')).toBe(true); 135 135 + expect(extractor.canHandle('text/plain')).toBe(false); 136 136 + }); 137 137 + });

+57

packages/file-upload/src/extractors/pdf/__tests__/strategies.spec.ts

reviewed

··· 1 1 + import { PdfParseStrategy } from '../strategies/pdf-parse.strategy'; 2 2 + import { Pdf2JsonStrategy } from '../strategies/pdf2json.strategy'; 3 3 + import { TesseractOcrStrategy } from '../strategies/tesseract-ocr.strategy'; 4 4 + 5 5 + describe('PdfParseStrategy', () => { 6 6 + const strategy = new PdfParseStrategy(); 7 7 + 8 8 + it('has correct name and priority', () => { 9 9 + expect(strategy.name).toBe('pdf-parse'); 10 10 + expect(strategy.priority).toBe(0); 11 11 + }); 12 12 + 13 13 + it('returns failure on invalid buffer', async () => { 14 14 + const result = await strategy.extract(Buffer.from('not-a-pdf')); 15 15 + 16 16 + expect(result.success).toBe(false); 17 17 + if (!result.success) { 18 18 + expect(result.error).toContain('pdf-parse failed'); 19 19 + } 20 20 + }); 21 21 + }); 22 22 + 23 23 + describe('Pdf2JsonStrategy', () => { 24 24 + const strategy = new Pdf2JsonStrategy(); 25 25 + 26 26 + it('has correct name and priority', () => { 27 27 + expect(strategy.name).toBe('pdf2json'); 28 28 + expect(strategy.priority).toBe(1); 29 29 + }); 30 30 + 31 31 + it('returns failure on invalid buffer', async () => { 32 32 + const result = await strategy.extract(Buffer.from('not-a-pdf')); 33 33 + 34 34 + expect(result.success).toBe(false); 35 35 + if (!result.success) { 36 36 + expect(result.error).toContain('pdf2json failed'); 37 37 + } 38 38 + }); 39 39 + }); 40 40 + 41 41 + describe('TesseractOcrStrategy', () => { 42 42 + const strategy = new TesseractOcrStrategy(); 43 43 + 44 44 + it('has correct name and priority', () => { 45 45 + expect(strategy.name).toBe('tesseract-ocr'); 46 46 + expect(strategy.priority).toBe(2); 47 47 + }); 48 48 + 49 49 + it('returns failure on invalid buffer', async () => { 50 50 + const result = await strategy.extract(Buffer.from('not-a-pdf')); 51 51 + 52 52 + expect(result.success).toBe(false); 53 53 + if (!result.success) { 54 54 + expect(result.error).toContain('tesseract-ocr failed'); 55 55 + } 56 56 + }); 57 57 + });

+86

packages/file-upload/src/extractors/pdf/pdf.extractor.ts

reviewed

import { Logger } from '@nestjs/common';
import type { TextExtractor, TextExtractionResult } from '../../types';
import type { PdfExtractionStrategy, PdfExtractorConfig } from './types';
import { PdfParseStrategy } from './strategies/pdf-parse.strategy';
import { Pdf2JsonStrategy } from './strategies/pdf2json.strategy';
import { TesseractOcrStrategy } from './strategies/tesseract-ocr.strategy';

/** Threshold used when `PdfExtractorConfig.minCharThreshold` is not supplied. */
const DEFAULT_MIN_CHAR_THRESHOLD = 50;

/** Builds a fresh instance of each built-in strategy (used when none are configured). */
const defaultStrategies = (): PdfExtractionStrategy[] => [
  new PdfParseStrategy(),
  new Pdf2JsonStrategy(),
  new TesseractOcrStrategy(),
];

/**
 * Composite PDF text extractor.
 *
 * Runs the configured strategies one at a time, lowest `priority` first, and
 * returns the first successful result whose trimmed text length reaches the
 * minimum character threshold. If no strategy reaches the threshold, the
 * longest non-empty successful result is returned instead. If nothing produced
 * any text, a failure result is returned.
 */
export class PDFExtractor implements TextExtractor {
  readonly name = 'pdf';
  readonly supportedMimeTypes = ['application/pdf'] as const;

  private readonly strategies: PdfExtractionStrategy[];
  private readonly minCharThreshold: number;
  private readonly logger = new Logger(PDFExtractor.name);

  /**
   * Name of the strategy that produced the result of the most recent
   * `extract` call, or `undefined` if that call failed.
   *
   * NOTE(review): this is per-instance state, so overlapping `extract` calls
   * on one shared instance can overwrite each other's value.
   */
  lastUsedStrategy: string | undefined;

  /**
   * @param config Optional overrides. `strategies` replaces the built-in set;
   *   `minCharThreshold` replaces the default of 50.
   */
  constructor(config: PdfExtractorConfig = {}) {
    const raw = config.strategies ?? defaultStrategies();
    // Copy before sorting so the caller's array is never reordered.
    this.strategies = [...raw].sort((a, b) => a.priority - b.priority);
    this.minCharThreshold = config.minCharThreshold ?? DEFAULT_MIN_CHAR_THRESHOLD;
  }

  /** Returns true only for MIME types listed in `supportedMimeTypes`. */
  canHandle(mimeType: string): boolean {
    return this.supportedMimeTypes.includes(mimeType as typeof this.supportedMimeTypes[number]);
  }

  /**
   * Extracts text from a PDF buffer using the configured strategies.
   *
   * @param buffer Raw PDF bytes.
   * @returns The first result meeting the threshold, else the longest
   *   non-empty result, else a failure result. Never rejects because of a
   *   strategy error: a strategy that throws is treated as a failed strategy.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    // Reset up front so no exit path can report the strategy of an earlier call.
    this.lastUsedStrategy = undefined;

    if (this.strategies.length === 0) {
      return { success: false, error: 'No PDF extraction strategies configured' };
    }

    let bestResult: TextExtractionResult | undefined;
    let bestStrategyName: string | undefined;
    let bestLength = 0;

    for (const strategy of this.strategies) {
      const result = await this.runStrategy(strategy, buffer);

      if (!result.success) {
        this.logger.debug(`Strategy '${strategy.name}' failed: ${result.error}`);
        continue;
      }

      const charCount = result.text.trim().length;
      this.logger.debug(`Strategy '${strategy.name}' extracted ${charCount} chars`);

      if (charCount >= this.minCharThreshold) {
        this.logger.log(`Selected strategy '${strategy.name}' (${charCount} chars)`);
        this.lastUsedStrategy = strategy.name;
        return result;
      }

      // Strict comparison against an initial 0: results that are empty after
      // trimming never become the fallback, and ties keep the earlier strategy.
      if (charCount > bestLength) {
        bestResult = result;
        bestStrategyName = strategy.name;
        bestLength = charCount;
      }
    }

    if (bestResult) {
      this.logger.warn(
        `No strategy met threshold (${this.minCharThreshold}), falling back to '${bestStrategyName}' (${bestLength} chars)`,
      );
      this.lastUsedStrategy = bestStrategyName;
      return bestResult;
    }

    return { success: false, error: 'All PDF extraction strategies failed' };
  }

  /**
   * Runs one strategy and converts a thrown error or rejected promise into a
   * failure result, so a misbehaving (e.g. user-supplied) strategy cannot
   * abort the fallback chain.
   */
  private async runStrategy(
    strategy: PdfExtractionStrategy,
    buffer: Buffer,
  ): Promise<TextExtractionResult> {
    try {
      return await strategy.extract(buffer);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return { success: false, error: `${strategy.name} threw: ${reason}` };
    }
  }
}

+26

packages/file-upload/src/extractors/pdf/strategies/pdf-parse.strategy.ts

reviewed

··· 1 1 + import type { TextExtractionResult } from '../../../types'; 2 2 + import type { PdfExtractionStrategy } from '../types'; 3 3 + 4 4 + // eslint-disable-next-line @typescript-eslint/no-require-imports 5 5 + const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>; 6 6 + 7 7 + /** 8 8 + * Fast text extraction using pdf-parse. 9 9 + * Works well for text-based PDFs but fails on design-heavy / vector-path PDFs. 10 10 + */ 11 11 + export class PdfParseStrategy implements PdfExtractionStrategy { 12 12 + readonly name = 'pdf-parse'; 13 13 + readonly priority = 0; 14 14 + 15 15 + async extract(buffer: Buffer): Promise<TextExtractionResult> { 16 16 + try { 17 17 + const data = await pdfParse(buffer); 18 18 + return { success: true, text: data.text }; 19 19 + } catch (error) { 20 20 + return { 21 21 + success: false, 22 22 + error: `pdf-parse failed: ${error instanceof Error ? error.message : 'Unknown error'}`, 23 23 + }; 24 24 + } 25 25 + } 26 26 + }

+59

packages/file-upload/src/extractors/pdf/strategies/pdf2json.strategy.ts

reviewed

import type { TextExtractionResult } from '../../../types';
import type { PdfExtractionStrategy } from '../types';

/**
 * Decodes one URI-encoded text run.
 * Falls back to the raw string when it is not valid percent-encoding (for
 * example a literal `%`), so one odd run cannot discard the whole document.
 */
const decodeRunText = (encoded: string): string => {
  try {
    return decodeURIComponent(encoded);
  } catch {
    return encoded;
  }
};

/**
 * Turns the payload of a `pdfParser_dataError` event into a message.
 * Accepts `unknown` because the `in` operator throws on non-object values,
 * and a throw inside the event handler would leave the promise unsettled.
 */
const describeParserError = (errData: unknown): string => {
  if (errData instanceof Error) {
    return errData.message;
  }
  if (typeof errData === 'object' && errData !== null && 'parserError' in errData) {
    return String((errData as { parserError: unknown }).parserError);
  }
  return 'pdf2json parse error';
};

/**
 * Text extraction using pdf2json.
 * Better at parsing layout-rich PDFs and some vector-rendered text.
 */
export class Pdf2JsonStrategy implements PdfExtractionStrategy {
  readonly name = 'pdf2json';
  readonly priority = 1;

  /**
   * Parses the buffer with pdf2json and joins the decoded text runs.
   *
   * @param buffer Raw PDF bytes.
   * @returns A success result with the extracted text, or a failure result
   *   whose error starts with `pdf2json failed:`. Never rejects.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    try {
      // Loaded lazily so the dependency is only pulled in when this strategy runs.
      const { default: PDFParser } = await import('pdf2json');
      const text = await this.parseBuffer(PDFParser, buffer);
      return { success: true, text };
    } catch (error) {
      return {
        success: false,
        error: `pdf2json failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
      };
    }
  }

  /**
   * Wraps pdf2json's event-based parser in a promise.
   * Runs within a text item are concatenated, text items within a page are
   * joined with a space, and pages are joined with a newline.
   */
  private parseBuffer(
    PDFParser: typeof import('pdf2json').default,
    buffer: Buffer,
  ): Promise<string> {
    return new Promise((resolve, reject) => {
      const parser = new PDFParser(null, true);

      parser.on('pdfParser_dataReady', (pdfData) => {
        try {
          const text = pdfData.Pages.map((page) =>
            page.Texts.map((textItem) =>
              textItem.R.map((run) => decodeRunText(run.T)).join(''),
            ).join(' '),
          ).join('\n');

          resolve(text);
        } catch (err) {
          // Unexpected document shape: reject rather than throw into the emitter.
          reject(err);
        }
      });

      parser.on('pdfParser_dataError', (errData) => {
        reject(new Error(describeParserError(errData)));
      });

      parser.parseBuffer(buffer);
    });
  }
}

+75

packages/file-upload/src/extractors/pdf/strategies/tesseract-ocr.strategy.ts

reviewed

import type { TextExtractionResult } from '../../../types';
import type { PdfExtractionStrategy } from '../types';

/**
 * OCR fallback for image-based or vector-path PDFs.
 * Renders each page to a PNG via pdfjs-dist (legacy/Node build) + canvas,
 * then OCRs with tesseract.js. Heavy deps are lazy-loaded on first use.
 */
export class TesseractOcrStrategy implements PdfExtractionStrategy {
  readonly name = 'tesseract-ocr';
  readonly priority = 2;

  /**
   * Renders every page of the PDF to an image and OCRs the images.
   *
   * @param buffer Raw PDF bytes.
   * @returns A success result with the recognised text (pages joined by a
   *   newline), or a failure result when no page was rendered or any step
   *   threw. Never rejects.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    try {
      const pageImages = await this.renderPagesToImages(buffer);

      if (pageImages.length === 0) {
        return { success: false, error: 'tesseract-ocr: no pages rendered from PDF' };
      }

      const text = await this.ocrImages(pageImages);
      return { success: true, text };
    } catch (error) {
      return {
        success: false,
        error: `tesseract-ocr failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
      };
    }
  }

  /**
   * Renders each page at 2x scale onto a node-canvas and returns one PNG
   * buffer per page, in page order.
   *
   * The pdf.js loading task is always destroyed and each page is cleaned up
   * after rendering, so repeated calls do not accumulate pdf.js resources.
   */
  private async renderPagesToImages(buffer: Buffer): Promise<Buffer[]> {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js') as typeof import('pdfjs-dist');
    const { createCanvas } = await import('canvas');

    // Hand pdf.js its own copy of the bytes rather than the caller's Buffer.
    const data = new Uint8Array(buffer);
    const loadingTask = pdfjsLib.getDocument({ data });

    try {
      const doc = await loadingTask.promise;
      const images: Buffer[] = [];

      for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
        const page = await doc.getPage(pageNumber);

        try {
          const viewport = page.getViewport({ scale: 2.0 });
          const canvas = createCanvas(viewport.width, viewport.height);
          const context = canvas.getContext('2d');

          await page.render({
            // node-canvas's context is not typed as the DOM CanvasRenderingContext2D.
            canvasContext: context as unknown as CanvasRenderingContext2D,
            viewport,
          }).promise;

          images.push(canvas.toBuffer('image/png'));
        } finally {
          page.cleanup();
        }
      }

      return images;
    } finally {
      // Runs on success and on failure (including a document that failed to load).
      await loadingTask.destroy();
    }
  }

  /**
   * OCRs the images one after another with a single English tesseract.js
   * worker and joins the per-page text with newlines. The worker is always
   * terminated, even when recognition throws.
   */
  private async ocrImages(images: Buffer[]): Promise<string> {
    const Tesseract = await import('tesseract.js');
    const worker = await Tesseract.createWorker('eng');

    try {
      const pages: string[] = [];

      for (const image of images) {
        const { data } = await worker.recognize(image);
        pages.push(data.text);
      }

      return pages.join('\n');
    } finally {
      await worker.terminate();
    }
  }
}

+20

packages/file-upload/src/extractors/pdf/types.ts

reviewed

··· 1 1 + import type { TextExtractionResult } from '../../types'; 2 2 + 3 3 + /** 4 4 + * Strategy for extracting text from a PDF buffer. 5 5 + * Distinct from the top-level TextExtractor which routes by MIME type — 6 6 + * these are PDF-specific backends tried in priority order. 7 7 + */ 8 8 + export interface PdfExtractionStrategy { 9 9 + readonly name: string; 10 10 + /** Lower values are tried first */ 11 11 + readonly priority: number; 12 12 + extract(buffer: Buffer): Promise<TextExtractionResult>; 13 13 + } 14 14 + 15 15 + export interface PdfExtractorConfig { 16 16 + /** Minimum extracted character count to accept a strategy result (default: 50) */ 17 17 + minCharThreshold?: number; 18 18 + /** Override default strategies */ 19 19 + strategies?: PdfExtractionStrategy[]; 20 20 + }

+1 -1

packages/file-upload/src/file-extraction.module.ts

reviewed

··· 1 1 import { DynamicModule, Module } from '@nestjs/common'; 2 2 import type { TextExtractor } from './types'; 3 3 import { TextExtractorRegistry } from './extractor-registry'; 4 4 - import { PDFExtractor } from './extractors/pdf.extractor'; 4 4 + import { PDFExtractor } from './extractors/pdf/pdf.extractor'; 5 5 import { DOCXExtractor } from './extractors/docx.extractor'; 6 6 import { PlainTextExtractor } from './extractors/plain-text.extractor'; 7 7

+4 -1

packages/file-upload/src/index.ts

reviewed

··· 10 10 export { SupportedMimeTypes, FileUploadSchema } from './types'; 11 11 12 12 // Extractor implementations 13 13 - export { PDFExtractor } from './extractors/pdf.extractor'; 13 13 + export { PDFExtractor } from './extractors/pdf/pdf.extractor'; 14 14 export { DOCXExtractor } from './extractors/docx.extractor'; 15 15 export { PlainTextExtractor } from './extractors/plain-text.extractor'; 16 16 + 17 17 + // PDF strategy types (for extensibility) 18 18 + export type { PdfExtractionStrategy, PdfExtractorConfig } from './extractors/pdf/types'; 16 19 17 20 // Extractor registry 18 21 export { TextExtractorRegistry } from './extractor-registry';

+1 -1

packages/file-upload/src/types.ts

reviewed

··· 1 1 - import { z } from 'zod'; 1 1 + import { z } from "zod/v4"; 2 2 3 3 /** 4 4 * Supported MIME types for CV uploads