because I got bored of customising my CV for every job
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

refactor(file-upload): replace monolithic PDF extractor with strategy pattern

+626 -32
apps/server/eng.traineddata

This is a binary file and will not be displayed.

apps/server/test/assets/.gitkeep

This is a binary file and will not be displayed.

+93
apps/server/test/pdf-extraction.integration.ts
import { readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { PDFExtractor } from "@cv/file-upload";
import type { PdfExtractionStrategy } from "@cv/file-upload";
import { describe, expect, it } from "vitest";

/**
 * Lazy-import strategies to avoid pulling heavy deps at module level.
 * Each factory returns a fresh strategy instance.
 */
const strategyFactories: Record<string, () => Promise<PdfExtractionStrategy>> = {
  "pdf-parse": async () => {
    const { PdfParseStrategy } = await import(
      "@cv/file-upload/src/extractors/pdf/strategies/pdf-parse.strategy"
    );
    return new PdfParseStrategy();
  },
  pdf2json: async () => {
    const { Pdf2JsonStrategy } = await import(
      "@cv/file-upload/src/extractors/pdf/strategies/pdf2json.strategy"
    );
    return new Pdf2JsonStrategy();
  },
  "tesseract-ocr": async () => {
    const { TesseractOcrStrategy } = await import(
      "@cv/file-upload/src/extractors/pdf/strategies/tesseract-ocr.strategy"
    );
    return new TesseractOcrStrategy();
  },
};

/**
 * [name, factory] pairs. Iterating pairs (rather than names) means the test
 * body never has to index back into the record and handle a missing key.
 */
const strategyEntries = Object.entries(strategyFactories);

const ASSETS_DIR = join(__dirname, "assets");

/**
 * Per-test timeout. Rendering pages and running OCR is far slower than text
 * parsing, so these cases get an explicit, generous budget instead of relying
 * on the runner default.
 */
const EXTRACTION_TIMEOUT_MS = 120000;

const pdfFixtures = readdirSync(ASSETS_DIR).filter((f) =>
  f.toLowerCase().endsWith(".pdf"),
);

describe("PDFExtractor (real-world CVs)", () => {
  // Fixtures are real CVs and are not committed; without any, register a
  // single skipped placeholder so the suite is visibly inert rather than empty.
  if (pdfFixtures.length === 0) {
    it.skip("no PDF fixtures — drop real CVs into apps/server/test/assets/", () => {});
    return;
  }

  describe("composite extractor", () => {
    const extractor = new PDFExtractor();

    it.each(pdfFixtures)(
      "extracts non-empty text from %s",
      async (filename) => {
        const buffer = readFileSync(join(ASSETS_DIR, filename));

        // Sanity-check the fixture really is a PDF before blaming the extractor.
        // `subarray` replaces the deprecated `Buffer.prototype.slice`.
        expect(buffer.subarray(0, 5).toString()).toBe("%PDF-");

        const result = await extractor.extract(buffer);

        console.log(
          `[composite][${filename}] strategy=${extractor.lastUsedStrategy} | ${buffer.length} bytes -> success=${result.success}`,
        );

        if (result.success) {
          const trimmed = result.text.trim();
          console.log(`[composite][${filename}] ${trimmed.length} chars extracted`);
          console.log(`[composite][${filename}] first 300 chars: ${trimmed.slice(0, 300)}`);
          expect(trimmed.length).toBeGreaterThan(50);
        } else {
          console.log(`[composite][${filename}] extraction failed: ${result.error}`);
          expect.fail(`Extraction failed for ${filename}: ${result.error}`);
        }
      },
      EXTRACTION_TIMEOUT_MS,
    );
  });

  describe.each(strategyEntries)("strategy: %s", (strategyName, createStrategy) => {
    it.each(pdfFixtures)(
      `extracts text from %s`,
      async (filename) => {
        const strategy = await createStrategy();
        const buffer = readFileSync(join(ASSETS_DIR, filename));

        const result = await strategy.extract(buffer);

        if (result.success) {
          const trimmed = result.text.trim();
          console.log(
            `[${strategyName}][${filename}] ${trimmed.length} chars | first 200: ${trimmed.slice(0, 200)}`,
          );
        } else {
          console.log(`[${strategyName}][${filename}] failed: ${result.error}`);
        }

        // Log result but don't fail — individual strategies may legitimately fail
        // on certain PDFs (that's why we have the composite)
        expect(result).toBeDefined();
      },
      EXTRACTION_TIMEOUT_MS,
    );
  });
});
packages/file-upload/eng.traineddata

This is a binary file and will not be displayed.

+23
packages/file-upload/jest.config.ts
import type { Config } from "jest";

/**
 * Jest configuration for the file-upload package.
 *
 * Runs `*.spec.ts` and `*.test.ts` files under `src/` through ts-jest in a
 * Node environment.
 */
const config: Config = {
  preset: "ts-jest",
  testEnvironment: "node",
  rootDir: ".",
  moduleFileExtensions: ["ts", "tsx", "js", "json"],
  testMatch: ["<rootDir>/src/**/*.spec.ts", "<rootDir>/src/**/*.test.ts"],
  transform: {
    "^.+\\.(t|j)sx?$": [
      "ts-jest",
      {
        // Type errors are not reported while running tests.
        diagnostics: false,
        tsconfig: {
          // Only `sourceMap` is set: the TypeScript compiler rejects
          // `sourceMap` combined with `inlineSourceMap` (TS5053).
          sourceMap: true,
        },
      },
    ],
  },
};

export default config;
+5 -1
packages/file-upload/package.json
··· 10 10 }, 11 11 "dependencies": { 12 12 "@nestjs/common": "^11.1.3", 13 + "canvas": "^3.1.0", 13 14 "mammoth": "^1.6.0", 14 15 "pdf-parse": "^1.1.1", 15 - "zod": "^3.23.8" 16 + "pdf2json": "^3.1.4", 17 + "pdfjs-dist": "^3.11.174", 18 + "tesseract.js": "^5.1.1", 19 + "zod": "^4.3.6" 16 20 }, 17 21 "peerDependencies": { 18 22 "@nestjs/common": "^11.0.0"
packages/file-upload/src/__tests__/fixtures/.gitkeep

This is a binary file and will not be displayed.

+39
packages/file-upload/src/__tests__/pdf-extraction.integration.test.ts
import { readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { PDFExtractor } from "../extractors/pdf/pdf.extractor";

const FIXTURES_DIR = join(__dirname, "fixtures");

/**
 * Per-test timeout. The composite extractor may fall through to the OCR
 * strategy, which is much slower than text parsing, so the budget is explicit
 * rather than the runner default.
 */
const EXTRACTION_TIMEOUT_MS = 120000;

const pdfFixtures = readdirSync(FIXTURES_DIR).filter((f) =>
  f.toLowerCase().endsWith(".pdf"),
);

const extractor = new PDFExtractor();

describe("PDFExtractor (real-world CVs)", () => {
  // Fixtures are real CVs and are not committed; with none present, register a
  // single skipped placeholder so the suite is visibly inert rather than empty.
  if (pdfFixtures.length === 0) {
    it.skip("no PDF fixtures found — drop real CVs into packages/file-upload/src/__tests__/fixtures/", () => {});
    return;
  }

  it.each(pdfFixtures)(
    "extracts non-empty text from %s",
    async (filename) => {
      const buffer = readFileSync(join(FIXTURES_DIR, filename));

      expect(Buffer.isBuffer(buffer)).toBe(true);
      // `subarray` replaces the deprecated `Buffer.prototype.slice`.
      expect(buffer.subarray(0, 5).toString()).toBe("%PDF-");

      const result = await extractor.extract(buffer);

      // Throw with the extractor's own error text: a bare
      // `expect(result.success).toBe(true)` would only report "expected true".
      if (!result.success) {
        throw new Error(`Extraction failed for ${filename}: ${result.error}`);
      }

      const trimmed = result.text.trim();
      console.log(
        `[${filename}] strategy=${extractor.lastUsedStrategy} | ${buffer.length} bytes -> ${trimmed.length} chars`,
      );
      console.log(`[${filename}] first 200 chars: ${trimmed.slice(0, 200)}`);

      expect(trimmed.length).toBeGreaterThan(0);
    },
    EXTRACTION_TIMEOUT_MS,
  );
});
-28
packages/file-upload/src/extractors/pdf.extractor.ts
··· 1 - import type { TextExtractor, TextExtractionResult } from '../types'; 2 - 3 - // eslint-disable-next-line @typescript-eslint/no-require-imports 4 - const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>; 5 - 6 - /** 7 - * Text extractor for PDF files 8 - */ 9 - export class PDFExtractor implements TextExtractor { 10 - readonly name = 'pdf'; 11 - readonly supportedMimeTypes = ['application/pdf'] as const; 12 - 13 - canHandle(mimeType: string): boolean { 14 - return this.supportedMimeTypes.includes(mimeType as typeof this.supportedMimeTypes[number]); 15 - } 16 - 17 - async extract(buffer: Buffer): Promise<TextExtractionResult> { 18 - try { 19 - const data = await pdfParse(buffer); 20 - return { success: true, text: data.text }; 21 - } catch (error) { 22 - return { 23 - success: false, 24 - error: `PDF extraction failed: ${error instanceof Error ? error.message : 'Unknown error'}`, 25 - }; 26 - } 27 - } 28 - }
+137
packages/file-upload/src/extractors/pdf/__tests__/pdf-extractor.spec.ts
import type { TextExtractionResult } from '../../../types';
import type { PdfExtractionStrategy } from '../types';
import { PDFExtractor } from '../pdf.extractor';

type StubStrategy = PdfExtractionStrategy & { extract: jest.Mock };

/** Builds a strategy double whose `extract` always resolves to `outcome`. */
function stubStrategy(
  name: string,
  priority: number,
  outcome: TextExtractionResult,
): StubStrategy {
  return { name, priority, extract: jest.fn().mockResolvedValue(outcome) };
}

/** Shorthand for a successful extraction result. */
const succeeded = (text: string): TextExtractionResult => ({ success: true, text });

/** Shorthand for a failed extraction result. */
const failed = (error: string): TextExtractionResult => ({ success: false, error });

// The stubs never parse their input, so any bytes will do.
const pdfBytes = Buffer.from('fake-pdf');

describe('PDFExtractor (composite)', () => {
  it('picks the first strategy that meets the threshold', async () => {
    const first = stubStrategy('a', 0, succeeded('x'.repeat(100)));
    const second = stubStrategy('b', 1, succeeded('y'.repeat(200)));
    const extractor = new PDFExtractor({ strategies: [first, second], minCharThreshold: 50 });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: true, text: 'x'.repeat(100) });
    expect(first.extract).toHaveBeenCalledWith(pdfBytes);
    expect(second.extract).not.toHaveBeenCalled();
    expect(extractor.lastUsedStrategy).toBe('a');
  });

  it('skips a strategy below threshold and picks the next one that meets it', async () => {
    const tooShort = stubStrategy('a', 0, succeeded('x'.repeat(10)));
    const longEnough = stubStrategy('b', 1, succeeded('y'.repeat(200)));
    const extractor = new PDFExtractor({
      strategies: [tooShort, longEnough],
      minCharThreshold: 50,
    });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: true, text: 'y'.repeat(200) });
    expect(extractor.lastUsedStrategy).toBe('b');
  });

  it('falls back to the longest non-empty result when nothing meets threshold', async () => {
    const candidates = [
      stubStrategy('short', 0, succeeded('x'.repeat(10))),
      stubStrategy('longer', 1, succeeded('y'.repeat(30))),
      stubStrategy('medium', 2, succeeded('z'.repeat(20))),
    ];
    const extractor = new PDFExtractor({ strategies: candidates, minCharThreshold: 100 });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: true, text: 'y'.repeat(30) });
    expect(extractor.lastUsedStrategy).toBe('longer');
  });

  it('returns failure when all strategies fail', async () => {
    const extractor = new PDFExtractor({
      strategies: [stubStrategy('a', 0, failed('boom')), stubStrategy('b', 1, failed('bang'))],
    });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: false, error: 'All PDF extraction strategies failed' });
    expect(extractor.lastUsedStrategy).toBeUndefined();
  });

  it('returns failure with empty strategies array', async () => {
    const extractor = new PDFExtractor({ strategies: [] });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: false, error: 'No PDF extraction strategies configured' });
  });

  it('respects custom threshold', async () => {
    const only = stubStrategy('a', 0, succeeded('x'.repeat(5)));
    const extractor = new PDFExtractor({ strategies: [only], minCharThreshold: 3 });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: true, text: 'x'.repeat(5) });
    expect(extractor.lastUsedStrategy).toBe('a');
  });

  it('sorts strategies by priority regardless of insertion order', async () => {
    const runsLater = stubStrategy('high', 10, succeeded('x'.repeat(100)));
    const runsFirst = stubStrategy('low', 0, succeeded('y'.repeat(100)));
    // Deliberately passed in the "wrong" order.
    const extractor = new PDFExtractor({
      strategies: [runsLater, runsFirst],
      minCharThreshold: 50,
    });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: true, text: 'y'.repeat(100) });
    expect(extractor.lastUsedStrategy).toBe('low');
    expect(runsLater.extract).not.toHaveBeenCalled();
  });

  it('skips failed strategies and continues to the next', async () => {
    const broken = stubStrategy('fail', 0, failed('oops'));
    const working = stubStrategy('ok', 1, succeeded('x'.repeat(100)));
    const extractor = new PDFExtractor({ strategies: [broken, working], minCharThreshold: 50 });

    const outcome = await extractor.extract(pdfBytes);

    expect(outcome).toEqual({ success: true, text: 'x'.repeat(100) });
    expect(extractor.lastUsedStrategy).toBe('ok');
  });

  it('reports canHandle correctly', () => {
    const extractor = new PDFExtractor({ strategies: [] });

    expect(extractor.canHandle('application/pdf')).toBe(true);
    expect(extractor.canHandle('text/plain')).toBe(false);
  });
});
+57
packages/file-upload/src/extractors/pdf/__tests__/strategies.spec.ts
import type { PdfExtractionStrategy } from '../types';
import { PdfParseStrategy } from '../strategies/pdf-parse.strategy';
import { Pdf2JsonStrategy } from '../strategies/pdf2json.strategy';
import { TesseractOcrStrategy } from '../strategies/tesseract-ocr.strategy';

/**
 * One row per concrete strategy:
 * [suite title, factory, expected name, expected priority, expected error fragment].
 */
const strategyCases: Array<[string, () => PdfExtractionStrategy, string, number, string]> = [
  ['PdfParseStrategy', () => new PdfParseStrategy(), 'pdf-parse', 0, 'pdf-parse failed'],
  ['Pdf2JsonStrategy', () => new Pdf2JsonStrategy(), 'pdf2json', 1, 'pdf2json failed'],
  [
    'TesseractOcrStrategy',
    () => new TesseractOcrStrategy(),
    'tesseract-ocr',
    2,
    'tesseract-ocr failed',
  ],
];

describe.each(strategyCases)(
  '%s',
  (_suiteTitle, createStrategy, expectedName, expectedPriority, expectedErrorFragment) => {
    // One instance per suite, shared by both cases below.
    const strategy = createStrategy();

    it('has correct name and priority', () => {
      expect(strategy.name).toBe(expectedName);
      expect(strategy.priority).toBe(expectedPriority);
    });

    it('returns failure on invalid buffer', async () => {
      const outcome = await strategy.extract(Buffer.from('not-a-pdf'));

      expect(outcome.success).toBe(false);
      if (!outcome.success) {
        expect(outcome.error).toContain(expectedErrorFragment);
      }
    });
  },
);
+86
packages/file-upload/src/extractors/pdf/pdf.extractor.ts
import { Logger } from '@nestjs/common';
import type { TextExtractor, TextExtractionResult } from '../../types';
import type { PdfExtractionStrategy, PdfExtractorConfig } from './types';
import { PdfParseStrategy } from './strategies/pdf-parse.strategy';
import { Pdf2JsonStrategy } from './strategies/pdf2json.strategy';
import { TesseractOcrStrategy } from './strategies/tesseract-ocr.strategy';

/** Character count used when the config does not supply `minCharThreshold`. */
const DEFAULT_MIN_CHAR_THRESHOLD = 50;

/** Builds a fresh set of the built-in strategies for each extractor instance. */
const defaultStrategies = (): PdfExtractionStrategy[] => [
  new PdfParseStrategy(),
  new Pdf2JsonStrategy(),
  new TesseractOcrStrategy(),
];

/**
 * Composite PDF text extractor.
 * Runs multiple strategies in priority order and returns the first result
 * meeting the minimum character threshold.
 */
export class PDFExtractor implements TextExtractor {
  readonly name = 'pdf';
  readonly supportedMimeTypes = ['application/pdf'] as const;

  private readonly strategies: PdfExtractionStrategy[];
  private readonly minCharThreshold: number;
  private readonly logger = new Logger(PDFExtractor.name);

  /**
   * Name of the strategy that produced the last successful result.
   *
   * This is per-instance state overwritten by every `extract` call, so it is
   * only meaningful when read right after an awaited call with no other
   * extraction in flight on the same instance.
   */
  lastUsedStrategy: string | undefined;

  /**
   * @param config Optional overrides. `strategies` replaces the built-in set;
   *   `minCharThreshold` replaces the default of 50.
   */
  constructor(config: PdfExtractorConfig = {}) {
    const raw = config.strategies ?? defaultStrategies();
    // Copy before sorting so the caller's array is left untouched; lower
    // priority values end up first.
    this.strategies = [...raw].sort((a, b) => a.priority - b.priority);
    this.minCharThreshold = config.minCharThreshold ?? DEFAULT_MIN_CHAR_THRESHOLD;
  }

  /** Returns true only for MIME types listed in `supportedMimeTypes`. */
  canHandle(mimeType: string): boolean {
    return this.supportedMimeTypes.includes(mimeType as typeof this.supportedMimeTypes[number]);
  }

  /**
   * Tries each strategy in ascending priority order.
   *
   * - The first successful result whose trimmed text has at least
   *   `minCharThreshold` characters is returned immediately; later strategies
   *   are not invoked.
   * - If none reaches the threshold, the successful result with the most
   *   trimmed characters (and at least one) is returned.
   * - Otherwise a failure result is returned.
   *
   * A strategy that throws or rejects is treated as a failed strategy, so this
   * method resolves with a result rather than rejecting.
   *
   * @param buffer Raw PDF bytes.
   * @returns The selected extraction result, or a failure result.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    if (this.strategies.length === 0) {
      return { success: false, error: 'No PDF extraction strategies configured' };
    }

    let bestResult: TextExtractionResult | undefined;
    let bestStrategyName: string | undefined;
    let bestLength = 0;

    for (const strategy of this.strategies) {
      const result = await this.runStrategy(strategy, buffer);

      if (!result.success) {
        this.logger.debug(`Strategy '${strategy.name}' failed: ${result.error}`);
        continue;
      }

      const charCount = result.text.trim().length;
      this.logger.debug(`Strategy '${strategy.name}' extracted ${charCount} chars`);

      if (charCount >= this.minCharThreshold) {
        this.logger.log(`Selected strategy '${strategy.name}' (${charCount} chars)`);
        this.lastUsedStrategy = strategy.name;
        return result;
      }

      // Strictly greater: an empty result never becomes the fallback, and on a
      // tie the earlier (higher-priority) strategy keeps its place.
      if (charCount > bestLength) {
        bestResult = result;
        bestStrategyName = strategy.name;
        bestLength = charCount;
      }
    }

    if (bestResult) {
      this.logger.warn(
        `No strategy met threshold (${this.minCharThreshold}), falling back to '${bestStrategyName}' (${bestLength} chars)`,
      );
      this.lastUsedStrategy = bestStrategyName;
      return bestResult;
    }

    this.lastUsedStrategy = undefined;
    return { success: false, error: 'All PDF extraction strategies failed' };
  }

  /**
   * Invokes one strategy and converts a thrown error or rejection into a
   * failure result, so a misbehaving (e.g. caller-supplied) strategy cannot
   * abort the fallback chain.
   */
  private async runStrategy(
    strategy: PdfExtractionStrategy,
    buffer: Buffer,
  ): Promise<TextExtractionResult> {
    try {
      return await strategy.extract(buffer);
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : 'Unknown error';
      return { success: false, error: `${strategy.name} threw: ${reason}` };
    }
  }
}
+26
packages/file-upload/src/extractors/pdf/strategies/pdf-parse.strategy.ts
import type { TextExtractionResult } from '../../../types';
import type { PdfExtractionStrategy } from '../types';

/** The part of pdf-parse's call signature this strategy relies on. */
type PdfParseFn = (buffer: Buffer) => Promise<{ text: string }>;

// eslint-disable-next-line @typescript-eslint/no-require-imports
const pdfParse = require('pdf-parse') as PdfParseFn;

/**
 * Fast text extraction using pdf-parse.
 * Works well for text-based PDFs but fails on design-heavy / vector-path PDFs.
 */
export class PdfParseStrategy implements PdfExtractionStrategy {
  readonly name = 'pdf-parse';
  readonly priority = 0;

  /**
   * Parses the buffer with pdf-parse and returns its text.
   * Any error raised while parsing is returned as a failure result rather
   * than thrown.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    try {
      const { text } = await pdfParse(buffer);
      return { success: true, text };
    } catch (error) {
      const reason = error instanceof Error ? error.message : 'Unknown error';
      return { success: false, error: `pdf-parse failed: ${reason}` };
    }
  }
}
+59
packages/file-upload/src/extractors/pdf/strategies/pdf2json.strategy.ts
import type { TextExtractionResult } from '../../../types';
import type { PdfExtractionStrategy } from '../types';

/**
 * Decodes one URI-encoded text run.
 * `decodeURIComponent` throws a `URIError` on a malformed escape (for example
 * a bare `%`); in that case the run is returned as-is, so a single odd run
 * does not discard the text of the whole document.
 */
function decodeRunText(encoded: string): string {
  try {
    return decodeURIComponent(encoded);
  } catch {
    return encoded;
  }
}

/**
 * Turns a `pdfParser_dataError` payload into a message.
 * Handles an `Error`, an object carrying `parserError`, and a plain string;
 * anything else gets a generic message. The `in` operator throws on
 * non-objects, hence the explicit type checks before it.
 */
function describeParserError(errData: unknown): string {
  if (errData instanceof Error) {
    return errData.message;
  }
  if (typeof errData === 'object' && errData !== null && 'parserError' in errData) {
    return String((errData as { parserError: unknown }).parserError);
  }
  if (typeof errData === 'string' && errData.length > 0) {
    return errData;
  }
  return 'pdf2json parse error';
}

/**
 * Text extraction using pdf2json.
 * Better at parsing layout-rich PDFs and some vector-rendered text.
 */
export class Pdf2JsonStrategy implements PdfExtractionStrategy {
  readonly name = 'pdf2json';
  readonly priority = 1;

  /**
   * Lazily loads pdf2json and extracts text from the buffer.
   * Any load or parse error is returned as a failure result rather than thrown.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    try {
      const { default: PDFParser } = await import('pdf2json');
      const text = await this.parseBuffer(PDFParser, buffer);
      return { success: true, text };
    } catch (error) {
      return {
        success: false,
        error: `pdf2json failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
      };
    }
  }

  /**
   * Wraps pdf2json's event-based API in a promise.
   *
   * Output layout: runs within a text item are concatenated, text items
   * within a page are joined with a space, and pages are joined with a newline.
   *
   * @param PDFParser The pdf2json parser constructor.
   * @param buffer Raw PDF bytes.
   * @returns The extracted text; rejects with an `Error` on parser failure.
   */
  private parseBuffer(
    PDFParser: typeof import('pdf2json').default,
    buffer: Buffer,
  ): Promise<string> {
    return new Promise((resolve, reject) => {
      const parser = new PDFParser(null, true);

      parser.on('pdfParser_dataReady', (pdfData) => {
        try {
          const text = pdfData.Pages.map((page) =>
            page.Texts.map((textItem) =>
              textItem.R.map((run) => decodeRunText(run.T)).join(''),
            ).join(' '),
          ).join('\n');

          resolve(text);
        } catch (err) {
          // Unexpected payload shape (e.g. missing Pages/Texts/R).
          reject(err);
        }
      });

      parser.on('pdfParser_dataError', (errData) => {
        reject(new Error(describeParserError(errData)));
      });

      parser.parseBuffer(buffer);
    });
  }
}
+75
packages/file-upload/src/extractors/pdf/strategies/tesseract-ocr.strategy.ts
import type { TextExtractionResult } from '../../../types';
import type { PdfExtractionStrategy } from '../types';

/**
 * OCR fallback for image-based or vector-path PDFs.
 * Renders each page to a PNG via pdfjs-dist (legacy/Node build) + canvas,
 * then OCRs with tesseract.js. Heavy deps are lazy-loaded on first use.
 */
export class TesseractOcrStrategy implements PdfExtractionStrategy {
  readonly name = 'tesseract-ocr';
  readonly priority = 2;

  /**
   * Renders every page of the PDF to an image and OCRs the images.
   * Any error while loading dependencies, rendering or recognising is
   * returned as a failure result rather than thrown; a document that yields
   * no page images is also reported as a failure.
   */
  async extract(buffer: Buffer): Promise<TextExtractionResult> {
    try {
      const pageImages = await this.renderPagesToImages(buffer);

      if (pageImages.length === 0) {
        return { success: false, error: 'tesseract-ocr: no pages rendered from PDF' };
      }

      const text = await this.ocrImages(pageImages);
      return { success: true, text };
    } catch (error) {
      return {
        success: false,
        error: `tesseract-ocr failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
      };
    }
  }

  /**
   * Renders each page at 2x scale to a PNG buffer, in page order.
   *
   * The pdf.js document and every page are released when done, including when
   * rendering fails part-way, so repeated extractions do not accumulate
   * parser state.
   */
  private async renderPagesToImages(buffer: Buffer): Promise<Buffer[]> {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js') as typeof import('pdfjs-dist');
    const { createCanvas } = await import('canvas');

    const data = new Uint8Array(buffer);
    const doc = await pdfjsLib.getDocument({ data }).promise;

    try {
      const images: Buffer[] = [];

      // pdf.js page numbers are 1-based.
      for (let i = 1; i <= doc.numPages; i++) {
        const page = await doc.getPage(i);

        try {
          const viewport = page.getViewport({ scale: 2.0 });

          const canvas = createCanvas(viewport.width, viewport.height);
          const context = canvas.getContext('2d');

          await page.render({
            // node-canvas' context is API-compatible but not the DOM type pdf.js declares.
            canvasContext: context as unknown as CanvasRenderingContext2D,
            viewport,
          }).promise;

          images.push(canvas.toBuffer('image/png'));
        } finally {
          page.cleanup();
        }
      }

      return images;
    } finally {
      await doc.destroy();
    }
  }

  /**
   * Recognises each image in order with a single English worker and joins the
   * per-page text with newlines. The worker is always terminated.
   */
  private async ocrImages(images: Buffer[]): Promise<string> {
    const Tesseract = await import('tesseract.js');
    const worker = await Tesseract.createWorker('eng');

    try {
      const pages: string[] = [];

      for (const image of images) {
        const { data } = await worker.recognize(image);
        pages.push(data.text);
      }

      return pages.join('\n');
    } finally {
      await worker.terminate();
    }
  }
}
+20
packages/file-upload/src/extractors/pdf/types.ts
import type { TextExtractionResult } from '../../types';

/**
 * A single PDF text-extraction backend.
 *
 * Not to be confused with the top-level TextExtractor, which routes by MIME
 * type: these are PDF-only backends that are attempted in priority order.
 */
export interface PdfExtractionStrategy {
  /** Identifier of this backend. */
  readonly name: string;
  /** Ordering key: strategies with lower values are tried first. */
  readonly priority: number;
  /** Extracts text from the given PDF bytes. */
  extract(buffer: Buffer): Promise<TextExtractionResult>;
}

/** Optional settings for the composite PDF extractor. */
export interface PdfExtractorConfig {
  /** Replaces the default strategies. */
  strategies?: PdfExtractionStrategy[];
  /** Minimum number of extracted characters for a strategy result to be accepted (default: 50). */
  minCharThreshold?: number;
}
+1 -1
packages/file-upload/src/file-extraction.module.ts
··· 1 1 import { DynamicModule, Module } from '@nestjs/common'; 2 2 import type { TextExtractor } from './types'; 3 3 import { TextExtractorRegistry } from './extractor-registry'; 4 - import { PDFExtractor } from './extractors/pdf.extractor'; 4 + import { PDFExtractor } from './extractors/pdf/pdf.extractor'; 5 5 import { DOCXExtractor } from './extractors/docx.extractor'; 6 6 import { PlainTextExtractor } from './extractors/plain-text.extractor'; 7 7
+4 -1
packages/file-upload/src/index.ts
··· 10 10 export { SupportedMimeTypes, FileUploadSchema } from './types'; 11 11 12 12 // Extractor implementations 13 - export { PDFExtractor } from './extractors/pdf.extractor'; 13 + export { PDFExtractor } from './extractors/pdf/pdf.extractor'; 14 14 export { DOCXExtractor } from './extractors/docx.extractor'; 15 15 export { PlainTextExtractor } from './extractors/plain-text.extractor'; 16 + 17 + // PDF strategy types (for extensibility) 18 + export type { PdfExtractionStrategy, PdfExtractorConfig } from './extractors/pdf/types'; 16 19 17 20 // Extractor registry 18 21 export { TextExtractorRegistry } from './extractor-registry';
+1 -1
packages/file-upload/src/types.ts
··· 1 - import { z } from 'zod'; 1 + import { z } from "zod/v4"; 2 2 3 3 /** 4 4 * Supported MIME types for CV uploads