/**
 * PDF import module for Atmosphere Docs.
 *
 * Uses pdf.js to extract text content from a PDF, then converts it
 * to HTML suitable for the TipTap editor. Large text blocks become
 * headings; normal text becomes paragraphs; page breaks become <hr>.
 */

import type { Editor } from '@tiptap/core';

/** Upper bound on the number of pages read from a single PDF. */
const MAX_PAGES = 500;

/** Font size assumed when an item carries no usable transform matrix. */
const DEFAULT_FONT_SIZE = 12;

/** Items whose y positions differ by more than this (in points) are on different lines. */
const LINE_Y_TOLERANCE = 2;

/** A line is a heading when its average font size is at least this multiple of the median. */
const HEADING_RATIO = 1.4;

/** A vertical gap larger than this multiple of the font size starts a new paragraph. */
const PARAGRAPH_GAP_RATIO = 1.5;

/** One logical text chunk from a PDF page. */
interface PdfTextItem {
  str: string;
  /** Approximate font size in points (estimated from transform matrix height). */
  fontSize: number;
  /** Distance from the top of the page (converted from PDF bottom-left coordinates). */
  y: number;
}

/**
 * Collect text items from all pages and return them as structured chunks.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns One array of text items per page, in page order. At most
 *   `MAX_PAGES` pages are read; whitespace-only items are dropped.
 */
async function extractTextItems(arrayBuffer: ArrayBuffer): Promise<PdfTextItem[][]> {
  // Dynamic import — pdfjs-dist is large; only load when needed.
  const pdfjsLib = await import('pdfjs-dist');

  // Point the worker at the bundled worker file. Vite resolves ?url to the
  // asset path at build time; in dev the dev server serves it directly.
  if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
    // @ts-ignore — Vite ?url suffix is not typed in TypeScript
    const { default: workerUrl } = await import('pdfjs-dist/build/pdf.worker.min.mjs?url');
    pdfjsLib.GlobalWorkerOptions.workerSrc = workerUrl as string;
  }

  const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) });
  try {
    const pdf = await loadingTask.promise;
    const pages: PdfTextItem[][] = [];
    const pageCount = Math.min(pdf.numPages, MAX_PAGES);

    for (let p = 1; p <= pageCount; p++) {
      const page = await pdf.getPage(p);
      const content = await page.getTextContent();
      const pageHeight = page.getViewport({ scale: 1 }).height;

      const items: PdfTextItem[] = [];
      for (const item of content.items) {
        if ('str' in item && item.str.trim()) {
          // transform is a 6-element matrix [a, b, c, d, e, f].
          // d (index 3) is approximately the font height in user units.
          // e (index 4) = x, f (index 5) = y from bottom-left.
          const { transform } = item as { transform?: readonly number[] };
          const fontSize = Math.abs(transform?.[3] ?? DEFAULT_FONT_SIZE);
          const yFromBottom = transform?.[5] ?? 0;
          // Convert to distance from top (PDF origin is bottom-left).
          items.push({ str: item.str, fontSize, y: pageHeight - yFromBottom });
        }
      }
      pages.push(items);
    }

    return pages;
  } finally {
    // Tear down the loading task (and the document it produced) so its
    // resources are released whether extraction succeeded or failed.
    await loadingTask.destroy();
  }
}

/** Split a page's items into lines: consecutive items with nearly equal y share a line. */
function groupIntoLines(items: readonly PdfTextItem[]): PdfTextItem[][] {
  const lines: PdfTextItem[][] = [];
  let current: PdfTextItem[] = [];
  let lastY: number | undefined;

  for (const item of items) {
    const movedToNewLine =
      lastY !== undefined && Math.abs(item.y - lastY) > LINE_Y_TOLERANCE;
    if (movedToNewLine && current.length > 0) {
      lines.push(current);
      current = [];
    }
    current.push(item);
    lastY = item.y;
  }
  if (current.length > 0) lines.push(current);

  return lines;
}

/**
 * Heuristically convert extracted PDF items to HTML.
 *
 * Lines whose average font size is at least `HEADING_RATIO` times the median
 * font size become `<h1>`/`<h2>`; other lines are merged into `<p>` blocks
 * (joined with `<br>`), split wherever the vertical gap exceeds
 * `PARAGRAPH_GAP_RATIO` times the line's font size. Every page after the
 * first is preceded by an `<hr>`, even when that page has no text.
 *
 * @param pages Text items per page, as produced by `extractTextItems`.
 * @returns HTML fragments joined by newlines, or `''` when `pages` is empty.
 */
function itemsToHtml(pages: PdfTextItem[][]): string {
  if (pages.length === 0) return '';

  // Find the median font size across all pages to calibrate heading detection.
  const allSizes = pages.flatMap(p => p.map(i => i.fontSize)).filter(s => s > 0);
  allSizes.sort((a, b) => a - b);
  const median = allSizes[Math.floor(allSizes.length / 2)] ?? DEFAULT_FONT_SIZE;
  const headingThreshold = median * HEADING_RATIO;

  const parts: string[] = [];

  for (const [pageIndex, items] of pages.entries()) {
    if (pageIndex > 0) parts.push('<hr>');
    if (items.length === 0) continue;

    const lines = groupIntoLines(items);

    let paragraph: string[] = [];
    const flushParagraph = (): void => {
      if (paragraph.length === 0) return;
      parts.push(`<p>${paragraph.map(escapeHtml).join('<br>')}</p>`);
      paragraph = [];
    };

    let prevLineY = lines[0]?.[0]?.y ?? 0;
    for (const line of lines) {
      const first = line[0];
      const y = first?.y ?? 0;

      // A gap larger than the expected line spacing ends the current paragraph.
      const gap = Math.abs(y - prevLineY);
      const lineHeight = (first?.fontSize ?? DEFAULT_FONT_SIZE) * PARAGRAPH_GAP_RATIO;
      if (gap > lineHeight) flushParagraph();

      const text = line.map(i => i.str).join(' ').trim();
      const avgFontSize = line.reduce((sum, i) => sum + i.fontSize, 0) / line.length;

      if (avgFontSize >= headingThreshold) {
        // Headings always close the paragraph in progress.
        flushParagraph();
        if (text) {
          const level = avgFontSize >= headingThreshold * HEADING_RATIO ? 1 : 2;
          parts.push(`<h${level}>${escapeHtml(text)}</h${level}>`);
        }
      } else if (text) {
        paragraph.push(text);
      }

      prevLineY = y;
    }
    flushParagraph();
  }

  return parts.join('\n');
}

/**
 * Escape the characters `&`, `<`, `>` and `"` so `s` can be embedded in HTML.
 * `&` is replaced first so the entities produced for the other characters
 * are not escaped a second time.
 */
function escapeHtml(s: string): string {
  return s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

/**
 * Validate that an ArrayBuffer looks like a PDF (starts with %PDF-).
 *
 * @returns `false` for a missing buffer or one shorter than five bytes;
 *   otherwise whether the first five bytes are the `%PDF-` signature.
 */
export function isValidPdf(arrayBuffer: ArrayBuffer): boolean {
  if (!arrayBuffer || arrayBuffer.byteLength < 5) return false;
  const view = new Uint8Array(arrayBuffer);
  // %PDF- = 0x25 0x50 0x44 0x46 0x2D
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];
  return signature.every((byte, i) => view[i] === byte);
}

/**
 * Extract text from a PDF ArrayBuffer and return as TipTap-compatible HTML.
 * Pure async — testable without DOM.
 *
 * @returns The converted HTML, or a single empty paragraph when the PDF
 *   yields no text.
 */
export async function convertPdfToHtml(arrayBuffer: ArrayBuffer): Promise<string> {
  const pages = await extractTextItems(arrayBuffer);
  const html = itemsToHtml(pages);
  return html || '<p></p>';
}

/**
 * Import a PDF File into the TipTap editor.
 * DOM-coupled entry point.
 *
 * Never throws: validation and conversion failures are reported through
 * `showToast` (and logged to the console) instead.
 *
 * @param file PDF file chosen by the user.
 * @param editor Editor whose content is replaced with the converted HTML.
 * @param showToast Callback used to show a message for `duration` milliseconds.
 */
export async function importPdf(
  file: File,
  editor: Editor,
  showToast: (message: string, duration: number) => void,
): Promise<void> {
  try {
    const arrayBuffer = await file.arrayBuffer();
    if (!isValidPdf(arrayBuffer)) {
      showToast('Invalid PDF — the file appears to be corrupt or not a PDF', 5000);
      return;
    }
    const html = await convertPdfToHtml(arrayBuffer);
    editor.commands.setContent(html);
    showToast(`Imported "${file.name}" successfully`, 3000);
  } catch (err) {
    console.error('pdf import error:', err);
    showToast('Failed to import PDF — it may be corrupt, encrypted, or use unsupported features', 5000);
  }
}