Full document, spreadsheet, slideshow, and diagram tooling
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 200 lines 6.9 kB view raw
/**
 * PDF import module for Atmosphere Docs.
 *
 * Uses pdf.js to extract text content from a PDF, then converts it
 * to HTML suitable for the TipTap editor. Large text blocks become
 * headings; normal text becomes paragraphs; page breaks become <hr>.
 */
import type { Editor } from '@tiptap/core';

/** Upper bound on the number of pages read from one PDF; later pages are skipped. */
const MAX_PAGES = 500;

/** Font size (pt) assumed when a text item carries no usable transform matrix. */
const DEFAULT_FONT_SIZE = 12;

/**
 * Scale factor for heading detection: a line is a heading when its average
 * font size is at least this multiple of the document median, and a
 * top-level heading when it is at least this multiple of that threshold.
 */
const HEADING_SCALE = 1.4;

/** Items whose y positions differ by more than this many points are on different lines. */
const SAME_LINE_TOLERANCE_PT = 2;

/** A vertical gap above this multiple of the line's font size starts a new paragraph. */
const PARAGRAPH_GAP_FACTOR = 1.5;

/** The bytes of "%PDF-", which open every file accepted by isValidPdf. */
const PDF_MAGIC: readonly number[] = [0x25, 0x50, 0x44, 0x46, 0x2d];

/** One logical text chunk from a PDF page. */
interface PdfTextItem {
  str: string;
  /** Approximate font size in points (estimated from the transform matrix). */
  fontSize: number;
  /** Distance from the top of the page (converted from PDF's bottom-left origin). */
  y: number;
}

/**
 * Collect text items from all pages and return them as structured chunks.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns One array of text items per page, in page order. At most
 *          MAX_PAGES pages are read; a warning is logged when pages are skipped.
 * @throws Whatever pdf.js rejects with when the document or a page cannot be read.
 */
async function extractTextItems(arrayBuffer: ArrayBuffer): Promise<PdfTextItem[][]> {
  // Dynamic import — pdfjs-dist is large; only load when needed.
  const pdfjsLib = await import('pdfjs-dist');

  // Point the worker at the bundled worker file. Vite resolves ?url to the
  // asset path at build time; in dev the dev server serves it directly.
  if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
    // @ts-ignore — Vite ?url suffix is not typed in TypeScript
    const { default: workerUrl } = await import('pdfjs-dist/build/pdf.worker.min.mjs?url');
    pdfjsLib.GlobalWorkerOptions.workerSrc = workerUrl as string;
  }

  const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) });

  try {
    const pdf = await loadingTask.promise;
    const pageCount = Math.min(pdf.numPages, MAX_PAGES);
    if (pdf.numPages > pageCount) {
      // Do not truncate silently: leave a trace that content was dropped.
      console.warn(`pdf import: document has ${pdf.numPages} pages; only the first ${pageCount} were imported`);
    }

    const pages: PdfTextItem[][] = [];

    for (let p = 1; p <= pageCount; p++) {
      const page = await pdf.getPage(p);
      const content = await page.getTextContent();
      const pageHeight = page.getViewport({ scale: 1 }).height;
      const items: PdfTextItem[] = [];

      for (const item of content.items) {
        // Skip marked-content entries (no `str`) and whitespace-only chunks.
        if (!('str' in item) || !item.str.trim()) continue;

        // transform is a 6-element matrix [a, b, c, d, e, f].
        // For upright text d (index 3) is the font height in user units; for
        // rotated text the height is spread over c and d, so take the vector
        // length. e (index 4) = x, f (index 5) = y from bottom-left.
        const transform = (item as { transform?: number[] }).transform;
        const fontSize = Math.hypot(transform?.[2] ?? 0, transform?.[3] ?? DEFAULT_FONT_SIZE);
        const yFromBottom = transform?.[5] ?? 0;

        // Convert to distance from top (PDF origin is bottom-left).
        items.push({ str: item.str, fontSize, y: pageHeight - yFromBottom });
      }

      pages.push(items);
      // Release per-page resources eagerly so large documents do not pile up.
      page.cleanup();
    }

    return pages;
  } finally {
    // Always tear down the document and its worker, including on failure.
    // A cleanup error must not mask the real outcome of the extraction.
    try {
      await loadingTask.destroy();
    } catch (cleanupErr) {
      console.warn('pdf import: failed to release pdf.js resources:', cleanupErr);
    }
  }
}

/**
 * Split one page's items into visual lines.
 *
 * Items are taken in the given order; a new line starts whenever an item's
 * y position differs from the previous item's by more than
 * SAME_LINE_TOLERANCE_PT.
 */
function groupIntoLines(items: PdfTextItem[]): PdfTextItem[][] {
  const lines: PdfTextItem[][] = [];
  let current: PdfTextItem[] = [];
  let lastY = items[0]?.y ?? 0;

  for (const item of items) {
    if (current.length > 0 && Math.abs(item.y - lastY) > SAME_LINE_TOLERANCE_PT) {
      lines.push(current);
      current = [];
    }
    current.push(item);
    lastY = item.y;
  }
  if (current.length > 0) lines.push(current);

  return lines;
}

/**
 * Heuristically convert extracted PDF items to HTML.
 *
 * @param pages One array of text items per page, as produced by extractTextItems.
 * @returns HTML fragments joined with newlines: <h1>/<h2> for lines with large
 *          text, <p> for body text (lines within a paragraph separated by <br>),
 *          and an <hr> before every page after the first. Returns '' when
 *          `pages` is empty.
 */
function itemsToHtml(pages: PdfTextItem[][]): string {
  if (pages.length === 0) return '';

  // Find the median font size across all pages to calibrate heading detection.
  const sizes = pages
    .flatMap(page => page.map(item => item.fontSize))
    .filter(size => size > 0);
  sizes.sort((a, b) => a - b);
  const median = sizes[Math.floor(sizes.length / 2)] ?? DEFAULT_FONT_SIZE;
  const headingThreshold = median * HEADING_SCALE;

  const parts: string[] = [];

  for (const [pageIndex, items] of pages.entries()) {
    // The page separator is emitted even when the page itself has no text.
    if (pageIndex > 0) parts.push('<hr>');
    if (items.length === 0) continue;

    const lines = groupIntoLines(items);

    // Body lines accumulated for the paragraph currently being built.
    let paragraph: string[] = [];
    const flushParagraph = (): void => {
      if (paragraph.length === 0) return;
      parts.push(`<p>${paragraph.map(escapeHtml).join('<br>')}</p>`);
      paragraph = [];
    };

    let prevLineY = lines[0]?.[0]?.y ?? 0;

    for (const line of lines) {
      const y = line[0]?.y ?? 0;

      // A gap larger than the typical line spacing closes the open paragraph.
      const gap = Math.abs(y - prevLineY);
      const lineHeight = (line[0]?.fontSize ?? DEFAULT_FONT_SIZE) * PARAGRAPH_GAP_FACTOR;
      if (gap > lineHeight) flushParagraph();

      const text = line.map(item => item.str).join(' ').trim();
      const avgFontSize = line.reduce((sum, item) => sum + item.fontSize, 0) / line.length;

      if (avgFontSize >= headingThreshold) {
        // Larger text = heading; it always terminates the paragraph before it.
        flushParagraph();
        if (text) {
          const level = avgFontSize >= headingThreshold * HEADING_SCALE ? 1 : 2;
          parts.push(`<h${level}>${escapeHtml(text)}</h${level}>`);
        }
      } else if (text) {
        paragraph.push(text);
      }

      prevLineY = y;
    }

    flushParagraph();
  }

  return parts.join('\n');
}

/** Escape the characters &, <, > and " so `s` can be embedded in HTML markup. */
function escapeHtml(s: string): string {
  return s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

/**
 * Validate that an ArrayBuffer looks like a PDF (starts with %PDF-).
 *
 * @param arrayBuffer Candidate file contents.
 * @returns true only when the first five bytes are exactly "%PDF-".
 */
export function isValidPdf(arrayBuffer: ArrayBuffer): boolean {
  if (!arrayBuffer || arrayBuffer.byteLength < PDF_MAGIC.length) return false;
  const view = new Uint8Array(arrayBuffer);
  return PDF_MAGIC.every((byte, index) => view[index] === byte);
}

/**
 * Extract text from a PDF ArrayBuffer and return as TipTap-compatible HTML.
 * Pure async — testable without DOM.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns The generated HTML, or '<p></p>' when no HTML was produced.
 * @throws Propagates any failure from text extraction.
 */
export async function convertPdfToHtml(arrayBuffer: ArrayBuffer): Promise<string> {
  const pages = await extractTextItems(arrayBuffer);
  const html = itemsToHtml(pages);
  return html || '<p></p>';
}

/**
 * Import a PDF File into the TipTap editor.
 * DOM-coupled entry point.
 *
 * Replaces the editor content with the converted HTML and reports the outcome
 * through `showToast`. Errors are logged and surfaced as a toast; this function
 * does not rethrow them.
 *
 * @param file The PDF file chosen by the user.
 * @param editor The TipTap editor whose content is replaced.
 * @param showToast Callback that displays `message` for `duration` milliseconds.
 */
export async function importPdf(
  file: File,
  editor: Editor,
  showToast: (message: string, duration: number) => void,
): Promise<void> {
  try {
    const arrayBuffer = await file.arrayBuffer();

    // Cheap signature check first, so obviously wrong files never reach pdf.js.
    if (!isValidPdf(arrayBuffer)) {
      showToast('Invalid PDF — the file appears to be corrupt or not a PDF', 5000);
      return;
    }

    const html = await convertPdfToHtml(arrayBuffer);
    editor.commands.setContent(html);
    showToast(`Imported "${file.name}" successfully`, 3000);
  } catch (err) {
    console.error('pdf import error:', err);
    showToast('Failed to import PDF — it may be corrupt, encrypted, or use unsupported features', 5000);
  }
}