Full document, spreadsheet, slideshow, and diagram tooling
1/**
2 * PDF import module for Atmosphere Docs.
3 *
4 * Uses pdf.js to extract text content from a PDF, then converts it
5 * to HTML suitable for the TipTap editor. Large text blocks become
6 * headings; normal text becomes paragraphs; page breaks become <hr>.
7 */
8import type { Editor } from '@tiptap/core';
9
/** One logical text chunk extracted from a PDF page. */
interface PdfTextItem {
  /** Text of the chunk. */
  str: string;
  /** Approximate font size in points (estimated from the item's transform matrix). */
  fontSize: number;
  /**
   * Vertical position measured downwards from the top edge of the page
   * (page height minus the PDF baseline y), so larger values sit lower
   * on the page.
   */
  y: number;
}
18
/**
 * Load a PDF with pdf.js and collect its non-blank text items, page by page.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns One array per processed page (at most 500 pages), each holding
 *   that page's text items in the order pdf.js reports them.
 * @throws Propagates any error raised by pdf.js while loading or reading
 *   the document.
 */
async function extractTextItems(arrayBuffer: ArrayBuffer): Promise<PdfTextItem[][]> {
  // Dynamic import — pdfjs-dist is large; only load when needed.
  const pdfjsLib = await import('pdfjs-dist');

  // Point the worker at the bundled worker file. Vite resolves ?url to the
  // asset path at build time; in dev the dev server serves it directly.
  if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
    // @ts-ignore — Vite ?url suffix is not typed in TypeScript
    const { default: workerUrl } = await import('pdfjs-dist/build/pdf.worker.min.mjs?url');
    pdfjsLib.GlobalWorkerOptions.workerSrc = workerUrl as string;
  }

  // Pages past this limit are ignored so a huge document cannot stall the import.
  const MAX_PAGES = 500;

  const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) });

  try {
    const pdf = await loadingTask.promise;
    const pageCount = Math.min(pdf.numPages, MAX_PAGES);
    const pages: PdfTextItem[][] = [];

    for (let p = 1; p <= pageCount; p++) {
      const page = await pdf.getPage(p);
      const content = await page.getTextContent();
      const pageHeight = page.getViewport({ scale: 1 }).height;
      const items: PdfTextItem[] = [];

      for (const item of content.items) {
        // Marked-content entries have no `str`; whitespace-only items carry no text.
        if ('str' in item && item.str.trim()) {
          // transform is a 6-element matrix [a, b, c, d, e, f];
          // e (index 4) = x, f (index 5) = y from bottom-left.
          const transform: number[] | undefined = item.transform;
          // The vertical scale is hypot(c, d): it equals |d| for upright text
          // and stays meaningful for rotated text, where d alone drops to 0.
          const fontSize = Math.hypot(transform?.[2] ?? 0, transform?.[3] ?? 12);
          const yFromBottom = transform?.[5] ?? 0;
          // Convert to distance from top (PDF origin is bottom-left).
          const y = pageHeight - yFromBottom;
          items.push({ str: item.str, fontSize, y });
        }
      }
      pages.push(items);
    }

    return pages;
  } finally {
    // Tear the document down whether extraction succeeded or failed, so the
    // resources pdf.js holds for it do not accumulate across imports.
    await loadingTask.destroy();
  }
}
64
/** Mean font size of the items on one line (lines are never empty). */
function averageLineFontSize(line: PdfTextItem[]): number {
  return line.reduce((sum, item) => sum + item.fontSize, 0) / line.length;
}

/** Join the fragments of one line with single spaces and trim the result. */
function joinLineText(line: PdfTextItem[]): string {
  return line.map(item => item.str).join(' ').trim();
}

/**
 * Split one page's items into visual lines: consecutive items stay on the
 * same line while each is within 2 units (vertically) of the previous item.
 */
function groupItemsIntoLines(items: PdfTextItem[]): PdfTextItem[][] {
  const lines: PdfTextItem[][] = [];
  let current: PdfTextItem[] = [];

  for (const item of items) {
    const previous = current[current.length - 1];
    if (previous !== undefined && Math.abs(item.y - previous.y) > 2) {
      lines.push(current);
      current = [];
    }
    current.push(item);
  }
  if (current.length > 0) lines.push(current);

  return lines;
}

/**
 * Heuristically convert extracted PDF items to HTML.
 *
 * - Pages are separated by `<hr>`.
 * - A line whose average font size is at least 1.4× the document's median
 *   font size becomes a heading: `<h1>` when it is at least 1.4× that
 *   threshold again, otherwise `<h2>`.
 * - Remaining lines are collected into `<p>` blocks, joined with `<br>`;
 *   a vertical gap larger than 1.5× the line's font size starts a new block.
 *
 * @param pages Text items grouped per page, in reading order.
 * @returns The HTML fragments joined with newlines, or `''` when `pages` is empty.
 */
function itemsToHtml(pages: PdfTextItem[][]): string {
  if (pages.length === 0) return '';

  // Median font size across all pages calibrates heading detection.
  const sizes = pages
    .flatMap(page => page.map(item => item.fontSize))
    .filter(size => size > 0);
  sizes.sort((a, b) => a - b);
  const median = sizes[Math.floor(sizes.length / 2)] ?? 12;
  const headingThreshold = median * 1.4; // 40% larger than median = heading

  const parts: string[] = [];

  pages.forEach((items, pageIndex) => {
    if (pageIndex > 0) parts.push('<hr>');
    if (items.length === 0) return;

    // Body-text lines waiting to be emitted as one <p>.
    let paragraph: string[] = [];
    const flushParagraph = (): void => {
      if (paragraph.length === 0) return;
      parts.push(`<p>${paragraph.map(escapeHtml).join('<br>')}</p>`);
      paragraph = [];
    };

    const lines = groupItemsIntoLines(items);
    let prevLineY = lines[0]?.[0]?.y ?? 0;

    for (const line of lines) {
      const first = line[0];
      const y = first?.y ?? 0;

      // Gap larger than 1.5× the line's font size = new paragraph.
      const lineHeight = (first?.fontSize ?? 12) * 1.5;
      if (Math.abs(y - prevLineY) > lineHeight) flushParagraph();

      const text = joinLineText(line);
      const avgFontSize = averageLineFontSize(line);

      if (avgFontSize >= headingThreshold) {
        // A heading always closes the paragraph that precedes it.
        flushParagraph();
        if (text) {
          const level = avgFontSize >= headingThreshold * 1.4 ? 1 : 2;
          parts.push(`<h${level}>${escapeHtml(text)}</h${level}>`);
        }
      } else if (text) {
        paragraph.push(text);
      }

      prevLineY = y;
    }

    flushParagraph();
  });

  return parts.join('\n');
}
147
/**
 * Escape the characters that are significant in HTML so arbitrary text can
 * be embedded safely in element content or quoted attribute values.
 *
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param s Raw text.
 * @returns `s` with `&`, `<`, `>`, `"` and `'` replaced by HTML entities.
 */
function escapeHtml(s: string): string {
  return s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
155
/**
 * Cheap signature check: report whether the buffer begins with the five
 * bytes `%PDF-`.
 *
 * @param arrayBuffer Candidate file contents.
 * @returns `true` only when the buffer holds at least five bytes and the
 *   first five match the signature; `false` for a missing or shorter buffer.
 */
export function isValidPdf(arrayBuffer: ArrayBuffer): boolean {
  // ASCII codes for "%PDF-".
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];

  if (!arrayBuffer || arrayBuffer.byteLength < signature.length) {
    return false;
  }

  const bytes = new Uint8Array(arrayBuffer);
  return signature.every((expected, index) => bytes[index] === expected);
}
165
/**
 * Turn the bytes of a PDF into HTML that TipTap can load.
 *
 * Runs text extraction followed by the HTML heuristics; when they produce
 * no markup at all, a single empty paragraph is returned instead.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns The generated HTML, or `'<p></p>'` when nothing was produced.
 */
export async function convertPdfToHtml(arrayBuffer: ArrayBuffer): Promise<string> {
  const markup = itemsToHtml(await extractTextItems(arrayBuffer));
  if (markup === '') {
    return '<p></p>';
  }
  return markup;
}
175
/**
 * Read a PDF `File`, convert it to HTML and load the result into the editor.
 *
 * Every outcome is reported through `showToast`: a rejected signature check,
 * a successful import, or any error thrown along the way (which is also
 * logged to the console). The returned promise therefore never rejects
 * unless the failure toast itself throws.
 *
 * @param file PDF file chosen by the user.
 * @param editor TipTap editor whose content is replaced on success.
 * @param showToast Callback that shows `message` for `duration` milliseconds.
 */
export async function importPdf(
  file: File,
  editor: Editor,
  showToast: (message: string, duration: number) => void,
): Promise<void> {
  try {
    const bytes = await file.arrayBuffer();

    if (isValidPdf(bytes)) {
      // Finish the conversion before touching the editor so the content is
      // set only once the HTML is ready.
      const markup = await convertPdfToHtml(bytes);
      editor.commands.setContent(markup);
      showToast(`Imported "${file.name}" successfully`, 3000);
    } else {
      showToast('Invalid PDF — the file appears to be corrupt or not a PDF', 5000);
    }
  } catch (err) {
    console.error('pdf import error:', err);
    showToast('Failed to import PDF — it may be corrupt, encrypted, or use unsupported features', 5000);
  }
}