src/docs/pdf-import.ts at main · scottlanoue.com/atmosphere-office

scottlanoue.com / atmosphere-office
fork
Full document, spreadsheet, slideshow, and diagram tooling
fork
atmosphere-office / src / docs / pdf-import.ts
at main 200 lines 6.9 kB view raw
wrap content
Scott Lanoue feat: fork tools → atmosphere-docs (Phase 1 — local-only E2EE office) 13d ago
5b33e31d
  1/**
  2 * PDF import module for Atmosphere Docs.
  3 *
  4 * Uses pdf.js to extract text content from a PDF, then converts it
  5 * to HTML suitable for the TipTap editor. Large text blocks become
  6 * headings; normal text becomes paragraphs; page breaks become <hr>.
  7 */
  8import type { Editor } from '@tiptap/core';
  9
 10/** One logical text chunk from a PDF page. */
 11interface PdfTextItem {
 12  str: string;
 13  /** Approximate font size in points (estimated from transform matrix height). */
 14  fontSize: number;
 15  /** Y position on the page (ascending from bottom in PDF coordinates). */
 16  y: number;
 17}
 18
 19/** Collect text items from all pages and return them as structured chunks. */
 20async function extractTextItems(arrayBuffer: ArrayBuffer): Promise<PdfTextItem[][]> {
 21  // Dynamic import — pdfjs-dist is large; only load when needed.
 22  const pdfjsLib = await import('pdfjs-dist');
 23
 24  // Point the worker at the bundled worker file. Vite resolves ?url to the
 25  // asset path at build time; in dev the dev server serves it directly.
 26  if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
 27    // @ts-ignore — Vite ?url suffix is not typed in TypeScript
 28    const { default: workerUrl } = await import('pdfjs-dist/build/pdf.worker.min.mjs?url');
 29    pdfjsLib.GlobalWorkerOptions.workerSrc = workerUrl as string;
 30  }
 31
 32  const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) });
 33  const pdf = await loadingTask.promise;
 34  const pages: PdfTextItem[][] = [];
 35
 36  const MAX_PAGES = 500;
 37  const pageCount = Math.min(pdf.numPages, MAX_PAGES);
 38
 39  for (let p = 1; p <= pageCount; p++) {
 40    const page = await pdf.getPage(p);
 41    const content = await page.getTextContent();
 42    const viewport = page.getViewport({ scale: 1 });
 43    const pageHeight = viewport.height;
 44    const items: PdfTextItem[] = [];
 45
 46    for (const item of content.items) {
 47      if ('str' in item && item.str.trim()) {
 48        // transform is a 6-element matrix [a, b, c, d, e, f].
 49        // d (index 3) is approximately the font height in user units.
 50        // e (index 4) = x, f (index 5) = y from bottom-left.
 51        const transform = (item as any).transform as number[];
 52        const fontSize = Math.abs(transform?.[3] ?? 12);
 53        const yFromBottom = transform?.[5] ?? 0;
 54        // Convert to distance from top (PDF origin is bottom-left).
 55        const y = pageHeight - yFromBottom;
 56        items.push({ str: item.str, fontSize, y });
 57      }
 58    }
 59    pages.push(items);
 60  }
 61
 62  return pages;
 63}
 64
 65/** Heuristically convert extracted PDF items to HTML. */
 66function itemsToHtml(pages: PdfTextItem[][]): string {
 67  if (pages.length === 0) return '';
 68
 69  // Find the median font size across all pages to calibrate heading detection.
 70  const allSizes = pages.flatMap(p => p.map(i => i.fontSize)).filter(s => s > 0);
 71  allSizes.sort((a, b) => a - b);
 72  const median = allSizes[Math.floor(allSizes.length / 2)] ?? 12;
 73  const headingThreshold = median * 1.4; // 40% larger than median = heading
 74
 75  const parts: string[] = [];
 76
 77  for (let pi = 0; pi < pages.length; pi++) {
 78    if (pi > 0) parts.push('<hr>');
 79
 80    const items = pages[pi]!;
 81    if (items.length === 0) continue;
 82
 83    // Group items into lines by proximity of y coordinates.
 84    const lines: PdfTextItem[][] = [];
 85    let currentLine: PdfTextItem[] = [];
 86    let lastY = items[0]!.y;
 87
 88    for (const item of items) {
 89      // Items more than 2pt apart vertically are on different lines.
 90      if (Math.abs(item.y - lastY) > 2 && currentLine.length > 0) {
 91        lines.push(currentLine);
 92        currentLine = [];
 93      }
 94      currentLine.push(item);
 95      lastY = item.y;
 96    }
 97    if (currentLine.length > 0) lines.push(currentLine);
 98
 99    // Merge lines into paragraphs (blank line gap = new paragraph).
100    let prevLineY = lines[0]?.[0]?.y ?? 0;
101    let paragraph: string[] = [];
102
103    const flushParagraph = (lineItems: PdfTextItem[]) => {
104      const text = lineItems.map(i => i.str).join(' ').trim();
105      if (!text) return;
106      const avgFontSize = lineItems.reduce((s, i) => s + i.fontSize, 0) / lineItems.length;
107      if (avgFontSize >= headingThreshold) {
108        // Larger text = heading
109        const level = avgFontSize >= headingThreshold * 1.4 ? 1 : 2;
110        parts.push(`<h${level}>${escapeHtml(text)}</h${level}>`);
111      } else {
112        paragraph.push(text);
113      }
114    };
115
116    const flushBlock = () => {
117      if (paragraph.length > 0) {
118        parts.push(`<p>${paragraph.map(escapeHtml).join('<br>')}</p>`);
119        paragraph = [];
120      }
121    };
122
123    for (const line of lines) {
124      const y = line[0]?.y ?? 0;
125      // Gap larger than 1.5× typical line spacing = new paragraph.
126      const gap = Math.abs(y - prevLineY);
127      const lineHeight = (line[0]?.fontSize ?? 12) * 1.5;
128      if (gap > lineHeight && paragraph.length > 0) {
129        flushBlock();
130      }
131
132      const avgFontSize = line.reduce((s, i) => s + i.fontSize, 0) / line.length;
133      if (avgFontSize >= headingThreshold) {
134        flushBlock();
135        flushParagraph(line);
136      } else {
137        const text = line.map(i => i.str).join(' ').trim();
138        if (text) paragraph.push(text);
139      }
140      prevLineY = y;
141    }
142    flushBlock();
143  }
144
145  return parts.join('\n');
146}
147
/**
 * Escape the characters that are significant in HTML text and attribute values
 * (`&`, `<`, `>`, `"`, `'`) so untrusted PDF text cannot inject markup.
 *
 * Done in a single pass, so already-produced entities are never re-escaped
 * within the same call.
 *
 * @param s Raw text.
 * @returns The text with the five special characters replaced by entities.
 */
function escapeHtml(s: string): string {
  const entities: Readonly<Record<string, string>> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return s.replace(/[&<>"']/g, ch => entities[ch] ?? ch);
}
155
/**
 * Check that an ArrayBuffer looks like a PDF.
 *
 * The `%PDF-` signature is searched for within the first 1024 bytes rather
 * than only at offset 0, because real-world files sometimes carry leading
 * bytes (BOM, whitespace, transport junk) before the header and PDF readers
 * tolerate that.
 *
 * @param arrayBuffer Raw file bytes; `null`/`undefined` is treated as invalid.
 * @returns `true` when the signature lies entirely within the first 1024 bytes.
 */
export function isValidPdf(arrayBuffer: ArrayBuffer): boolean {
  // %PDF- = 0x25 0x50 0x44 0x46 0x2D
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];
  const searchLimit = 1024;

  if (!arrayBuffer || arrayBuffer.byteLength < signature.length) return false;

  const view = new Uint8Array(arrayBuffer, 0, Math.min(arrayBuffer.byteLength, searchLimit));
  const lastStart = view.length - signature.length;
  for (let start = 0; start <= lastStart; start++) {
    if (signature.every((byte, offset) => view[start + offset] === byte)) return true;
  }
  return false;
}
165
/**
 * Turn the bytes of a PDF into an HTML string the TipTap editor can load.
 * No DOM access happens here, so the function can be exercised in plain unit tests.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns The generated HTML, or an empty paragraph when no text was extracted.
 */
export async function convertPdfToHtml(arrayBuffer: ArrayBuffer): Promise<string> {
  const markup = itemsToHtml(await extractTextItems(arrayBuffer));
  if (markup) {
    return markup;
  }
  // Nothing extractable: hand the editor a valid, empty document body.
  return '<p></p>';
}
175
/**
 * Load a user-selected PDF into the TipTap editor, replacing its content.
 * This is the UI-facing entry point: every outcome is reported through `showToast`.
 *
 * @param file The file picked by the user.
 * @param editor Editor instance whose content is replaced on success.
 * @param showToast Callback that displays a message for the given duration.
 * @returns Resolves once the import finished or failed; failures are logged
 *   and surfaced as a toast rather than rethrown.
 */
export async function importPdf(
  file: File,
  editor: Editor,
  showToast: (message: string, duration: number) => void,
): Promise<void> {
  try {
    const bytes = await file.arrayBuffer();

    if (isValidPdf(bytes)) {
      // Finish the conversion before touching the editor.
      const markup = await convertPdfToHtml(bytes);
      editor.commands.setContent(markup);
      showToast(`Imported "${file.name}" successfully`, 3000);
    } else {
      showToast('Invalid PDF — the file appears to be corrupt or not a PDF', 5000);
    }
  } catch (err) {
    console.error('pdf import error:', err);
    showToast('Failed to import PDF — it may be corrupt, encrypted, or use unsupported features', 5000);
  }
}
Configure Feed

Configure Feed