/**
* PDF import module for Atmosphere Docs.
*
* Uses pdf.js to extract text content from a PDF, then converts it
* to HTML suitable for the TipTap editor. Large text blocks become
* headings; normal text becomes paragraphs; page breaks become
* horizontal rules (`<hr>`).
*/
import type { Editor } from '@tiptap/core';
/**
 * One logical text chunk from a PDF page, reduced to the fields the
 * HTML conversion needs: its text, an estimated font size, and its
 * vertical position on the page.
 */
interface PdfTextItem {
/** The chunk's text, exactly as reported by the PDF text layer. */
str: string;
/** Approximate font size in points (estimated from the text transform matrix). */
fontSize: number;
/**
 * Vertical position measured as distance from the TOP of the page
 * (the extractor converts from PDF's bottom-left origin), so larger
 * values are further down the page.
 */
y: number;
}
/**
 * Collect text items from all pages and return them as structured chunks.
 *
 * Loads pdf.js lazily, reads at most `MAX_PAGES` pages (later pages are
 * ignored), and keeps only non-blank text items.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns One array of text items per processed page, in page order.
 * @throws Whatever pdf.js rejects with when the document cannot be loaded
 *   or a page cannot be read.
 */
async function extractTextItems(arrayBuffer: ArrayBuffer): Promise<PdfTextItem[][]> {
  // Dynamic import — pdfjs-dist is large; only load when needed.
  const pdfjsLib = await import('pdfjs-dist');

  // Point the worker at the bundled worker file. Vite resolves ?url to the
  // asset path at build time; in dev the dev server serves it directly.
  if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
    // @ts-ignore — Vite ?url suffix is not typed in TypeScript
    const { default: workerUrl } = await import('pdfjs-dist/build/pdf.worker.min.mjs?url');
    pdfjsLib.GlobalWorkerOptions.workerSrc = workerUrl as string;
  }

  const MAX_PAGES = 500;
  const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) });

  try {
    const pdf = await loadingTask.promise;
    const pageCount = Math.min(pdf.numPages, MAX_PAGES);
    const pages: PdfTextItem[][] = [];

    for (let p = 1; p <= pageCount; p++) {
      const page = await pdf.getPage(p);
      const content = await page.getTextContent();
      const pageHeight = page.getViewport({ scale: 1 }).height;

      const items: PdfTextItem[] = [];
      for (const item of content.items) {
        // Marked-content entries have no `str`; blank strings carry no text.
        if (!('str' in item) || !item.str.trim()) continue;

        // transform is a 6-element matrix [a, b, c, d, e, f].
        // (c, d) is the vertical scale vector, so its length approximates the
        // font height in user units even when the text is rotated (for
        // upright text c is 0 and this is simply |d|).
        // e (index 4) = x, f (index 5) = y from bottom-left.
        const transform: number[] | undefined = item.transform;
        const fontSize = Math.hypot(transform?.[2] ?? 0, transform?.[3] ?? 12);
        const yFromBottom = transform?.[5] ?? 0;

        // Convert to distance from top (PDF origin is bottom-left).
        items.push({ str: item.str, fontSize, y: pageHeight - yFromBottom });
      }
      pages.push(items);
    }

    return pages;
  } finally {
    // Release the document and its worker-side resources on success and on
    // failure alike; a cleanup error must not mask the real outcome.
    await loadingTask.destroy().catch(() => undefined);
  }
}
/**
 * Heuristically convert extracted PDF items to HTML.
 *
 * - Pages after the first are preceded by an `<hr>`.
 * - Items whose y positions differ by at most 2pt are joined into one line.
 * - A line whose average font size is at least 1.4× the document median
 *   becomes a heading (`<h1>` when at least 1.4× that threshold, else `<h2>`).
 * - Remaining lines are merged into `<p>` blocks, with `<br>` between lines;
 *   a vertical gap larger than 1.5× the line's font size starts a new block.
 *
 * @param pages Text items per page, in reading order within each page.
 * @returns HTML fragments joined with newlines, or `''` when `pages` is empty.
 */
function itemsToHtml(pages: PdfTextItem[][]): string {
  if (pages.length === 0) return '';

  // Find the median font size across all pages to calibrate heading detection.
  const allSizes = pages
    .flatMap(page => page.map(item => item.fontSize))
    .filter(size => size > 0)
    .sort((a, b) => a - b);
  const median = allSizes[Math.floor(allSizes.length / 2)] ?? 12;
  const headingThreshold = median * 1.4; // 40% larger than median = heading

  const parts: string[] = [];

  for (let pi = 0; pi < pages.length; pi++) {
    if (pi > 0) parts.push('<hr>');
    const items = pages[pi]!;
    if (items.length === 0) continue;

    // Group items into lines by proximity of y coordinates.
    const lines: PdfTextItem[][] = [];
    let currentLine: PdfTextItem[] = [];
    let lastY = items[0]!.y;
    for (const item of items) {
      // Items more than 2pt apart vertically are on different lines.
      if (Math.abs(item.y - lastY) > 2 && currentLine.length > 0) {
        lines.push(currentLine);
        currentLine = [];
      }
      currentLine.push(item);
      lastY = item.y;
    }
    if (currentLine.length > 0) lines.push(currentLine);

    // Lines of the paragraph currently being accumulated.
    let paragraph: string[] = [];
    const flushParagraph = (): void => {
      if (paragraph.length === 0) return;
      parts.push(`<p>${paragraph.map(escapeHtml).join('<br>')}</p>`);
      paragraph = [];
    };

    // Merge lines into paragraphs (blank line gap = new paragraph).
    let prevLineY = lines[0]?.[0]?.y ?? 0;
    for (const line of lines) {
      const y = line[0]?.y ?? 0;

      // Gap larger than 1.5× typical line spacing = new paragraph.
      const gap = Math.abs(y - prevLineY);
      const lineHeight = (line[0]?.fontSize ?? 12) * 1.5;
      if (gap > lineHeight) flushParagraph();
      prevLineY = y;

      const text = line.map(item => item.str).join(' ').trim();
      const avgFontSize =
        line.reduce((sum, item) => sum + item.fontSize, 0) / line.length;

      if (avgFontSize >= headingThreshold) {
        // Larger text = heading; it always terminates the running paragraph.
        flushParagraph();
        if (text) {
          const level = avgFontSize >= headingThreshold * 1.4 ? 1 : 2;
          parts.push(`<h${level}>${escapeHtml(text)}</h${level}>`);
        }
      } else if (text) {
        paragraph.push(text);
      }
    }
    flushParagraph();
  }

  return parts.join('\n');
}
/**
 * Escape text extracted from a PDF so it can be embedded in HTML markup.
 *
 * Replaces `&`, `<`, `>` and `"` with their character entities. The
 * ampersand is handled first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param s Raw text.
 * @returns The text with HTML-significant characters replaced by entities.
 */
function escapeHtml(s: string): string {
  return s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
/**
 * Cheap sanity check that a buffer begins with the PDF magic bytes `%PDF-`.
 *
 * Only the first five bytes are inspected; nothing else about the file is
 * verified.
 *
 * @param arrayBuffer Raw file contents.
 * @returns `true` when the buffer holds at least five bytes and they spell
 *   `%PDF-`; `false` otherwise (including for a missing buffer).
 */
export function isValidPdf(arrayBuffer: ArrayBuffer): boolean {
  // "%PDF-" as raw byte values.
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];

  if (!arrayBuffer) return false;
  if (arrayBuffer.byteLength < signature.length) return false;

  const bytes = new Uint8Array(arrayBuffer);
  return signature.every((expected, index) => bytes[index] === expected);
}
/**
 * Extract text from a PDF ArrayBuffer and return as TipTap-compatible HTML.
 * Pure async — testable without DOM.
 *
 * @param arrayBuffer Raw bytes of the PDF file.
 * @returns The generated HTML; an empty string when no pages were extracted.
 * @throws Propagates any error raised while loading or reading the PDF.
 */
export async function convertPdfToHtml(arrayBuffer: ArrayBuffer): Promise<string> {
  const pages = await extractTextItems(arrayBuffer);
  // itemsToHtml always returns a string, so no fallback value is needed.
  return itemsToHtml(pages);
}
/**
 * Import a PDF File into the TipTap editor.
 * DOM-coupled entry point.
 *
 * Reads the file, rejects anything that does not start with the PDF
 * signature, converts the text to HTML, and sets it as the editor content.
 * Every outcome is reported through `showToast`; errors are logged and
 * never rethrown to the caller.
 *
 * @param file The user-selected file.
 * @param editor The TipTap editor that receives the converted content.
 * @param showToast Callback used to show a message for `duration` milliseconds.
 */
export async function importPdf(
  file: File,
  editor: Editor,
  showToast: (message: string, duration: number) => void,
): Promise<void> {
  try {
    const arrayBuffer = await file.arrayBuffer();

    // Fail fast on non-PDF input before loading the heavy pdf.js bundle.
    if (!isValidPdf(arrayBuffer)) {
      showToast('Invalid PDF — the file appears to be corrupt or not a PDF', 5000);
      return;
    }

    const html = await convertPdfToHtml(arrayBuffer);
    editor.commands.setContent(html);
    showToast(`Imported "${file.name}" successfully`, 3000);
  } catch (err: unknown) {
    console.error('pdf import error:', err);
    showToast('Failed to import PDF — it may be corrupt, encrypted, or use unsupported features', 5000);
  }
}