feat: full-text search index across documents (#55) · scottlanoue.com/atmosphere-office@b942327

+417

2 changed files

expand all

src

search-index.ts

tests

search-index.test.ts

+235

src/search-index.ts

/**
 * Full-Text Search Index — client-side search across decrypted documents.
 *
 * Pure logic module: tokenization, indexing, ranked search.
 * Since all content is E2EE, search runs entirely in the browser
 * over decrypted document names and content snippets.
 */

export interface SearchDocument {
  id: string;
  name: string;
  type: 'doc' | 'sheet';
  content: string;
}

export interface SearchResult {
  docId: string;
  name: string;
  type: 'doc' | 'sheet';
  score: number;
  /** Snippet with match context */
  snippet: string;
  /** Positions of phrase matches, tagged with the field they were found in */
  matches: { field: 'name' | 'content'; start: number; end: number }[];
}

/**
 * Named/numeric entities decoded by `stripHtml`, keyed by the text between
 * the leading ampersand and the trailing semicolon.
 */
const HTML_ENTITIES: Readonly<Record<string, string>> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  '#39': "'",
};

/**
 * Tokenize text into lowercase words for indexing.
 *
 * Letters, combining marks, digits and underscores from any script are kept;
 * every other non-whitespace character is treated as a separator. For ASCII
 * input this is the same as splitting on non-`\w` characters.
 *
 * @param text - Raw text to split.
 * @returns Tokens in order of appearance (duplicates preserved); empty for
 *   empty input.
 */
export function tokenize(text: string): string[] {
  if (!text) return [];
  return text
    .toLowerCase()
    // Unicode-aware: plain \w would drop accented and non-Latin letters.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    .split(/\s+/)
    .filter(w => w.length > 0);
}

/**
 * Strip HTML tags from content, returning plain text.
 *
 * Tags become spaces, the entities listed in `HTML_ENTITIES` are decoded,
 * runs of whitespace are collapsed to one space, and the result is trimmed.
 *
 * @param html - Markup (or plain text) to flatten.
 * @returns Plain text; empty string for empty input.
 */
export function stripHtml(html: string): string {
  if (!html) return '';
  return html
    .replace(/<[^>]+>/g, ' ')
    // Decode every entity in ONE pass so the output of one replacement is
    // never re-read as another entity (an escaped ampersand followed by
    // "lt;" must stay literal text rather than turn into "<").
    .replace(
      /&(nbsp|amp|lt|gt|quot|#39);/g,
      (entity: string, name: string) => HTML_ENTITIES[name] ?? entity,
    )
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Build an inverted index from a collection of documents.
 * Maps each token to the set of document IDs containing it.
 *
 * Tokens come from both the document name and its tag-stripped content.
 *
 * @param documents - Documents to index.
 * @returns Map from token to the IDs of documents containing that token.
 */
export function buildIndex(
  documents: SearchDocument[],
): Map<string, Set<string>> {
  const index = new Map<string, Set<string>>();

  for (const doc of documents) {
    const nameTokens = tokenize(doc.name);
    const contentTokens = tokenize(stripHtml(doc.content));
    const allTokens = new Set([...nameTokens, ...contentTokens]);

    for (const token of allTokens) {
      let docIds = index.get(token);
      if (!docIds) {
        docIds = new Set();
        index.set(token, docIds);
      }
      docIds.add(doc.id);
    }
  }

  return index;
}

/**
 * Extract a snippet around the first match of a query in text.
 * Returns ~80 chars of context around the match (with the default
 * `contextChars`), adding an ellipsis on each side that was cut.
 *
 * @param text - Plain text to take the snippet from.
 * @param query - Phrase to locate (case-insensitive).
 * @param contextChars - Characters of context kept on each side of the match.
 * @returns The snippet; when the phrase does not occur, the first
 *   `contextChars * 2` characters of `text`; empty string if either input
 *   is empty.
 */
export function extractSnippet(
  text: string,
  query: string,
  contextChars = 40,
): string {
  if (!text || !query) return '';

  const lower = text.toLowerCase();
  const qLower = query.toLowerCase();
  const idx = lower.indexOf(qLower);

  if (idx === -1) {
    // No exact match, return start of text
    return text.length > contextChars * 2
      ? text.slice(0, contextChars * 2) + '…'
      : text;
  }

  const start = Math.max(0, idx - contextChars);
  const end = Math.min(text.length, idx + query.length + contextChars);

  let snippet = text.slice(start, end);
  if (start > 0) snippet = '…' + snippet;
  if (end < text.length) snippet = snippet + '…';

  return snippet;
}

/**
 * Find match positions of a query within text (case-insensitive).
 *
 * The scan advances one character after each hit, so overlapping
 * occurrences are all reported.
 *
 * @param text - Text to scan.
 * @param query - Phrase to look for.
 * @returns `{ start, end }` offsets (end exclusive) in order of appearance;
 *   empty if either input is empty.
 */
export function findMatchPositions(
  text: string,
  query: string,
): { start: number; end: number }[] {
  if (!text || !query) return [];

  const positions: { start: number; end: number }[] = [];
  const lower = text.toLowerCase();
  const qLower = query.toLowerCase();

  let pos = 0;
  while (pos < lower.length) {
    const idx = lower.indexOf(qLower, pos);
    if (idx === -1) break;
    positions.push({ start: idx, end: idx + query.length });
    pos = idx + 1;
  }

  return positions;
}

/**
 * Search documents by query string.
 * Scores results by:
 * - Name match (exact): 10 points
 * - Name match (contains): 5 points
 * - Content token matches: 1 point per unique query token found in the index
 * - Prefix matches: 0.5 points per index token that a query token is a
 *   proper prefix of
 * - Content phrase match: 3 bonus points
 *
 * `matches` and `snippet` are computed for the whole trimmed phrase, so a
 * multi-word query whose words match separately yields no match positions
 * and a snippet taken from the start of the content.
 *
 * @param query - User query; surrounding whitespace is ignored.
 * @param documents - Documents the index was built from.
 * @param index - Inverted index as produced by `buildIndex`.
 * @returns Results sorted by score descending, then by name. Empty when the
 *   query is blank or contains no tokens.
 */
export function searchDocuments(
  query: string,
  documents: SearchDocument[],
  index: Map<string, Set<string>>,
): SearchResult[] {
  if (!query || !query.trim()) return [];

  // One trimmed phrase drives scoring, match positions and the snippet, so
  // padding around the query cannot make them disagree.
  const phrase = query.trim();
  const phraseLower = phrase.toLowerCase();
  // Repeating a word in the query must not inflate the score.
  const queryTokens = [...new Set(tokenize(phrase))];

  if (queryTokens.length === 0) return [];

  // Find candidate documents from inverted index
  const candidates = new Map<string, number>();
  const addScore = (docId: string, points: number): void => {
    candidates.set(docId, (candidates.get(docId) ?? 0) + points);
  };

  for (const token of queryTokens) {
    // Exact token match
    for (const docId of index.get(token) ?? []) {
      addScore(docId, 1);
    }

    // Prefix match for partial words
    for (const [indexToken, docIds] of index) {
      if (indexToken !== token && indexToken.startsWith(token)) {
        for (const docId of docIds) {
          addScore(docId, 0.5);
        }
      }
    }
  }

  // Resolve ids in O(1); on duplicate ids the first document wins.
  const docsById = new Map<string, SearchDocument>();
  for (const doc of documents) {
    if (!docsById.has(doc.id)) docsById.set(doc.id, doc);
  }

  // Score and rank candidates
  const results: SearchResult[] = [];

  for (const [docId, tokenScore] of candidates) {
    const doc = docsById.get(docId);
    if (!doc) continue; // index entry without a backing document

    let score = tokenScore;
    const matches: SearchResult['matches'] = [];

    const nameLower = doc.name.toLowerCase();
    const plainContent = stripHtml(doc.content);

    // Name scoring
    if (nameLower === phraseLower) {
      score += 10;
    } else if (nameLower.includes(phraseLower)) {
      score += 5;
    }

    // Name match positions
    for (const pos of findMatchPositions(doc.name, phrase)) {
      matches.push({ field: 'name', ...pos });
    }

    // Content phrase match bonus
    if (plainContent.toLowerCase().includes(phraseLower)) {
      score += 3;
    }

    // Content match positions
    for (const pos of findMatchPositions(plainContent, phrase)) {
      matches.push({ field: 'content', ...pos });
    }

    results.push({
      docId: doc.id,
      name: doc.name,
      type: doc.type,
      score,
      snippet: extractSnippet(plainContent, phrase),
      matches,
    });
  }

  // Sort by score descending, then by name
  results.sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    return a.name.localeCompare(b.name);
  });

  return results;
}

+182

tests/search-index.test.ts

// Unit tests for the client-side full-text search index (vitest).
import { describe, it, expect } from 'vitest';
import {
  tokenize,
  stripHtml,
  buildIndex,
  extractSnippet,
  findMatchPositions,
  searchDocuments,
  type SearchDocument,
} from '../src/search-index.js';

// Shared fixture: two documents mention "meeting", two mention "roadmap".
const DOCS: SearchDocument[] = [
  { id: 'a1', name: 'Meeting Notes', type: 'doc', content: '<h1>Team Meeting</h1><p>Discussed the project roadmap and Q2 goals.</p>' },
  { id: 'b2', name: 'Budget 2026', type: 'sheet', content: 'Revenue projections and expense tracking for fiscal year 2026.' },
  { id: 'c3', name: 'Project Roadmap', type: 'doc', content: '<h1>Roadmap</h1><p>Phase 1: Research. Phase 2: Development. Phase 3: Launch.</p>' },
  { id: 'd4', name: 'Weekly Planner', type: 'doc', content: 'Monday tasks, Tuesday tasks, Wednesday review meeting.' },
];

describe('Search Index', () => {
  describe('tokenize', () => {
    it('splits text into lowercase words', () => {
      expect(tokenize('Hello World')).toEqual(['hello', 'world']);
    });

    it('strips punctuation', () => {
      expect(tokenize("it's a test.")).toEqual(['it', 's', 'a', 'test']);
    });

    it('returns empty array for empty string', () => {
      expect(tokenize('')).toEqual([]);
    });

    it('handles multiple spaces', () => {
      expect(tokenize(' hello world ')).toEqual(['hello', 'world']);
    });
  });

  describe('stripHtml', () => {
    it('removes HTML tags', () => {
      expect(stripHtml('<h1>Title</h1><p>Text</p>')).toBe('Title Text');
    });

    it('decodes HTML entities', () => {
      // Input must be entity-encoded; the expected value is the decoded text.
      expect(stripHtml('&amp; &lt; &gt; &quot; &#39;')).toBe("& < > \" '");
    });

    it('handles empty string', () => {
      expect(stripHtml('')).toBe('');
    });

    it('collapses whitespace', () => {
      expect(stripHtml('<p> lots of space </p>')).toBe('lots of space');
    });
  });

  describe('buildIndex', () => {
    it('builds inverted index from documents', () => {
      const index = buildIndex(DOCS);
      expect(index.has('meeting')).toBe(true);
      expect(index.get('meeting')!.has('a1')).toBe(true);
      expect(index.get('meeting')!.has('d4')).toBe(true);
    });

    it('indexes document names', () => {
      const index = buildIndex(DOCS);
      expect(index.get('budget')!.has('b2')).toBe(true);
    });

    it('returns empty map for empty documents', () => {
      const index = buildIndex([]);
      expect(index.size).toBe(0);
    });
  });

  describe('extractSnippet', () => {
    const text = 'The quick brown fox jumps over the lazy dog near the river bank.';

    it('extracts context around match', () => {
      const snippet = extractSnippet(text, 'fox', 10);
      expect(snippet).toContain('fox');
      expect(snippet.length).toBeLessThan(text.length + 5);
    });

    it('returns start of text when no match', () => {
      const snippet = extractSnippet(text, 'xyz', 20);
      expect(snippet).toContain('The quick');
    });

    it('handles empty inputs', () => {
      expect(extractSnippet('', 'query')).toBe('');
      expect(extractSnippet('text', '')).toBe('');
    });

    it('adds ellipsis for truncated snippets', () => {
      const snippet = extractSnippet(text, 'jumps', 5);
      expect(snippet.startsWith('…')).toBe(true);
      expect(snippet.endsWith('…')).toBe(true);
    });
  });

  describe('findMatchPositions', () => {
    it('finds all occurrences', () => {
      const positions = findMatchPositions('the cat and the dog', 'the');
      expect(positions).toHaveLength(2);
      expect(positions[0]).toEqual({ start: 0, end: 3 });
      expect(positions[1]).toEqual({ start: 12, end: 15 });
    });

    it('is case-insensitive', () => {
      const positions = findMatchPositions('Hello HELLO hello', 'hello');
      expect(positions).toHaveLength(3);
    });

    it('returns empty for no match', () => {
      expect(findMatchPositions('abc', 'xyz')).toEqual([]);
    });

    it('handles empty inputs', () => {
      expect(findMatchPositions('', 'test')).toEqual([]);
      expect(findMatchPositions('test', '')).toEqual([]);
    });
  });

  describe('searchDocuments', () => {
    const index = buildIndex(DOCS);

    it('finds documents by name', () => {
      const results = searchDocuments('Budget', DOCS, index);
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].docId).toBe('b2');
    });

    it('finds documents by content', () => {
      const results = searchDocuments('roadmap', DOCS, index);
      expect(results.length).toBeGreaterThanOrEqual(2);
      const ids = results.map(r => r.docId);
      expect(ids).toContain('a1');
      expect(ids).toContain('c3');
    });

    it('ranks exact name match highest', () => {
      const results = searchDocuments('Project Roadmap', DOCS, index);
      expect(results[0].docId).toBe('c3');
    });

    it('returns empty for empty query', () => {
      expect(searchDocuments('', DOCS, index)).toEqual([]);
      expect(searchDocuments(' ', DOCS, index)).toEqual([]);
    });

    it('returns empty when no matches', () => {
      const results = searchDocuments('xyznonexistent', DOCS, index);
      expect(results).toEqual([]);
    });

    it('includes snippets with match context', () => {
      const results = searchDocuments('meeting', DOCS, index);
      const meetingResult = results.find(r => r.docId === 'a1');
      expect(meetingResult).toBeDefined();
      expect(meetingResult!.snippet).toBeTruthy();
    });

    it('includes match positions', () => {
      const results = searchDocuments('meeting', DOCS, index);
      const meetingResult = results.find(r => r.docId === 'a1');
      expect(meetingResult!.matches.length).toBeGreaterThan(0);
    });

    it('supports partial word matching via prefix', () => {
      const results = searchDocuments('meet', DOCS, index);
      expect(results.length).toBeGreaterThan(0);
      const ids = results.map(r => r.docId);
      expect(ids).toContain('a1');
    });

    it('finds across multiple query tokens', () => {
      const results = searchDocuments('team goals', DOCS, index);
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].docId).toBe('a1');
    });
  });
});

Configure Feed

Configure Feed