src/search-index.ts at main · scottlanoue.com/atmosphere-office

scottlanoue.com / atmosphere-office
fork
Full document, spreadsheet, slideshow, and diagram tooling
fork
atmosphere-office / src / search-index.ts
at main 235 lines 5.9 kB view raw
wrap content
Scott Lanoue feat: full-text search index across documents (#55) 5w ago
b9423275
  1/**
  2 * Full-Text Search Index — client-side search across decrypted documents.
  3 *
  4 * Pure logic module: tokenization, indexing, ranked search.
  5 * Since all content is E2EE, search runs entirely in the browser
  6 * over decrypted document names and content snippets.
  7 */
  8
  9export interface SearchDocument {
 10  id: string;
 11  name: string;
 12  type: 'doc' | 'sheet';
 13  content: string;
 14}
 15
 16export interface SearchResult {
 17  docId: string;
 18  name: string;
 19  type: 'doc' | 'sheet';
 20  score: number;
 21  /** Snippet with match context */
 22  snippet: string;
 23  /** Positions of matches in the content */
 24  matches: { field: 'name' | 'content'; start: number; end: number }[];
 25}
 26
 27/**
 28 * Tokenize text into lowercase words for indexing.
 29 */
 30export function tokenize(text: string): string[] {
 31  if (!text) return [];
 32  return text
 33    .toLowerCase()
 34    .replace(/[^\w\s]/g, ' ')
 35    .split(/\s+/)
 36    .filter(w => w.length > 0);
 37}
 38
 39/**
 40 * Strip HTML tags from content, returning plain text.
 41 */
 42export function stripHtml(html: string): string {
 43  if (!html) return '';
 44  return html
 45    .replace(/<[^>]+>/g, ' ')
 46    .replace(/&nbsp;/g, ' ')
 47    .replace(/&amp;/g, '&')
 48    .replace(/&lt;/g, '<')
 49    .replace(/&gt;/g, '>')
 50    .replace(/&quot;/g, '"')
 51    .replace(/&#39;/g, "'")
 52    .replace(/\s+/g, ' ')
 53    .trim();
 54}
 55
 56/**
 57 * Build an inverted index from a collection of documents.
 58 * Maps each token to the set of document IDs containing it.
 59 */
 60export function buildIndex(
 61  documents: SearchDocument[],
 62): Map<string, Set<string>> {
 63  const index = new Map<string, Set<string>>();
 64
 65  for (const doc of documents) {
 66    const nameTokens = tokenize(doc.name);
 67    const contentTokens = tokenize(stripHtml(doc.content));
 68    const allTokens = new Set([...nameTokens, ...contentTokens]);
 69
 70    for (const token of allTokens) {
 71      if (!index.has(token)) {
 72        index.set(token, new Set());
 73      }
 74      index.get(token)!.add(doc.id);
 75    }
 76  }
 77
 78  return index;
 79}
 80
 81/**
 82 * Extract a snippet around the first match of a query in text.
 83 * Returns ~80 chars of context around the match.
 84 */
 85export function extractSnippet(
 86  text: string,
 87  query: string,
 88  contextChars = 40,
 89): string {
 90  if (!text || !query) return '';
 91
 92  const lower = text.toLowerCase();
 93  const qLower = query.toLowerCase();
 94  const idx = lower.indexOf(qLower);
 95
 96  if (idx === -1) {
 97    // No exact match, return start of text
 98    return text.length > contextChars * 2
 99      ? text.slice(0, contextChars * 2) + '…'
100      : text;
101  }
102
103  const start = Math.max(0, idx - contextChars);
104  const end = Math.min(text.length, idx + query.length + contextChars);
105
106  let snippet = text.slice(start, end);
107  if (start > 0) snippet = '…' + snippet;
108  if (end < text.length) snippet = snippet + '…';
109
110  return snippet;
111}
112
/**
 * Locate every case-insensitive occurrence of `query` inside `text`.
 *
 * The scan resumes one character after each hit, so overlapping occurrences
 * are all reported.
 *
 * @param text - Text to scan.
 * @param query - Phrase to look for.
 * @returns `{ start, end }` ranges in ascending order (`end` is exclusive);
 *          empty when either argument is empty or nothing matches.
 */
export function findMatchPositions(
  text: string,
  query: string,
): { start: number; end: number }[] {
  if (!text || !query) return [];

  const haystack = text.toLowerCase();
  const needle = query.toLowerCase();
  const width = query.length;
  const spans: { start: number; end: number }[] = [];

  for (
    let at = haystack.indexOf(needle);
    at !== -1;
    at = haystack.indexOf(needle, at + 1)
  ) {
    spans.push({ start: at, end: at + width });
  }

  return spans;
}
136
/**
 * Search documents by query string.
 *
 * Candidates are gathered from the inverted index, then scored:
 *   - Indexed token equal to a query token: 1 point per unique query token
 *   - Indexed token that merely starts with a query token: 0.5 points each
 *   - Name equals the query (case-insensitive): 10 points
 *   - Name contains the query: 5 points
 *   - Stripped content contains the query as a phrase: 3 points
 *
 * The query is trimmed once and that trimmed form is used for scoring, match
 * positions and the snippet alike, so surrounding whitespace never makes the
 * reported matches disagree with the score.
 *
 * @param query - User-entered search text.
 * @param documents - Documents the index was built from; candidate IDs that
 *                    are not present here are skipped.
 * @param index - Inverted index mapping tokens to document IDs.
 * @returns Results sorted by score (descending), ties broken by name.
 *          Empty when the query is blank or contains no tokens.
 */
export function searchDocuments(
  query: string,
  documents: SearchDocument[],
  index: Map<string, Set<string>>,
): SearchResult[] {
  const phrase = query ? query.trim() : '';
  if (!phrase) return [];

  const phraseLower = phrase.toLowerCase();
  // De-duplicate so a repeated word ("foo foo") is not credited twice.
  const queryTokens = new Set(tokenize(phrase));
  if (queryTokens.size === 0) return [];

  // Find candidate documents from the inverted index.
  const candidates = new Map<string, number>();
  const credit = (docIds: Iterable<string>, points: number): void => {
    for (const docId of docIds) {
      candidates.set(docId, (candidates.get(docId) ?? 0) + points);
    }
  };

  for (const token of queryTokens) {
    // Exact token match.
    const exact = index.get(token);
    if (exact) credit(exact, 1);

    // Prefix match for partial words.
    for (const [indexToken, docIds] of index) {
      if (indexToken !== token && indexToken.startsWith(token)) {
        credit(docIds, 0.5);
      }
    }
  }

  // One-time id lookup table; the first document wins on duplicate IDs,
  // matching what a linear `find` would return.
  const docsById = new Map<string, SearchDocument>();
  for (const doc of documents) {
    if (!docsById.has(doc.id)) docsById.set(doc.id, doc);
  }

  // Score and rank candidates.
  const results: SearchResult[] = [];

  for (const [docId, tokenScore] of candidates) {
    const doc = docsById.get(docId);
    if (!doc) continue;

    let score = tokenScore;
    const matches: SearchResult['matches'] = [];

    const nameLower = doc.name.toLowerCase();
    const plainContent = stripHtml(doc.content);

    // Name scoring.
    if (nameLower === phraseLower) {
      score += 10;
    } else if (nameLower.includes(phraseLower)) {
      score += 5;
    }

    // Name match positions.
    for (const pos of findMatchPositions(doc.name, phrase)) {
      matches.push({ field: 'name', ...pos });
    }

    // Content phrase match bonus.
    if (plainContent.toLowerCase().includes(phraseLower)) {
      score += 3;
    }

    // Content match positions.
    for (const pos of findMatchPositions(plainContent, phrase)) {
      matches.push({ field: 'content', ...pos });
    }

    results.push({
      docId: doc.id,
      name: doc.name,
      type: doc.type,
      score,
      snippet: extractSnippet(plainContent, phrase),
      matches,
    });
  }

  // Sort by score descending, then by name.
  results.sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    return a.name.localeCompare(b.name);
  });

  return results;
}
Configure Feed

Configure Feed