Ionosphere.tv
3
fork

Configure Feed

Select the types of activity you want to include in your feed.

perf: don't cache timestampsNs in concordance — compute on demand

The concordance cache held millions of nanosecond timestamps (one per
word occurrence across all transcripts). These were only used by the
getTimecodes endpoint. Timestamps are now computed from the transcript
each time getTimecodes is called, keeping the concordance cache lean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+36 -19
+36 -19
apps/ionosphere-appview/src/routes.ts
··· 5 5 import { readFileSync, existsSync, writeFileSync, mkdirSync } from "node:fs"; 6 6 import path from "node:path"; 7 7 import { 8 + decode, 8 9 decodeToDocument, 9 10 } from "@ionosphere/format/transcript-encoding"; 11 + import { lemmatize } from "./lemmatize.js"; 10 12 11 13 export function createRoutes(db: Database.Database): Hono { 12 14 const app = new Hono(); ··· 534 536 app.get("/xrpc/tv.ionosphere.getConcordance", (c) => { 535 537 // Serve from cache if available 536 538 if (indexCache) { 537 - return c.json({ entries: stripTimestamps(indexCache.entries) }); 539 + return c.json({ entries: indexCache.entries }); 538 540 } 539 541 540 542 console.log("[index] Building concordance (this takes a couple minutes the first time)..."); ··· 577 579 })); 578 580 579 581 const entries = buildConcordance(transcripts, concepts); 580 - indexCache = { entries, builtAt: Date.now() }; 581 - console.log(`[index] Concordance built: ${entries.length} entries in ${((Date.now() - start) / 1000).toFixed(1)}s`); 582 + // Strip timestampsNs from cache to save memory — they're computed on demand by getTimecodes 583 + const stripped = stripTimestamps(entries); 584 + indexCache = { entries: stripped, builtAt: Date.now() }; 585 + console.log(`[index] Concordance built: ${stripped.length} entries in ${((Date.now() - start) / 1000).toFixed(1)}s`); 582 586 583 - return c.json({ entries: stripTimestamps(entries) }); 587 + return c.json({ entries: stripped }); 584 588 }); 585 589 586 590 // On-demand timecodes for a specific term + talk 591 + // Computed from the transcript rather than cached — keeps memory low 587 592 app.get("/xrpc/tv.ionosphere.getTimecodes", (c) => { 588 593 const term = c.req.query("term"); 589 594 const rkey = c.req.query("rkey"); 590 595 if (!term || !rkey) return c.json({ error: "missing term or rkey" }, 400); 591 - if (!indexCache) return c.json({ timestamps: [] }); 596 + 597 + const row = db.prepare( 598 + `SELECT tr.text, tr.start_ms, tr.timings 599 + FROM 
transcripts tr 600 + JOIN talks t ON tr.talk_uri = t.uri 601 + WHERE t.rkey = ?` 602 + ).get(rkey) as any; 603 + 604 + if (!row) return c.json({ timestamps: [] }); 605 + 606 + const decoded = decode({ text: row.text, startMs: row.start_ms, timings: JSON.parse(row.timings) }); 607 + const words = row.text.split(/\s+/).filter((w: string) => w.length > 0); 608 + const termLower = term.toLowerCase(); 609 + const termWords = termLower.split(" "); 610 + const timestamps: number[] = []; 592 611 593 - // Search entries and subentries for the matching term + talk 594 - for (const entry of indexCache.entries) { 595 - if (entry.term.toLowerCase() === term.toLowerCase()) { 596 - for (const talk of entry.talks) { 597 - if (talk.rkey === rkey && talk.timestampsNs) { 598 - return c.json({ timestamps: talk.timestampsNs }); 599 - } 612 + for (let i = 0; i < words.length; i++) { 613 + const wordLower = words[i].toLowerCase().replace(/[^a-z0-9'-]/g, "").replace(/^['-]+/, "").replace(/['-]+$/, ""); 614 + // Match single words or bigrams 615 + if (termWords.length === 1) { 616 + if (wordLower === termLower || lemmatize(wordLower) === termLower) { 617 + if (i < decoded.words.length) timestamps.push(Math.round(decoded.words[i].start * 1e9)); 600 618 } 601 - for (const sub of entry.subentries || []) { 602 - for (const talk of sub.talks) { 603 - if (talk.rkey === rkey && talk.timestampsNs) { 604 - return c.json({ timestamps: talk.timestampsNs }); 605 - } 606 - } 619 + } else if (termWords.length === 2 && i < words.length - 1) { 620 + const nextLower = words[i + 1].toLowerCase().replace(/[^a-z0-9'-]/g, "").replace(/^['-]+/, "").replace(/['-]+$/, ""); 621 + if (wordLower === termWords[0] && nextLower === termWords[1]) { 622 + if (i < decoded.words.length) timestamps.push(Math.round(decoded.words[i].start * 1e9)); 607 623 } 608 624 } 609 625 } 610 - return c.json({ timestamps: [] }); 626 + 627 + return c.json({ timestamps: timestamps.sort((a, b) => a - b) }); 611 628 }); 612 629 613 630 // 
Invalidate all caches (call after data changes)