perf: don't cache timestampsNs in concordance — compute on demand

+36 -19

1 changed file

expand all

apps

ionosphere-appview

src

routes.ts

+36 -19

apps/ionosphere-appview/src/routes.ts

··· 5 5 import { readFileSync, existsSync, writeFileSync, mkdirSync } from "node:fs"; 6 6 import path from "node:path"; 7 7 import { 8 + decode, 8 9 decodeToDocument, 9 10 } from "@ionosphere/format/transcript-encoding"; 11 + import { lemmatize } from "./lemmatize.js"; 10 12 11 13 export function createRoutes(db: Database.Database): Hono { 12 14 const app = new Hono(); ··· 534 536 app.get("/xrpc/tv.ionosphere.getConcordance", (c) => { 535 537 // Serve from cache if available 536 538 if (indexCache) { 537 - return c.json({ entries: stripTimestamps(indexCache.entries) }); 539 + return c.json({ entries: indexCache.entries }); 538 540 } 539 541 540 542 console.log("[index] Building concordance (this takes a couple minutes the first time)..."); ··· 577 579 })); 578 580 579 581 const entries = buildConcordance(transcripts, concepts); 580 - indexCache = { entries, builtAt: Date.now() }; 581 - console.log(`[index] Concordance built: ${entries.length} entries in ${((Date.now() - start) / 1000).toFixed(1)}s`); 582 + // Strip timestampsNs from cache to save memory — they're computed on demand by getTimecodes 583 + const stripped = stripTimestamps(entries); 584 + indexCache = { entries: stripped, builtAt: Date.now() }; 585 + console.log(`[index] Concordance built: ${stripped.length} entries in ${((Date.now() - start) / 1000).toFixed(1)}s`); 582 586 583 - return c.json({ entries: stripTimestamps(entries) }); 587 + return c.json({ entries: stripped }); 584 588 }); 585 589 586 590 // On-demand timecodes for a specific term + talk 591 + // Computed from the transcript rather than cached — keeps memory low 587 592 app.get("/xrpc/tv.ionosphere.getTimecodes", (c) => { 588 593 const term = c.req.query("term"); 589 594 const rkey = c.req.query("rkey"); 590 595 if (!term || !rkey) return c.json({ error: "missing term or rkey" }, 400); 591 - if (!indexCache) return c.json({ timestamps: [] }); 596 + 597 + const row = db.prepare( 598 + `SELECT tr.text, tr.start_ms, tr.timings 599 + FROM transcripts tr 600 + JOIN talks t ON tr.talk_uri = t.uri 601 + WHERE t.rkey = ?` 602 + ).get(rkey) as any; 603 + 604 + if (!row) return c.json({ timestamps: [] }); 605 + 606 + const decoded = decode({ text: row.text, startMs: row.start_ms, timings: JSON.parse(row.timings) }); 607 + const words = row.text.split(/\s+/).filter((w: string) => w.length > 0); 608 + const termLower = term.toLowerCase(); 609 + const termWords = termLower.split(" "); 610 + const timestamps: number[] = []; 592 611 593 - // Search entries and subentries for the matching term + talk 594 - for (const entry of indexCache.entries) { 595 - if (entry.term.toLowerCase() === term.toLowerCase()) { 596 - for (const talk of entry.talks) { 597 - if (talk.rkey === rkey && talk.timestampsNs) { 598 - return c.json({ timestamps: talk.timestampsNs }); 599 - } 612 + for (let i = 0; i < words.length; i++) { 613 + const wordLower = words[i].toLowerCase().replace(/[^a-z0-9'-]/g, "").replace(/^['-]+/, "").replace(/['-]+$/, ""); 614 + // Match single words or bigrams 615 + if (termWords.length === 1) { 616 + if (wordLower === termLower || lemmatize(wordLower) === termLower) { 617 + if (i < decoded.words.length) timestamps.push(Math.round(decoded.words[i].start * 1e9)); 600 618 } 601 - for (const sub of entry.subentries || []) { 602 - for (const talk of sub.talks) { 603 - if (talk.rkey === rkey && talk.timestampsNs) { 604 - return c.json({ timestamps: talk.timestampsNs }); 605 - } 606 - } 619 + } else if (termWords.length === 2 && i < words.length - 1) { 620 + const nextLower = words[i + 1].toLowerCase().replace(/[^a-z0-9'-]/g, "").replace(/^['-]+/, "").replace(/['-]+$/, ""); 621 + if (wordLower === termWords[0] && nextLower === termWords[1]) { 622 + if (i < decoded.words.length) timestamps.push(Math.round(decoded.words[i].start * 1e9)); 607 623 } 608 624 } 609 625 } 610 - return c.json({ timestamps: [] }); 626 + 627 + return c.json({ timestamps: timestamps.sort((a, b) => a - b) }); 611 628 }); 612 629 613 630 // Invalidate all caches (call after data changes)

Configure Feed

Configure Feed