···
 		return nil, err
 	}

+	// Bound the connection pool. With a remote target (Bunny Database), each
+	// idle conn is a stable libsql stream; keeping a handful warm avoids
+	// reconnect cost, while capping the total prevents runaway contention.
+	// Short lifetimes ensure we recycle past any idle-side disconnects and
+	// drop any poisoned conn that survived IsPoisonedTxErr eviction.
+	db.SetMaxOpenConns(8)
+	db.SetMaxIdleConns(4)
+	db.SetConnMaxLifetime(5 * time.Minute)
+	db.SetConnMaxIdleTime(2 * time.Minute)
+
 	// Check if this is an existing database with migrations applied
 	isExisting, err := hasAppliedMigrations(db)
 	if err != nil {
···
 	if err != nil {
 		return fmt.Errorf("failed to begin transaction for migration %d: %w", m.Version, err)
 	}
+	// Deferred rollback is a no-op once Commit succeeds; it guards against
+	// panics and any early return that forgets an explicit rollback.
+	defer func() { _ = tx.Rollback() }()

 	// Split query into individual statements and execute each
 	// go-sqlite3's Exec() doesn't reliably execute all statements in multi-statement queries
 	statements := splitSQLStatements(m.Query)
 	for i, stmt := range statements {
 		if _, err := tx.Exec(stmt); err != nil {
-			tx.Rollback()
 			return fmt.Errorf("failed to apply migration %d (%s) statement %d: %w", m.Version, m.Name, i+1, err)
 		}
 	}

 	// Record migration
 	if _, err := tx.Exec("INSERT INTO schema_migrations (version) VALUES (?)", m.Version); err != nil {
-		tx.Rollback()
 		return fmt.Errorf("failed to record migration %d: %w", m.Version, err)
 	}

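splitSQLStatements is referenced above but is not part of this diff. A minimal
sketch of what such a splitter might look like, assuming statements are
separated by semicolons and that migration files contain no semicolons inside
string literals or trigger bodies (the real helper may handle more; needs
"strings"):

	// Hypothetical sketch; not the implementation this diff relies on.
	// Splits on semicolons and drops blanks, which is enough for simple
	// CREATE TABLE / CREATE INDEX migration files but would break on
	// semicolons embedded in string literals or CREATE TRIGGER bodies.
	func splitSQLStatements(query string) []string {
		var statements []string
		for _, stmt := range strings.Split(query, ";") {
			stmt = strings.TrimSpace(stmt)
			if stmt != "" {
				statements = append(statements, stmt)
			}
		}
		return statements
	}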
+6
pkg/appview/db/schema.sql
···
 );
 CREATE INDEX IF NOT EXISTS idx_repo_stats_daily_date ON repository_stats_daily(date DESC);

+CREATE TABLE IF NOT EXISTS jetstream_cursor (
+    id INTEGER PRIMARY KEY CHECK (id = 1),
+    cursor INTEGER NOT NULL,
+    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
 CREATE TABLE IF NOT EXISTS stars (
     starrer_did TEXT NOT NULL,
     owner_did TEXT NOT NULL,
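The helpers that read and write this table (db.GetJetstreamCursor and
db.SaveJetstreamCursor, called from worker.go below) are not shown in this
diff. A minimal sketch of what they might look like, assuming the single-row
id = 1 convention enforced by the CHECK constraint above; the real
implementations live in pkg/appview/db:

	package db

	import (
		"database/sql"
		"errors"
	)

	// Hypothetical sketch only. CHECK (id = 1) guarantees at most one row,
	// so the ON CONFLICT clause always updates that row in place.
	func SaveJetstreamCursor(database *sql.DB, cursor int64) error {
		_, err := database.Exec(`
			INSERT INTO jetstream_cursor (id, cursor, updated_at)
			VALUES (1, ?, CURRENT_TIMESTAMP)
			ON CONFLICT (id) DO UPDATE SET
				cursor = excluded.cursor,
				updated_at = excluded.updated_at`, cursor)
		return err
	}

	func GetJetstreamCursor(database *sql.DB) (int64, error) {
		var cursor int64
		err := database.QueryRow(`SELECT cursor FROM jetstream_cursor WHERE id = 1`).Scan(&cursor)
		if errors.Is(err, sql.ErrNoRows) {
			return 0, nil // nothing persisted yet; callers treat 0 as "no resume point"
		}
		return cursor, err
	}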
+41-26
pkg/appview/jetstream/backfill.go
···
 	// ProcessManifest calls ResolveHoldDID for legacy manifests.
 	b.prewarmHoldCaches(ctx, collection, allRecords)

-	// Phase 3: Process records in chunked transactions.
-	// All network I/O should be cached by now, so transactions stay fast.
-	const chunkSize = 20
-	recordCount := 0
-
-	for i := 0; i < len(allRecords); i += chunkSize {
-		end := i + chunkSize
-		if end > len(allRecords) {
-			end = len(allRecords)
-		}
-
-		tx, err := b.db.Begin()
-		if err != nil {
-			return recordCount, fmt.Errorf("failed to begin transaction: %w", err)
-		}
-
-		txProcessor := NewProcessor(tx, false, b.processor.statsCache)
-
-		for j := i; j < end; j++ {
-			if err := b.processRecordWith(ctx, txProcessor, did, collection, &allRecords[j]); err != nil {
-				slog.Warn("Backfill failed to process record", "uri", allRecords[j].URI, "error", err)
+	// Phase 3: Write records to the DB.
+	//
+	// For collections whose writes are straightforward idempotent upserts, we
+	// batch every record in the repo into one multi-row INSERT per table. This
+	// replaces the previous 20-record chunked transaction loop, which exceeded
+	// Bunny Database's remote transaction timeout (~5s) once chunks grew large
+	// and poisoned the connection pool on timeout.
+	//
+	// Collections that do network I/O per record (SailorProfile) or have
+	// conditional read-then-write logic (Scan) stay on the single-record path,
+	// where each write is its own statement and cannot hold a long transaction.
+	var recordCount int
+	var procErr error
+	switch collection {
+	case atproto.ManifestCollection:
+		recordCount, procErr = b.batchManifests(ctx, did, allRecords)
+	case atproto.TagCollection:
+		recordCount, procErr = b.batchTags(did, allRecords)
+	case atproto.StarCollection:
+		recordCount, procErr = b.batchStars(ctx, did, allRecords)
+	case atproto.RepoPageCollection:
+		recordCount, procErr = b.batchRepoPages(did, allRecords)
+	case atproto.DailyStatsCollection:
+		recordCount, procErr = b.batchDailyStats(ctx, did, allRecords)
+	case atproto.StatsCollection:
+		recordCount, procErr = b.batchStats(ctx, did, allRecords)
+	case atproto.CaptainCollection:
+		recordCount, procErr = b.batchCaptains(did, allRecords)
+	case atproto.CrewCollection:
+		recordCount, procErr = b.batchCrew(did, allRecords)
+	default:
+		// SailorProfileCollection and ScanCollection keep per-record processing
+		// because they do network I/O or conditional reads that would be awkward
+		// to batch. Each call writes a single row, so there is no long-lived
+		// transaction at risk.
+		for i := range allRecords {
+			if err := b.processRecordWith(ctx, b.processor, did, collection, &allRecords[i]); err != nil {
+				slog.Warn("Backfill failed to process record", "uri", allRecords[i].URI, "error", err)
 				continue
 			}
 			recordCount++
 		}
-
-		if err := tx.Commit(); err != nil {
-			tx.Rollback()
-			return recordCount, fmt.Errorf("failed to commit transaction: %w", err)
-		}
+	}
+	if procErr != nil {
+		return recordCount, procErr
 	}

 	// Reconciliation runs outside the transaction (involves network I/O and fewer writes)
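The batch helpers named in the switch (batchManifests, batchTags, and so on)
are not included in this hunk. As a sketch of the shape the comment describes,
one multi-row idempotent INSERT per table, here is a hypothetical batchTags;
the Backfiller receiver, the Record type's Name field, and the tags table
layout with its UNIQUE(did, uri) constraint are all assumptions, not taken
from this diff (needs "fmt" and "strings"):

	// Hypothetical sketch; the real batch helpers live elsewhere in this
	// package and their schemas may differ.
	func (b *Backfiller) batchTags(did string, records []Record) (int, error) {
		if len(records) == 0 {
			return 0, nil
		}
		var (
			sb   strings.Builder
			args []any
		)
		sb.WriteString("INSERT INTO tags (did, uri, name) VALUES ")
		for i, r := range records {
			if i > 0 {
				sb.WriteString(", ")
			}
			sb.WriteString("(?, ?, ?)")
			args = append(args, did, r.URI, r.Name)
		}
		// Idempotent upsert: re-running a backfill must not duplicate rows,
		// and the whole repo lands in a single statement, so no long-lived
		// transaction can hit the remote timeout.
		sb.WriteString(" ON CONFLICT (did, uri) DO UPDATE SET name = excluded.name")

		if _, err := b.db.Exec(sb.String(), args...); err != nil {
			return 0, fmt.Errorf("batch insert tags for %s: %w", did, err)
		}
		return len(records), nil
	}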
+145-1
pkg/appview/jetstream/worker.go
···
 	// In-memory cursor tracking for reconnects
 	lastCursor  int64
 	cursorMutex sync.RWMutex
+
+	// Cursor persistence: a single-slot channel carries the most recent
+	// cursor to a background saver goroutine. The saver writes to
+	// jetstream_cursor every tick, dropping any older value that has not
+	// yet been flushed, so the WS read loop is never blocked on DB I/O.
+	cursorSave chan int64
 }

 // NewWorker creates a new Jetstream worker
···
 		},
 		statsCache: statsCache,
 		processor:  NewProcessor(database, true, statsCache), // Use cache for live streaming
+		cursorSave: make(chan int64, 1),
 	}
 }
7986···258265 // TODO: Re-enable compression once debugging is complete
259266 _ = decoder // Keep decoder to avoid unused variable error
260267261261- if err := w.processMessage(message); err != nil {
268268+ if err := w.processMessageResilient(ctx, message); err != nil {
262269 slog.Error("ERROR processing message", "error", err)
263270 // Continue processing other messages
264271 } else {
···
 // 30 seconds to avoid missing events (events are idempotent DB upserts).
 // Cycles through all endpoints indefinitely and never gives up.
 func (w *Worker) StartWithFailover(ctx context.Context) {
+	// Bootstrap from the persisted cursor the first time we run. If the DB
+	// has a saved cursor, we resume from it (minus a small safety rewind so
+	// any gap from the previous shutdown is covered). Events are idempotent
+	// UPSERTs, so re-processing a handful is harmless.
+	if w.startCursor == 0 {
+		if cursor, err := db.GetJetstreamCursor(w.db); err != nil {
+			slog.Warn("Jetstream failed to load persisted cursor", "error", err)
+		} else if cursor > 0 {
+			const rewind = int64(30 * 1_000_000) // 30s safety rewind in time_us, the cursor's unit
+			resume := cursor - rewind
+			if resume < 0 {
+				resume = 0
+			}
+			w.cursorMutex.Lock()
+			w.startCursor = resume
+			w.lastCursor = resume
+			w.cursorMutex.Unlock()
+			slog.Info("Jetstream resuming from persisted cursor",
+				"persisted_cursor", cursor,
+				"resume_cursor", resume)
+		}
+	}
+
+	// Launch the background cursor saver. It runs for the lifetime of this
+	// StartWithFailover call and exits on ctx.Done with a final flush.
+	saverDone := make(chan struct{})
+	go w.runCursorSaver(ctx, saverDone)
+	defer func() {
+		<-saverDone
+	}()
+
 	retryDelays := []time.Duration{1 * time.Second, 5 * time.Second, 10 * time.Second}

 	for {
···
 	return w.processor
 }

+// runCursorSaver is a long-running goroutine that persists the most recent
+// Jetstream cursor to SQLite. It writes at most once every cursorSaveInterval,
+// so we never hit the DB faster than it can keep up, and it always flushes a
+// final value on shutdown so the next startup resumes from the right place.
+//
+// The goroutine intentionally uses w.db directly (not ExecResilient) because
+// the INSERT ... ON CONFLICT statement is a single round-trip that cannot
+// trigger the poisoned-tx cascade.
+func (w *Worker) runCursorSaver(ctx context.Context, done chan<- struct{}) {
+	defer close(done)
+
+	const cursorSaveInterval = 5 * time.Second
+	ticker := time.NewTicker(cursorSaveInterval)
+	defer ticker.Stop()
+
+	var pending int64
+	flush := func() {
+		if pending == 0 {
+			return
+		}
+		if err := db.SaveJetstreamCursor(w.db, pending); err != nil {
+			slog.Warn("Jetstream failed to persist cursor", "cursor", pending, "error", err)
+			return
+		}
+		pending = 0
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			flush()
+			return
+		case c := <-w.cursorSave:
+			// Keep only the newest value; the ticker decides when to flush.
+			if c > pending {
+				pending = c
+			}
+		case <-ticker.C:
+			flush()
+		}
+	}
+}
+
 // GetLastCursor returns the last processed cursor (time_us) for reconnects
 func (w *Worker) GetLastCursor() int64 {
 	w.cursorMutex.RLock()
···
 	return w.lastCursor
 }

+// processMessageResilient runs processMessage and, if the underlying DB
+// connection was poisoned by a remote tx timeout (common with Bunny Database
+// after a backfill chunk exceeded the server-side transaction limit), drains
+// the poisoned connections from the pool and retries once. A second failure
+// returns the error so the caller's Error log line fires, replacing silent
+// data loss with a loud, attributable failure.
+func (w *Worker) processMessageResilient(ctx context.Context, message []byte) error {
+	err := w.processMessage(message)
+	if err == nil || !db.IsPoisonedTxErr(err) {
+		return err
+	}
+
+	slog.Warn("Jetstream poisoned connection detected, draining pool and retrying",
+		"error", err)
+	drainPool(ctx, w.db)
+	time.Sleep(100 * time.Millisecond)
+	return w.processMessage(message)
+}
+
+// drainPool borrows each idle connection from the pool in turn and runs a
+// trivial probe. A poisoned connection fails the probe, and db.ExecResilient
+// evicts it via driver.ErrBadConn. It loops up to just past the pool's
+// open-connection limit so a single call can clear every bad conn.
+func drainPool(ctx context.Context, database *sql.DB) {
+	// MaxOpenConns is 8 (see pkg/appview/db/schema.go). We probe a couple of
+	// extra times beyond that to cycle through every conn even if some were
+	// mid-use when we started.
+	const maxProbes = 10
+	for i := 0; i < maxProbes; i++ {
+		err := db.ExecResilient(ctx, database, func(conn *sql.Conn) error {
+			_, err := conn.ExecContext(ctx, "SELECT 1")
+			return err
+		})
+		if err == nil {
+			// Got a healthy conn; no need to probe further. Any remaining
+			// poisoned conns will be evicted on their next use.
+			return
+		}
+		if ctx.Err() != nil {
+			return
+		}
+	}
+}
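db.ExecResilient and db.IsPoisonedTxErr are not part of this diff. For
context, here is a minimal sketch of the borrow-probe-evict contract drainPool
relies on, assuming ExecResilient pins one pooled connection and marks it bad
on a poisoned error so database/sql discards it rather than reusing it; the
real implementation lives in pkg/appview/db (needs "context", "database/sql",
"database/sql/driver"):

	// Hypothetical sketch of the eviction contract; not the real code.
	func ExecResilient(ctx context.Context, database *sql.DB, fn func(*sql.Conn) error) error {
		conn, err := database.Conn(ctx) // pin a single pooled connection
		if err != nil {
			return err
		}
		defer conn.Close()

		err = fn(conn)
		if err != nil && IsPoisonedTxErr(err) {
			// Returning driver.ErrBadConn from Raw tells database/sql the
			// underlying driver connection is bad, so the pool discards it
			// instead of handing it to the next caller.
			_ = conn.Raw(func(driverConn any) error {
				return driver.ErrBadConn
			})
		}
		return err
	}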
 // processMessage processes a single Jetstream event
 func (w *Worker) processMessage(message []byte) error {
 	var event JetstreamEvent
···
 	w.cursorMutex.Lock()
 	w.lastCursor = event.TimeUS
 	w.cursorMutex.Unlock()
+
+	// Offer the cursor to the async saver. Non-blocking: if the saver has
+	// not yet consumed the previous value, we drop-and-replace so the DB
+	// always converges on the freshest cursor without ever stalling the
+	// read loop.
+	if w.cursorSave != nil {
+		select {
+		case w.cursorSave <- event.TimeUS:
+		default:
+			// Drain any stale value and try once more; if that still fails,
+			// skip this send and let the saver's ticker catch up.
+			select {
+			case <-w.cursorSave:
+			default:
+			}
+			select {
+			case w.cursorSave <- event.TimeUS:
+			default:
+			}
+		}
+	}

 	// Call callback if set
 	if w.eventCallback != nil {