A container registry that uses the AT Protocol for manifest storage and S3 for blob storage. atcr.io
docker container atproto go

improve backfill and jetstream db connections

+205 -29
pkg/appview/db/schema.go (+13 -2)
···
 		return nil, err
 	}

+	// Bound the connection pool. With a remote target (Bunny Database), each
+	// idle conn is a stable libsql stream — keeping a handful warm avoids
+	// reconnect cost, capping the total prevents runaway contention. Short
+	// lifetimes ensure we recycle past any idle-side disconnects and drop any
+	// poisoned conn that survived IsPoisonedTxErr eviction.
+	db.SetMaxOpenConns(8)
+	db.SetMaxIdleConns(4)
+	db.SetConnMaxLifetime(5 * time.Minute)
+	db.SetConnMaxIdleTime(2 * time.Minute)
+
 	// Check if this is an existing database with migrations applied
 	isExisting, err := hasAppliedMigrations(db)
 	if err != nil {
···
 	if err != nil {
 		return fmt.Errorf("failed to begin transaction for migration %d: %w", m.Version, err)
 	}
+	// Deferred rollback is a no-op once Commit succeeds; it guards against
+	// panics and any early return that forgets an explicit rollback.
+	defer func() { _ = tx.Rollback() }()

 	// Split query into individual statements and execute each
 	// go-sqlite3's Exec() doesn't reliably execute all statements in multi-statement queries
 	statements := splitSQLStatements(m.Query)
 	for i, stmt := range statements {
 		if _, err := tx.Exec(stmt); err != nil {
-			tx.Rollback()
 			return fmt.Errorf("failed to apply migration %d (%s) statement %d: %w", m.Version, m.Name, i+1, err)
 		}
 	}

 	// Record migration
 	if _, err := tx.Exec("INSERT INTO schema_migrations (version) VALUES (?)", m.Version); err != nil {
-		tx.Rollback()
 		return fmt.Errorf("failed to record migration %d: %w", m.Version, err)
 	}
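The pool comment above leans on IsPoisonedTxErr, which this commit references but never defines. A minimal sketch of what such a check could look like, assuming the libsql driver reports server-side transaction timeouts in its error text (both matched substrings below are assumptions, not confirmed driver output):

package db

import "strings"

// IsPoisonedTxErr reports whether err looks like a connection poisoned by a
// remote transaction timeout. Hypothetical sketch; the real implementation
// is not part of this diff, and the matched substrings are assumptions.
func IsPoisonedTxErr(err error) bool {
	if err == nil {
		return false
	}
	msg := err.Error()
	return strings.Contains(msg, "interactive transaction timed out") ||
		strings.Contains(msg, "stream not found")
}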
pkg/appview/db/schema.sql (+6)
···
 );
 CREATE INDEX IF NOT EXISTS idx_repo_stats_daily_date ON repository_stats_daily(date DESC);

+CREATE TABLE IF NOT EXISTS jetstream_cursor (
+    id INTEGER PRIMARY KEY CHECK (id = 1),
+    cursor INTEGER NOT NULL,
+    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
 CREATE TABLE IF NOT EXISTS stars (
     starrer_did TEXT NOT NULL,
     owner_did TEXT NOT NULL,
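The worker diff below resumes from and persists to this table via db.GetJetstreamCursor and db.SaveJetstreamCursor, neither of which is shown in this commit. A plausible sketch of the pair against this schema, assuming plain database/sql (the names mirror the call sites; the bodies are guesses):

package db

import (
	"database/sql"
	"errors"
)

// SaveJetstreamCursor upserts the single jetstream_cursor row (id = 1).
func SaveJetstreamCursor(database *sql.DB, cursor int64) error {
	_, err := database.Exec(`
		INSERT INTO jetstream_cursor (id, cursor, updated_at)
		VALUES (1, ?, CURRENT_TIMESTAMP)
		ON CONFLICT (id) DO UPDATE SET
			cursor = excluded.cursor,
			updated_at = excluded.updated_at`, cursor)
	return err
}

// GetJetstreamCursor returns the persisted cursor, or 0 if none is saved yet.
func GetJetstreamCursor(database *sql.DB) (int64, error) {
	var cursor int64
	err := database.QueryRow(`SELECT cursor FROM jetstream_cursor WHERE id = 1`).Scan(&cursor)
	if errors.Is(err, sql.ErrNoRows) {
		return 0, nil
	}
	return cursor, err
}

The CHECK (id = 1) constraint pins the table to exactly one row, so the upsert always converges on the freshest cursor.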
pkg/appview/jetstream/backfill.go (+41 -26)
···
 	// ProcessManifest calls ResolveHoldDID for legacy manifests.
 	b.prewarmHoldCaches(ctx, collection, allRecords)

-	// Phase 3: Process records in chunked transactions.
-	// All network I/O should be cached by now, so transactions stay fast.
-	const chunkSize = 20
-	recordCount := 0
-
-	for i := 0; i < len(allRecords); i += chunkSize {
-		end := i + chunkSize
-		if end > len(allRecords) {
-			end = len(allRecords)
-		}
-
-		tx, err := b.db.Begin()
-		if err != nil {
-			return recordCount, fmt.Errorf("failed to begin transaction: %w", err)
-		}
-
-		txProcessor := NewProcessor(tx, false, b.processor.statsCache)
-
-		for j := i; j < end; j++ {
-			if err := b.processRecordWith(ctx, txProcessor, did, collection, &allRecords[j]); err != nil {
-				slog.Warn("Backfill failed to process record", "uri", allRecords[j].URI, "error", err)
+	// Phase 3: Write records to the DB.
+	//
+	// For collections whose writes are straightforward idempotent upserts, we
+	// batch every record in the repo into one multi-row INSERT per table. This
+	// replaces the previous 20-record chunked transaction loop, which exceeded
+	// Bunny Database's remote transaction timeout (~5s) once chunks grew large
+	// and poisoned the connection pool on timeout.
+	//
+	// Collections that do network I/O per record (SailorProfile) or have
+	// conditional read-then-write logic (Scan) stay on the single-record path
+	// where each write is its own statement and cannot hold a long transaction.
+	var recordCount int
+	var procErr error
+	switch collection {
+	case atproto.ManifestCollection:
+		recordCount, procErr = b.batchManifests(ctx, did, allRecords)
+	case atproto.TagCollection:
+		recordCount, procErr = b.batchTags(did, allRecords)
+	case atproto.StarCollection:
+		recordCount, procErr = b.batchStars(ctx, did, allRecords)
+	case atproto.RepoPageCollection:
+		recordCount, procErr = b.batchRepoPages(did, allRecords)
+	case atproto.DailyStatsCollection:
+		recordCount, procErr = b.batchDailyStats(ctx, did, allRecords)
+	case atproto.StatsCollection:
+		recordCount, procErr = b.batchStats(ctx, did, allRecords)
+	case atproto.CaptainCollection:
+		recordCount, procErr = b.batchCaptains(did, allRecords)
+	case atproto.CrewCollection:
+		recordCount, procErr = b.batchCrew(did, allRecords)
+	default:
+		// SailorProfileCollection and ScanCollection keep per-record processing
+		// because they do network I/O or conditional reads that would be awkward
+		// to batch. Each call writes a single row, so there is no long-lived
+		// transaction at risk.
+		for i := range allRecords {
+			if err := b.processRecordWith(ctx, b.processor, did, collection, &allRecords[i]); err != nil {
+				slog.Warn("Backfill failed to process record", "uri", allRecords[i].URI, "error", err)
 				continue
 			}
 			recordCount++
 		}
-
-		if err := tx.Commit(); err != nil {
-			tx.Rollback()
-			return recordCount, fmt.Errorf("failed to commit transaction: %w", err)
-		}
+	}
+	if procErr != nil {
+		return recordCount, procErr
 	}

 	// Reconciliation runs outside the transaction (involves network I/O and fewer writes)
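None of the batch helpers named in the switch appear in this hunk. As an illustration of the "one multi-row INSERT per table" shape the comment describes, here is a hypothetical batchTags; the tags table, its columns, the Record fields, and the Backfiller receiver type are all assumptions made up for this sketch:

// Hypothetical sketch; the real batchTags and its schema are not in this
// diff. Assumes `import "strings"` and that Record carries Name and URI.
func (b *Backfiller) batchTags(did string, records []Record) (int, error) {
	if len(records) == 0 {
		return 0, nil
	}
	placeholders := make([]string, 0, len(records))
	args := make([]any, 0, len(records)*3)
	for i := range records {
		placeholders = append(placeholders, "(?, ?, ?)")
		args = append(args, did, records[i].Name, records[i].URI)
	}
	// One statement, one round-trip: nothing holds a remote transaction
	// long enough to trip the ~5s server-side timeout.
	query := "INSERT INTO tags (did, name, uri) VALUES " +
		strings.Join(placeholders, ", ") +
		" ON CONFLICT (did, name) DO UPDATE SET uri = excluded.uri"
	if _, err := b.db.Exec(query, args...); err != nil {
		return 0, err
	}
	return len(records), nil
}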
pkg/appview/jetstream/worker.go (+145 -1)
···
 	// In-memory cursor tracking for reconnects
 	lastCursor  int64
 	cursorMutex sync.RWMutex
+
+	// Cursor persistence: a single-slot channel carries the most recent
+	// cursor to a background saver goroutine. The saver writes to
+	// jetstream_cursor every tick, dropping any older value that has not
+	// yet been flushed so the WS read loop is never blocked on DB I/O.
+	cursorSave chan int64
 }

 // NewWorker creates a new Jetstream worker
···
 		},
 		statsCache: statsCache,
 		processor:  NewProcessor(database, true, statsCache), // Use cache for live streaming
+		cursorSave: make(chan int64, 1),
 	}
 }

···
 	// TODO: Re-enable compression once debugging is complete
 	_ = decoder // Keep decoder to avoid unused variable error

-	if err := w.processMessage(message); err != nil {
+	if err := w.processMessageResilient(ctx, message); err != nil {
 		slog.Error("ERROR processing message", "error", err)
 		// Continue processing other messages
 	} else {
···
 // 30 seconds to avoid missing events (events are idempotent DB upserts).
 // Cycles through all endpoints indefinitely and never gives up.
 func (w *Worker) StartWithFailover(ctx context.Context) {
+	// Bootstrap from the persisted cursor the first time we run. If the DB
+	// has a saved cursor we resume from it (minus a small safety rewind so
+	// any gap from the previous shutdown is covered). Events are idempotent
+	// UPSERTs, so re-processing a handful is harmless.
+	if w.startCursor == 0 {
+		if cursor, err := db.GetJetstreamCursor(w.db); err != nil {
+			slog.Warn("Jetstream failed to load persisted cursor", "error", err)
+		} else if cursor > 0 {
+			const rewind = int64(30 * 1_000_000) // 30s safety rewind, same units as cursor
+			resume := cursor - rewind
+			if resume < 0 {
+				resume = 0
+			}
+			w.cursorMutex.Lock()
+			w.startCursor = resume
+			w.lastCursor = resume
+			w.cursorMutex.Unlock()
+			slog.Info("Jetstream resuming from persisted cursor",
+				"persisted_cursor", cursor,
+				"resume_cursor", resume)
+		}
+	}
+
+	// Launch the background cursor saver. It runs for the lifetime of this
+	// Start call and exits on ctx.Done with a final flush.
+	saverDone := make(chan struct{})
+	go w.runCursorSaver(ctx, saverDone)
+	defer func() {
+		<-saverDone
+	}()
+
 	retryDelays := []time.Duration{1 * time.Second, 5 * time.Second, 10 * time.Second}

 	for {
···
 	return w.processor
 }

+// runCursorSaver is a long-running goroutine that persists the most recent
+// Jetstream cursor to SQLite. It writes at most once every cursorSaveInterval
+// so we never hit the DB faster than it can keep up, and always flushes a
+// final value on shutdown so the next Start resumes from the right place.
+//
+// The goroutine intentionally uses w.db directly (not ExecResilient) because
+// the INSERT ... ON CONFLICT statement is a single round-trip that cannot
+// trigger the poisoned-tx cascade.
+func (w *Worker) runCursorSaver(ctx context.Context, done chan<- struct{}) {
+	defer close(done)
+
+	const cursorSaveInterval = 5 * time.Second
+	ticker := time.NewTicker(cursorSaveInterval)
+	defer ticker.Stop()
+
+	var pending int64
+	flush := func() {
+		if pending == 0 {
+			return
+		}
+		if err := db.SaveJetstreamCursor(w.db, pending); err != nil {
+			slog.Warn("Jetstream failed to persist cursor", "cursor", pending, "error", err)
+			return
+		}
+		pending = 0
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			flush()
+			return
+		case c := <-w.cursorSave:
+			// Keep only the newest value; the ticker decides when to flush.
+			if c > pending {
+				pending = c
+			}
+		case <-ticker.C:
+			flush()
+		}
+	}
+}
+
 // GetLastCursor returns the last processed cursor (time_us) for reconnects
 func (w *Worker) GetLastCursor() int64 {
 	w.cursorMutex.RLock()
···
 	return w.lastCursor
 }

+// processMessageResilient runs processMessage and, if the underlying DB
+// connection was poisoned by a remote tx timeout (common with Bunny Database
+// after a backfill chunk exceeded the server-side transaction limit), drains
+// the poisoned connections from the pool and retries once. A second failure
+// returns the error so the caller's Error log line fires — replacing silent
+// data loss with a loud, attributable one.
+func (w *Worker) processMessageResilient(ctx context.Context, message []byte) error {
+	err := w.processMessage(message)
+	if err == nil || !db.IsPoisonedTxErr(err) {
+		return err
+	}
+
+	slog.Warn("Jetstream poisoned connection detected, draining pool and retrying",
+		"error", err)
+	drainPool(ctx, w.db)
+	time.Sleep(100 * time.Millisecond)
+	return w.processMessage(message)
+}
+
+// drainPool borrows each idle connection from the pool in turn and runs a
+// trivial probe. A poisoned connection fails the probe, and db.ExecResilient
+// evicts it via driver.ErrBadConn. Loops past the pool's open-connection
+// limit so a single call can clear every bad conn.
+func drainPool(ctx context.Context, database *sql.DB) {
+	// MaxOpenConns is 8 (see pkg/appview/db/schema.go). We probe a couple
+	// more times than that to ensure we cycle through every conn even if
+	// some were mid-use.
+	const maxProbes = 10
+	for i := 0; i < maxProbes; i++ {
+		err := db.ExecResilient(ctx, database, func(conn *sql.Conn) error {
+			_, err := conn.ExecContext(ctx, "SELECT 1")
+			return err
+		})
+		if err == nil {
+			// Got a healthy conn; no need to probe further — any remaining
+			// poisoned conns will be evicted on their next use.
+			return
+		}
+		if ctx.Err() != nil {
+			return
+		}
+	}
+}
+
 // processMessage processes a single Jetstream event
 func (w *Worker) processMessage(message []byte) error {
 	var event JetstreamEvent
···
 	w.cursorMutex.Lock()
 	w.lastCursor = event.TimeUS
 	w.cursorMutex.Unlock()
+
+	// Offer the cursor to the async saver. Non-blocking: if the saver is
+	// still writing the previous value, we drop-and-replace so the DB always
+	// converges on the freshest cursor without ever stalling the read loop.
+	if w.cursorSave != nil {
+		select {
+		case w.cursorSave <- event.TimeUS:
+		default:
+			// Drain any stale value and try once more — if that still fails
+			// we just skip this send; the next event will offer a newer cursor.
+			select {
+			case <-w.cursorSave:
+			default:
+			}
+			select {
+			case w.cursorSave <- event.TimeUS:
+			default:
+			}
+		}
+	}

 	// Call callback if set
 	if w.eventCallback != nil {
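drainPool assumes a specific contract from db.ExecResilient: run the probe on a borrowed connection and, when the failure matches the poisoned-tx signature, surface driver.ErrBadConn to database/sql so the pool closes that connection instead of recycling it. ExecResilient is not shown in this commit; a sketch under that assumption:

package db

import (
	"context"
	"database/sql"
	"database/sql/driver"
)

// ExecResilient borrows one connection, runs fn, and evicts the connection
// from the pool when fn fails with a poisoned-transaction error.
// Hypothetical sketch; the real implementation is not part of this diff.
func ExecResilient(ctx context.Context, database *sql.DB, fn func(conn *sql.Conn) error) error {
	conn, err := database.Conn(ctx)
	if err != nil {
		return err
	}
	defer conn.Close()

	err = fn(conn)
	if err != nil && IsPoisonedTxErr(err) {
		// Returning driver.ErrBadConn from Raw marks the underlying driver
		// connection as bad, so database/sql closes it rather than putting
		// it back in the pool.
		_ = conn.Raw(func(driverConn any) error { return driver.ErrBadConn })
	}
	return err
}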