AppView in a box as a Vite plugin thing hatk.dev
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

docs: add JSDoc comments to backfill.ts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+96 -5
+96 -5
packages/hatk/src/backfill.ts
··· 16 16 import { emit, timer } from './logger.ts' 17 17 import type { BackfillConfig } from './config.ts' 18 18 19 + /** Options passed to {@link runBackfill}. */ 19 20 interface BackfillOpts { 21 + /** Base URL of the relay or PDS to enumerate repos from (e.g. `wss://bsky.network`). */ 20 22 pdsUrl: string 23 + /** PLC directory URL used to resolve `did:plc` identifiers (e.g. `https://plc.directory`). */ 21 24 plcUrl: string 25 + /** AT Protocol collection NSIDs to index (e.g. `app.bsky.feed.post`). */ 22 26 collections: Set<string> 27 + /** Backfill behavior settings from `config.yaml`. */ 23 28 config: BackfillConfig 24 29 } 25 30 26 - // --- DID Resolution --- 27 31 28 32 interface PdsResolution { 33 + /** The PDS service endpoint URL from the DID document. */ 29 34 pds: string 35 + /** The user's handle extracted from `alsoKnownAs`, or `null` if not present. */ 30 36 handle: string | null 31 37 } 32 38 39 + /** In-memory cache of DID → PDS resolution results to avoid redundant lookups. */ 33 40 const pdsCache = new Map<string, PdsResolution>() 34 41 let plcUrl: string 35 42 43 + /** 44 + * Resolves a DID to its PDS endpoint and handle by fetching the DID document. 45 + * 46 + * Supports both `did:web` (fetches `/.well-known/did.json`) and `did:plc` 47 + * (fetches from the PLC directory). Results are cached for the lifetime of the process. 48 + * 49 + * @example 50 + * ```ts 51 + * const { pds, handle } = await resolvePds('did:plc:abc123') 52 + * // pds = "https://puffball.us-east.host.bsky.network" 53 + * // handle = "alice.bsky.social" 54 + * ``` 55 + */ 36 56 async function resolvePds(did: string): Promise<PdsResolution> { 37 57 const cached = pdsCache.get(did) 38 58 if (cached) return cached ··· 61 81 return result 62 82 } 63 83 64 - // --- Repo Enumeration --- 65 84 85 + /** 86 + * Paginates through all active repos on a relay/PDS using `com.atproto.sync.listRepos`. 87 + * Yields `{ did, rev }` for each active repo. Skips deactivated repos. 88 + */ 66 89 async function* listRepos(pdsUrl: string): AsyncGenerator<{ did: string; rev: string }> { 67 90 let cursor: string | undefined 68 91 while (true) { ··· 79 102 } 80 103 } 81 104 105 + /** 106 + * Paginates through repos that contain records in a specific collection using 107 + * `com.atproto.sync.listReposByCollection`. More efficient than {@link listRepos} 108 + * when only a few collections are needed, since the relay can filter server-side. 109 + * 110 + * Not all relays support this endpoint — callers should fall back to {@link listRepos}. 111 + */ 82 112 async function* listReposByCollection( 83 113 pdsUrl: string, 84 114 collection: string, ··· 98 128 } 99 129 } 100 130 101 - // --- Single Repo Backfill --- 102 131 132 + /** 133 + * Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`. 134 + * 135 + * The full flow: 136 + * 1. Resolve the DID to find the user's PDS endpoint 137 + * 2. Fetch the repo as a CAR file from the PDS 138 + * 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree) 139 + * 4. Delete any existing records for this DID (so deletions are reflected) 140 + * 5. Bulk-insert all records matching the target collections 141 + * 142 + * On failure, applies exponential backoff retry logic. HTTP 4xx errors are 143 + * treated as permanent failures (repo doesn't exist or is deactivated) and 144 + * are not retried. 145 + * 146 + * @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`) 147 + * @param collections - Collection NSIDs to index; records in other collections are skipped 148 + * @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting 149 + * @returns The number of records successfully indexed 150 + * 151 + * @example 152 + * ```ts 153 + * const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30) 154 + * console.log(`Indexed ${count} records`) 155 + * ``` 156 + */ 103 157 export async function backfillRepo(did: string, collections: Set<string>, fetchTimeout: number): Promise<number> { 104 158 const elapsed = timer() 105 159 let count = 0 ··· 215 269 } 216 270 } 217 271 218 - // --- Worker Pool --- 219 272 273 + /** 274 + * Processes items concurrently with a fixed number of workers. 275 + * Workers pull from a shared index so the pool stays saturated even when 276 + * individual items complete at different speeds. Errors from `fn` are 277 + * swallowed (they're expected to be captured via structured logging). 278 + * 279 + * @param items - The work items to process 280 + * @param parallelism - Maximum number of concurrent workers 281 + * @param fn - Async function to run for each item 282 + */ 220 283 async function runWorkerPool<T>(items: T[], parallelism: number, fn: (item: T) => Promise<void>): Promise<void> { 221 284 let index = 0 222 285 ··· 235 298 await Promise.all(workers) 236 299 } 237 300 238 - // --- Main Backfill Entry Point --- 239 301 302 + /** 303 + * Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index. 304 + * 305 + * Operates in one of three modes based on config: 306 + * - **Pinned repos** — backfill only the DIDs listed in `config.repos` 307 + * - **Full network** — enumerate every active repo on the relay via `listRepos` 308 + * - **Collection signal** (default) — use `listReposByCollection` to discover repos that 309 + * contain records in the configured signal collections, falling back to `listRepos` 310 + * if the relay doesn't support collection-scoped enumeration 311 + * 312 + * After the initial pass, failed repos are retried with exponential backoff 313 + * (up to `config.maxRetries` attempts). The run emits structured log events for 314 + * monitoring via the `backfill.run` and `backfill.retry_round` event types. 315 + * 316 + * @example 317 + * ```ts 318 + * await runBackfill({ 319 + * pdsUrl: 'wss://bsky.network', 320 + * plcUrl: 'https://plc.directory', 321 + * collections: new Set(['xyz.statusphere.status']), 322 + * config: { 323 + * fullNetwork: false, 324 + * parallelism: 10, 325 + * fetchTimeout: 30, 326 + * maxRetries: 5, 327 + * }, 328 + * }) 329 + * ``` 330 + */ 240 331 export async function runBackfill(opts: BackfillOpts): Promise<void> { 241 332 const { pdsUrl, collections, config } = opts 242 333 plcUrl = opts.plcUrl