AT Protocol PLC directory counter
4
fork

Configure Feed

Select the types of activity you want to include in your feed.

Initial commit: AT Protocol PLC directory counter

Guido X Jansen 15e0671f

+729
+71
README.md
··· 1 + # AT Protocol PLC Directory Counter 2 + 3 + Count and analyze AT Protocol accounts by scanning all PDS (Personal Data Server) hosts via the PLC directory. 4 + 5 + Used to generate the stats at [sifa.id/stats](https://sifa.id/stats). 6 + 7 + ## What It Does 8 + 9 + - **Crawls** the entire PLC directory (~80M DID registrations) 10 + - **Counts** accounts per PDS provider 11 + - **Tests reachability** by resolving handles 12 + - **Exports** CSV time-series data and JSON summaries 13 + 14 + ## Quick Start 15 + 16 + ### Prerequisites 17 + 18 + - Node.js 18+ (needs native `fetch`) 19 + - `tsx` (TypeScript runner): `npm install -g tsx` 20 + 21 + ### Run the Counter 22 + 23 + ```bash 24 + # Full crawl (2-3 hours) 25 + npx tsx count-plc-dids.ts 26 + 27 + # Resume if interrupted 28 + npx tsx count-plc-dids.ts --after 2025-09-02 29 + ``` 30 + 31 + ### Run Reachability Re-checker (Optional) 32 + 33 + ```bash 34 + # Re-run with larger samples (~7 hours) 35 + npx tsx recheck-reachability.ts 36 + ``` 37 + 38 + ## Output Files 39 + 40 + | File | Description | 41 + |------|-------------| 42 + | `plc-count-results.json` | Summary stats, PDS breakdown | 43 + | `plc-count-by-pds.csv` | Per-PDS totals + reachability % | 44 + | `plc-count-by-pds-daily.csv` | Daily time-series by PDS | 45 + 46 + ## Key Findings (March 2026) 47 + 48 + | Metric | Count | 49 + |--------|-------| 50 + | Total `did:plc` | ~80M | 51 + | Reachable | ~59M (74%) | 52 + | Unreachable | ~20M (mostly spam) | 53 + | Unique PDS hosts | ~15K | 54 + 55 + ## How It Works 56 + 57 + 1. **Crawl**: Pages through `plc.directory/export` (1000 ops/page) 58 + 2. **Filter**: Counts genesis ops (`prev=null`) = new DIDs 59 + 3. **Track**: Extracts PDS endpoint from each operation 60 + 4. **Sample**: Tests 20 handles per PDS for reachability 61 + 5. 
**Export**: Writes CSVs with daily breakdowns 62 + 63 + ## Rate Limiting 64 + 65 + - Uses `plc.wtf` mirror by default (10ms delay) 66 + - Falls back to `plc.directory` (75ms delay) 67 + - Handles 429s with exponential backoff 68 + 69 + ## License 70 + 71 + MIT
+425
count-plc-dids.ts
#!/usr/bin/env npx tsx
/**
 * Count total unique did:plc DIDs in the PLC directory.
 *
 * Crawls the PLC export API page by page (1000 ops per page),
 * counts new DIDs (prev=null) and tracks every PDS endpoint individually.
 * After crawling, samples handles from each PDS to check reachability.
 *
 * Uses the plc.wtf mirror by default to avoid rate-limiting plc.directory.
 * Switches to plc.directory (with a longer delay between requests) the first
 * time the mirror answers HTTP 429.
 *
 * Usage:
 *   npx tsx count-plc-dids.ts
 *   npx tsx count-plc-dids.ts --after 2025-09-02   # resume from timestamp
 *
 * Output (full crawl):
 *   plc-count-results.json       -- summary stats
 *   plc-count-by-pds.csv         -- per-PDS totals + reachability
 *   plc-count-by-pds-daily.csv   -- daily time-series per PDS
 * Output (with --after): the same three files, named
 *   plc-count-from-YYYYMMDD.{json,csv} and plc-count-from-YYYYMMDD-daily.csv.
 *
 * Safe to interrupt (Ctrl+C): the crawl stops after the step in flight and
 * partial results are written exactly once. Press Ctrl+C twice to force quit.
 * Resume by passing --after with the lastTimestamp from the JSON output.
 */

import { writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';

const BATCH_SIZE = 1000;
const MIRROR_URL = 'https://plc.wtf/export';
const FALLBACK_URL = 'https://plc.directory/export';
// fileURLToPath (unlike URL#pathname) decodes percent-escapes and yields a
// valid native path on Windows.
const OUTPUT_DIR = fileURLToPath(new URL('./', import.meta.url));
const REQUEST_DELAY_MS = 10; // delay between requests to the mirror
const FALLBACK_DELAY_MS = 75; // gentler delay once we are hitting plc.directory
const PROGRESS_EVERY = 50; // pages
const SAVE_EVERY = 2500; // pages
const REACHABILITY_SAMPLES = 20; // DIDs to test per PDS
const EST_TOTAL_OPERATIONS = 85_000_000; // rough size of the directory, ETA only
const MAX_MALFORMED_WARNINGS = 5;
const AT_URI_PREFIX = 'at://';
const USER_AGENT = 'sifa-plc-counter/1.0 (contact: hello@sifa.id)';

interface Stats {
  totalOperations: number;
  totalCreates: number;
  totalUpdates: number;
  pdsCounts: Record<string, number>;
  /** date (YYYY-MM-DD) -> PDS -> create count */
  daily: Record<string, Record<string, number>>;
  /** PDS -> array of sample DIDs (up to REACHABILITY_SAMPLES) */
  pdsSamples: Record<string, string[]>;
  /** PDS -> { tested, reachable } (filled after crawl) */
  pdsReachability: Record<string, { tested: number; reachable: number }>;
  firstTimestamp: string | null;
  lastTimestamp: string | null;
  pagesCompleted: number;
  startedAt: string;
  completedAt: string | null;
  resumedFrom: string | null;
}

/** The fields of a PLC operation that this script reads. */
interface PlcOperation {
  prev?: string | null;
  /** Endpoint field read as a fallback when `services.atproto_pds` is absent. */
  service?: string;
  services?: { atproto_pds?: { endpoint?: string } };
}

/** The fields of one line of the export stream that this script reads. */
interface ExportRecord {
  did: string;
  createdAt: string;
  operation?: PlcOperation | null;
}

/** Non-2xx response from the export endpoint; carries the numeric status. */
class HttpError extends Error {
  readonly status: number;

  constructor(status: number, statusText: string) {
    super(`HTTP ${status}: ${statusText}`);
    this.name = 'HttpError';
    this.status = status;
  }
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Normalize PDS endpoint for grouping (strip trailing slashes). */
function normalizePds(endpoint: string): string {
  if (typeof endpoint !== 'string') return '(empty)';
  // An endpoint consisting only of slashes is treated like a missing one.
  return endpoint.replace(/\/+$/, '') || '(empty)';
}

function fmt(n: number): string {
  return n.toLocaleString('en-US');
}

/** Quote a CSV field when it contains a comma, quote or line break (RFC 4180). */
function csvField(value: string): string {
  return /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;
}

/**
 * Parse one export line. Returns null (instead of throwing) for invalid JSON
 * or for records without string `did` / `createdAt`, so one bad line cannot
 * abort a multi-hour crawl.
 */
function parseExportLine(line: string): ExportRecord | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null) return null;
  const candidate = parsed as Partial<ExportRecord>;
  if (typeof candidate.did !== 'string' || typeof candidate.createdAt !== 'string') {
    return null;
  }
  return candidate as ExportRecord;
}

/** Print a one-line crawl status plus the three largest PDS hosts so far. */
function printProgress(stats: Stats, elapsed: number): void {
  const opsPerSec = elapsed > 0 ? stats.totalOperations / (elapsed / 1000) : 0;
  const remaining = Math.max(0, EST_TOTAL_OPERATIONS - stats.totalOperations);
  const etaMin = opsPerSec > 0 ? Math.round(remaining / opsPerSec / 60) : 0;

  const top = Object.entries(stats.pdsCounts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 3)
    .map(([pds, count]) => {
      const label = pds.length > 25 ? pds.slice(0, 22) + '...' : pds;
      return `${label}: ${fmt(count)}`;
    })
    .join(' | ');

  console.log(
    `[Page ${fmt(stats.pagesCompleted)}] ` +
      `Creates: ${fmt(stats.totalCreates)} | ` +
      `PDSes: ${Object.keys(stats.pdsCounts).length} | ` +
      `${opsPerSec.toFixed(0)} ops/s | ` +
      `ETA: ~${etaMin}min | ` +
      `${stats.lastTimestamp?.slice(0, 10) ?? '?'}`,
  );
  console.log(`  Top: ${top}`);
}

/**
 * Write the JSON summary, the per-PDS CSV and the daily CSV.
 *
 * @param stats    Current crawl state.
 * @param jsonPath Destination of the JSON summary.
 * @param csvPath  Destination of the per-PDS CSV; the daily CSV is written
 *                 next to it with a `-daily.csv` suffix.
 */
async function writeResults(stats: Stats, jsonPath: string, csvPath: string): Promise<void> {
  // JSON with summary (exclude daily and pdsSamples -- too large)
  const { daily, pdsSamples, ...jsonStats } = stats;
  await writeFile(jsonPath, JSON.stringify(jsonStats, null, 2) + '\n');

  // CSV totals: one row per PDS with reachability
  const sorted = Object.entries(stats.pdsCounts).sort((a, b) => b[1] - a[1]);
  const csvLines = ['pds,creates,percentage,samples_tested,samples_reachable,reachable_pct'];
  for (const [pds, count] of sorted) {
    const pct = ((count / stats.totalCreates) * 100).toFixed(4);
    const reach = stats.pdsReachability[pds];
    const tested = reach?.tested ?? 0;
    const reachable = reach?.reachable ?? 0;
    const reachPct = tested > 0 ? ((reachable / tested) * 100).toFixed(0) : '';
    csvLines.push(`${csvField(pds)},${count},${pct},${tested},${reachable},${reachPct}`);
  }
  await writeFile(csvPath, csvLines.join('\n') + '\n');

  // CSV daily time-series: date,pds,creates
  // Only the trailing extension is rewritten, so a ".csv" elsewhere in the
  // path (e.g. in a directory name) is left alone and the daily file can
  // never collide with csvPath.
  const dailyPath = /\.csv$/.test(csvPath)
    ? csvPath.replace(/\.csv$/, '-daily.csv')
    : `${csvPath}-daily.csv`;
  const dailyLines = ['date,pds,creates'];
  for (const day of Object.keys(daily).sort()) {
    const pdsEntries = Object.entries(daily[day]).sort((a, b) => b[1] - a[1]);
    for (const [pds, count] of pdsEntries) {
      dailyLines.push(`${day},${csvField(pds)},${count}`);
    }
  }
  await writeFile(dailyPath, dailyLines.join('\n') + '\n');

  console.log(`Results: ${jsonPath}`);
  console.log(`By PDS:  ${csvPath}`);
  console.log(`Daily:   ${dailyPath}`);
}

/** Print totals, the extrapolated reachability estimate and the top 20 hosts. */
function printSummary(stats: Stats): void {
  console.log('\n' + '='.repeat(70));
  console.log('PLC DIRECTORY COUNT RESULTS');
  console.log('='.repeat(70));
  console.log(`Total operations:   ${fmt(stats.totalOperations)}`);
  console.log(`Total new DIDs:     ${fmt(stats.totalCreates)}`);
  console.log(`Total updates:      ${fmt(stats.totalUpdates)}`);
  console.log(`Unique PDS hosts:   ${Object.keys(stats.pdsCounts).length}`);
  console.log(`Time range:         ${stats.firstTimestamp?.slice(0, 19)} to ${stats.lastTimestamp?.slice(0, 19)}`);
  console.log(`Pages crawled:      ${fmt(stats.pagesCompleted)}`);

  // Reachability summary: scale each host's sampled rate up to its DID count.
  const reachEntries = Object.entries(stats.pdsReachability);
  if (reachEntries.length > 0) {
    let totalReachableDids = 0;
    let totalUnreachableDids = 0;
    for (const [pds, { tested, reachable }] of reachEntries) {
      const count = stats.pdsCounts[pds] ?? 0;
      if (tested > 0) {
        const rate = reachable / tested;
        totalReachableDids += Math.round(count * rate);
        totalUnreachableDids += Math.round(count * (1 - rate));
      }
    }
    // DIDs from untested PDS (tiny ones) -- assume reachable
    const testedPdsSet = new Set(reachEntries.map(([pds]) => pds));
    const untestedDids = Object.entries(stats.pdsCounts)
      .filter(([pds]) => !testedPdsSet.has(pds))
      .reduce((sum, [, c]) => sum + c, 0);
    totalReachableDids += untestedDids;

    console.log(`\nReachability (sampled ${REACHABILITY_SAMPLES} handles per PDS):`);
    console.log(`  Estimated reachable:   ${fmt(totalReachableDids)}`);
    console.log(`  Estimated unreachable: ${fmt(totalUnreachableDids)}`);
  }

  console.log('\nTop 20 PDS providers:');
  const sorted = Object.entries(stats.pdsCounts).sort((a, b) => b[1] - a[1]);
  for (const [pds, count] of sorted.slice(0, 20)) {
    const pct = ((count / stats.totalCreates) * 100).toFixed(2);
    const reach = stats.pdsReachability[pds];
    const reachStr = reach ? ` [${reach.reachable}/${reach.tested} reachable]` : '';
    console.log(`  ${pds.padEnd(50)} ${fmt(count).padStart(14)}  (${pct}%)${reachStr}`);
  }
  if (sorted.length > 20) {
    const rest = sorted.slice(20).reduce((sum, [, c]) => sum + c, 0);
    console.log(`  ... ${sorted.length - 20} more PDS hosts ${fmt(rest).padStart(30)} total`);
  }
  console.log('='.repeat(70));
}

/**
 * Fetch one page of the export stream as raw JSON-lines text.
 *
 * @param cursor      Value sent as the `after` query parameter.
 * @param useFallback Query plc.directory instead of the mirror.
 * @throws HttpError on any non-2xx status; fetch's own error on network
 *         failure or after the 30s timeout.
 */
async function fetchPage(cursor: string, useFallback: boolean): Promise<string> {
  const baseUrl = useFallback ? FALLBACK_URL : MIRROR_URL;
  const url = `${baseUrl}?count=${BATCH_SIZE}&after=${encodeURIComponent(cursor)}`;
  const res = await fetch(url, {
    signal: AbortSignal.timeout(30_000),
    headers: { 'User-Agent': USER_AGENT },
  });
  if (!res.ok) {
    throw new HttpError(res.status, res.statusText);
  }
  return res.text();
}

/**
 * Check if a DID's handle resolves back to that DID.
 * Reads the DID document from plc.directory, takes the handle from its first
 * `at://` alias, then asks the Bluesky public API to resolve that handle.
 *
 * Best effort by design: any failure (HTTP error, timeout, malformed
 * response) is reported as "not reachable" rather than thrown.
 */
async function checkDidReachable(did: string): Promise<boolean> {
  try {
    // Resolve DID -> get handle from DID document
    const res = await fetch(`https://plc.directory/${did}`, {
      signal: AbortSignal.timeout(10_000),
      headers: { 'User-Agent': USER_AGENT },
    });
    if (!res.ok) return false;
    const doc = (await res.json()) as { alsoKnownAs?: unknown };
    const aliases: unknown[] = Array.isArray(doc.alsoKnownAs) ? doc.alsoKnownAs : [];

    // The handle is the first at:// alias; other URI schemes may precede it.
    const atUri = aliases.find(
      (alias): alias is string => typeof alias === 'string' && alias.startsWith(AT_URI_PREFIX),
    );
    if (!atUri) return false;
    const handle = atUri.slice(AT_URI_PREFIX.length);
    if (!handle || handle.includes('/')) return false;

    // Resolve handle -> DID via Bluesky public API
    const resolveRes = await fetch(
      `https://public.api.bsky.app/xrpc/com.atproto.identity.resolveHandle?handle=${encodeURIComponent(handle)}`,
      {
        signal: AbortSignal.timeout(10_000),
        headers: { 'User-Agent': USER_AGENT },
      },
    );
    if (!resolveRes.ok) return false;
    const resolved = (await resolveRes.json()) as { did?: string };
    return resolved.did === did;
  } catch {
    return false;
  }
}

/**
 * Test the collected sample DIDs of every PDS with 10+ creates and store the
 * outcome in `stats.pdsReachability`.
 *
 * @param stats      Crawl state; `pdsSamples` is read, `pdsReachability` written.
 * @param shouldStop Polled between checks; when it returns true the function
 *                   returns early and the host in progress is not recorded.
 */
async function runReachabilityChecks(
  stats: Stats,
  shouldStop: () => boolean = () => false,
): Promise<void> {
  const pdsHosts = Object.entries(stats.pdsCounts).sort((a, b) => b[1] - a[1]);

  // Only test PDS providers with 10+ DIDs (skip tiny ones)
  const toTest = pdsHosts.filter(([, count]) => count >= 10);
  console.log(`\nTesting reachability for ${toTest.length} PDS providers (${REACHABILITY_SAMPLES} samples each)...`);

  let completed = 0;
  for (const [pds] of toTest) {
    if (shouldStop()) return;
    const samples = stats.pdsSamples[pds] ?? [];
    if (samples.length === 0) continue;

    let reachable = 0;
    for (const did of samples) {
      if (shouldStop()) return;
      if (await checkDidReachable(did)) {
        reachable++;
      }
      // Small delay between checks
      await sleep(100);
    }
    stats.pdsReachability[pds] = { tested: samples.length, reachable };
    completed++;

    if (completed % 10 === 0 || completed === toTest.length) {
      console.log(`  Checked ${completed}/${toTest.length} PDS providers...`);
    }
  }
}

/** Fold one genesis operation into the per-PDS counters, daily table and sample set. */
function recordCreate(stats: Stats, record: ExportRecord): void {
  stats.totalCreates++;
  const op: PlcOperation = record.operation ?? {};
  const raw = op.services?.atproto_pds?.endpoint ?? op.service ?? '';
  const pds = normalizePds(raw);
  const seen = (stats.pdsCounts[pds] ?? 0) + 1;
  stats.pdsCounts[pds] = seen;

  // Daily breakdown
  const day = record.createdAt.slice(0, 10); // YYYY-MM-DD
  const dayCounts = stats.daily[day] ?? (stats.daily[day] = {});
  dayCounts[pds] = (dayCounts[pds] ?? 0) + 1;

  // Uniform reservoir sample of up to REACHABILITY_SAMPLES DIDs per PDS: once
  // the reservoir is full, the n-th create replaces a random slot with
  // probability REACHABILITY_SAMPLES / n, which spreads samples across time.
  const samples = stats.pdsSamples[pds] ?? (stats.pdsSamples[pds] = []);
  if (samples.length < REACHABILITY_SAMPLES) {
    samples.push(record.did);
  } else {
    const idx = Math.floor(Math.random() * seen);
    if (idx < REACHABILITY_SAMPLES) {
      samples[idx] = record.did;
    }
  }
}

async function main(): Promise<void> {
  const afterArg = process.argv.indexOf('--after');
  let resumeFrom: string | null = null;
  if (afterArg !== -1) {
    const value = process.argv[afterArg + 1];
    // Without this check a bare `--after` silently restarted the full crawl.
    if (!value || value.startsWith('--')) {
      throw new Error('--after requires a timestamp, e.g. --after 2025-09-02');
    }
    resumeFrom = value;
  }

  let cursor: string;
  let jsonFile: string;
  let csvFile: string;
  if (resumeFrom) {
    cursor = resumeFrom.includes('T') ? resumeFrom : `${resumeFrom}T00:00:00.000Z`;
    if (Number.isNaN(Date.parse(cursor))) {
      throw new Error(`Invalid --after timestamp: ${resumeFrom}`);
    }
    const tag = cursor.slice(0, 10).replace(/-/g, '');
    jsonFile = join(OUTPUT_DIR, `plc-count-from-${tag}.json`);
    csvFile = join(OUTPUT_DIR, `plc-count-from-${tag}.csv`);
    console.log(`Resuming from: ${cursor}`);
  } else {
    cursor = '1970-01-01T00:00:00.000Z';
    jsonFile = join(OUTPUT_DIR, 'plc-count-results.json');
    csvFile = join(OUTPUT_DIR, 'plc-count-by-pds.csv');
    console.log('Starting full PLC directory crawl...');
  }
  console.log(`Output: ${jsonFile}`);
  console.log(`Using mirror: ${MIRROR_URL} (fallback: ${FALLBACK_URL})`);
  console.log(`Delay: ${REQUEST_DELAY_MS}ms between requests (${FALLBACK_DELAY_MS}ms on fallback)\n`);

  const stats: Stats = {
    totalOperations: 0,
    totalCreates: 0,
    totalUpdates: 0,
    pdsCounts: {},
    daily: {},
    pdsSamples: {},
    pdsReachability: {},
    firstTimestamp: null,
    lastTimestamp: null,
    pagesCompleted: 0,
    startedAt: new Date().toISOString(),
    completedAt: null,
    resumedFrom: resumeFrom,
  };

  const startTime = Date.now();

  // The handler only raises a flag. Writing results from inside the handler
  // raced with the main loop (which also writes once it sees the flag), and
  // the handler's process.exit() could cut the second write short.
  let interrupted = false;
  const isInterrupted = (): boolean => interrupted;
  process.on('SIGINT', () => {
    if (interrupted) process.exit(1);
    interrupted = true;
    console.log('\n\nInterrupted! Finishing the current step (Ctrl+C again to force quit)...');
  });

  let consecutiveErrors = 0;
  let useFallback = false;
  let malformedLines = 0;

  // ---- Phase 1: Crawl all operations ----
  console.log('Phase 1: Crawling PLC directory...\n');

  while (!isInterrupted()) {
    let text: string;
    try {
      text = await fetchPage(cursor, useFallback);
      consecutiveErrors = 0;
    } catch (err) {
      consecutiveErrors++;
      const message = err instanceof Error ? err.message : String(err);
      const rateLimited = err instanceof HttpError && err.status === 429;

      if (rateLimited && !useFallback) {
        console.log(`Mirror rate-limited, switching to ${FALLBACK_URL}...`);
        useFallback = true;
        consecutiveErrors = 0;
        await sleep(5000);
        continue;
      }

      if (rateLimited) {
        if (consecutiveErrors >= 10) {
          console.error('Too many rate limits, stopping. Resume with --after');
          break;
        }
        const backoff = Math.min(60_000, 2000 * Math.pow(2, consecutiveErrors));
        console.error(`429 rate-limited (attempt ${consecutiveErrors}), waiting ${backoff / 1000}s...`);
        await sleep(backoff);
        continue;
      }

      console.error(`Error (attempt ${consecutiveErrors}): ${message}`);
      if (consecutiveErrors >= 5) {
        console.error('Too many consecutive errors, stopping. Resume with --after');
        break;
      }
      await sleep(2000 * consecutiveErrors);
      continue;
    }

    const lines = text.trim().split('\n').filter(Boolean);
    if (lines.length === 0) {
      console.log('No more records. Crawl complete!');
      break;
    }

    let pageLastTimestamp: string | null = null;
    for (const line of lines) {
      const record = parseExportLine(line);
      if (!record) {
        malformedLines++;
        if (malformedLines <= MAX_MALFORMED_WARNINGS) {
          console.error(`Skipping malformed export line: ${line.slice(0, 120)}`);
        }
        continue;
      }
      stats.totalOperations++;

      if (!stats.firstTimestamp) {
        stats.firstTimestamp = record.createdAt;
      }
      stats.lastTimestamp = record.createdAt;
      pageLastTimestamp = record.createdAt;

      const prev = record.operation?.prev;
      if (prev === null || prev === undefined) {
        recordCreate(stats, record);
      } else {
        stats.totalUpdates++;
      }
    }

    // Without progress the same page would be fetched and counted forever.
    if (pageLastTimestamp === null || pageLastTimestamp === cursor) {
      console.error(`Cursor did not advance past ${cursor}, stopping. Resume with --after`);
      break;
    }
    cursor = pageLastTimestamp;
    stats.pagesCompleted++;

    if (stats.pagesCompleted % PROGRESS_EVERY === 0) {
      printProgress(stats, Date.now() - startTime);
      if (stats.pagesCompleted % SAVE_EVERY === 0) {
        await writeResults(stats, jsonFile, csvFile);
      }
    }

    await sleep(useFallback ? FALLBACK_DELAY_MS : REQUEST_DELAY_MS);
  }

  if (malformedLines > 0) {
    console.error(`Skipped ${fmt(malformedLines)} malformed export line(s).`);
  }

  // ---- Phase 2: Reachability checks ----
  if (!isInterrupted()) {
    console.log('\nPhase 2: Checking handle reachability...\n');
    await runReachabilityChecks(stats, isInterrupted);
  }

  if (isInterrupted()) {
    // completedAt stays null so the JSON shows the run was cut short.
    console.log('\nWriting partial results...');
  } else {
    stats.completedAt = new Date().toISOString();
  }
  printSummary(stats);
  await writeResults(stats, jsonFile, csvFile);
}

main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
+233
recheck-reachability.ts
#!/usr/bin/env npx tsx
/**
 * Re-run reachability checks using plc.directory/export.
 *
 * For each PDS with 10+ creates, fetches sample DIDs from plc.directory
 * and checks if their handles resolve. Sample count scales with PDS size.
 *
 * Updates plc-count-by-pds.csv and plc-count-results.json. Hosts for which
 * no fresh sample could be found keep the reachability result already stored
 * in plc-count-results.json.
 *
 * Usage: npx tsx recheck-reachability.ts
 */

import { readFileSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';

// fileURLToPath (unlike URL#pathname) decodes percent-escapes and yields a
// valid native path on Windows.
const OUTPUT_DIR = fileURLToPath(new URL('./', import.meta.url));
const MIN_CREATES = 10;
const EXPORT_DELAY_MS = 75; // respect plc.directory rate limits
const RATE_LIMIT_WAIT_MS = 10_000;
const MAX_RATE_LIMIT_RETRIES = 6; // per scan window; then move on
const AT_URI_PREFIX = 'at://';
const USER_AGENT = 'sifa-plc-counter/1.0 (contact: hello@sifa.id)';

type Reachability = Record<string, { tested: number; reachable: number }>;

/** Shape of plc-count-results.json as far as this script relies on it. */
interface StoredStats {
  pdsCounts: Record<string, number>;
  pdsReachability?: Reachability;
  [key: string]: unknown;
}

/** The fields of one export line that this script reads. */
interface ExportRecord {
  did: string;
  createdAt: string;
  operation?: {
    prev?: string | null;
    service?: string;
    services?: { atproto_pds?: { endpoint?: string } };
  } | null;
}

// Scaled to fit ~7 hour total runtime for ~2400 PDS hosts
function samplesForPds(creates: number): number {
  if (creates < 100) return 3;
  if (creates < 10_000) return 5;
  if (creates < 100_000) return 10;
  if (creates < 1_000_000) return 20;
  return 50;
}

// Limit page scanning based on PDS size -- rare PDS hosts aren't worth scanning forever
function maxPagesForPds(creates: number): number {
  if (creates < 100) return 2;
  if (creates < 1_000) return 3;
  if (creates < 10_000) return 5;
  if (creates < 100_000) return 10;
  return 20;
}

function fmt(n: number): string {
  return n.toLocaleString('en-US');
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Quote a CSV field when it contains a comma, quote or line break (RFC 4180). */
function csvField(value: string): string {
  return /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;
}

/**
 * Group key for a PDS endpoint. Must agree with the keys of `pdsCounts`
 * written by count-plc-dids.ts: trailing slashes stripped, and a missing
 * endpoint stored as '(empty)'.
 */
function normalizePds(endpoint: unknown): string {
  if (typeof endpoint !== 'string') return '(empty)';
  return endpoint.replace(/\/+$/, '') || '(empty)';
}

/** Parse one export line; null for invalid JSON or missing did/createdAt. */
function parseExportLine(line: string): ExportRecord | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null) return null;
  const candidate = parsed as Partial<ExportRecord>;
  if (typeof candidate.did !== 'string' || typeof candidate.createdAt !== 'string') {
    return null;
  }
  return candidate as ExportRecord;
}

/** Read and minimally validate the stats file produced by the counter. */
function loadStats(path: string): StoredStats {
  const parsed: unknown = JSON.parse(readFileSync(path, 'utf-8'));
  const counts = (parsed as { pdsCounts?: unknown } | null)?.pdsCounts;
  if (typeof counts !== 'object' || counts === null) {
    throw new Error(`${path} has no "pdsCounts" object -- run count-plc-dids.ts first`);
  }
  return parsed as StoredStats;
}

/**
 * Check if a DID's handle resolves back to that DID.
 * Best effort by design: any failure (HTTP error, timeout, malformed
 * response) is reported as "not reachable" rather than thrown.
 */
async function checkDidReachable(did: string): Promise<boolean> {
  try {
    const res = await fetch(`https://plc.directory/${did}`, {
      signal: AbortSignal.timeout(10_000),
      headers: { 'User-Agent': USER_AGENT },
    });
    if (!res.ok) return false;
    const doc = (await res.json()) as { alsoKnownAs?: unknown };
    const aliases: unknown[] = Array.isArray(doc.alsoKnownAs) ? doc.alsoKnownAs : [];

    // The handle is the first at:// alias; other URI schemes may precede it.
    const atUri = aliases.find(
      (alias): alias is string => typeof alias === 'string' && alias.startsWith(AT_URI_PREFIX),
    );
    if (!atUri) return false;
    const handle = atUri.slice(AT_URI_PREFIX.length);
    if (!handle || handle.includes('/')) return false;

    const resolveRes = await fetch(
      `https://public.api.bsky.app/xrpc/com.atproto.identity.resolveHandle?handle=${encodeURIComponent(handle)}`,
      {
        signal: AbortSignal.timeout(10_000),
        headers: { 'User-Agent': USER_AGENT },
      },
    );
    if (!resolveRes.ok) return false;
    const resolved = (await resolveRes.json()) as { did?: string };
    return resolved.did === did;
  } catch {
    return false;
  }
}

/**
 * Fetch sample DIDs for a specific PDS from plc.directory/export.
 * Scans forward from several fixed start dates until enough are found.
 *
 * @param pds          PDS key exactly as it appears in `pdsCounts`.
 * @param count        Number of sample DIDs wanted.
 * @param totalCreates Size of the PDS; bounds the pages scanned per start date.
 * @returns Up to `count` DIDs whose genesis operation points at `pds`
 *          (possibly none).
 */
async function fetchSamples(pds: string, count: number, totalCreates: number): Promise<string[]> {
  const dids: string[] = [];

  const cursors = [
    '2024-01-01T00:00:00.000Z',
    '2025-01-01T00:00:00.000Z',
    '2025-09-01T00:00:00.000Z',
    '2026-01-01T00:00:00.000Z',
  ];

  const maxPages = maxPagesForPds(totalCreates);

  for (const startCursor of cursors) {
    if (dids.length >= count) break;
    let cursor = startCursor;
    let pagesScanned = 0;
    let rateLimitRetries = 0;

    while (dids.length < count && pagesScanned < maxPages) {
      try {
        const res = await fetch(
          `https://plc.directory/export?count=1000&after=${encodeURIComponent(cursor)}`,
          {
            signal: AbortSignal.timeout(30_000),
            headers: { 'User-Agent': USER_AGENT },
          },
        );

        if (res.status === 429) {
          // Bounded: a persistent 429 used to spin here forever because the
          // retry never counted towards pagesScanned.
          rateLimitRetries++;
          if (rateLimitRetries > MAX_RATE_LIMIT_RETRIES) {
            console.log(`    still rate limited after ${MAX_RATE_LIMIT_RETRIES} retries, skipping window`);
            break;
          }
          console.log(`    rate limited, waiting 10s...`);
          await sleep(RATE_LIMIT_WAIT_MS);
          continue;
        }
        rateLimitRetries = 0;

        if (!res.ok) break;
        const text = await res.text();
        const lines = text.trim().split('\n').filter(Boolean);
        if (lines.length === 0) break;

        let lastTs = '';
        for (const line of lines) {
          // A malformed line is skipped instead of aborting the whole window.
          const record = parseExportLine(line);
          if (!record) continue;
          lastTs = record.createdAt;
          const op = record.operation;
          if (op?.prev !== null && op?.prev !== undefined) continue;
          const endpoint = normalizePds(op?.services?.atproto_pds?.endpoint ?? op?.service ?? '');
          if (endpoint === pds && dids.length < count) {
            dids.push(record.did);
          }
        }

        // No usable timestamp or no progress: further requests would only
        // return the same page again.
        if (!lastTs || lastTs === cursor) break;
        cursor = lastTs;
        pagesScanned++;
        await sleep(EXPORT_DELAY_MS);
      } catch {
        // Network failure or timeout: give up on this window, try the next.
        break;
      }
    }
  }

  return dids;
}

async function main(): Promise<void> {
  const statsPath = join(OUTPUT_DIR, 'plc-count-results.json');
  const stats = loadStats(statsPath);
  const pdsCounts = stats.pdsCounts;

  const pdsHosts = Object.entries(pdsCounts)
    .filter(([, count]) => count >= MIN_CREATES)
    .sort((a, b) => b[1] - a[1]);

  console.log(`Checking reachability for ${pdsHosts.length} PDS hosts\n`);

  const previous: Reachability =
    typeof stats.pdsReachability === 'object' && stats.pdsReachability !== null
      ? stats.pdsReachability
      : {};
  const fresh: Reachability = {};
  let completed = 0;
  const startTime = Date.now();

  for (const [pds, totalCreates] of pdsHosts) {
    const target = samplesForPds(totalCreates);
    console.log(`  [${completed + 1}/${pdsHosts.length}] ${pds.slice(0, 45).padEnd(45)} fetching ${target} samples... [${fmt(totalCreates)} DIDs]`);

    const samples = await fetchSamples(pds, target, totalCreates);

    if (samples.length === 0) {
      const kept = previous[pds] ? ' (keeping previous result)' : '';
      console.log(`    -> 0 samples found, skipping${kept}`);
      completed++;
      continue;
    }

    let reachable = 0;
    for (const did of samples) {
      if (await checkDidReachable(did)) {
        reachable++;
      }
      await sleep(50);
    }

    const pct = Math.round((reachable / samples.length) * 100);
    fresh[pds] = { tested: samples.length, reachable };
    completed++;

    const elapsed = (Date.now() - startTime) / 1000;
    const avgPerHost = elapsed / completed;
    const remaining = pdsHosts.length - completed;
    const etaMin = Math.round((remaining * avgPerHost) / 60);
    console.log(`    -> ${reachable}/${samples.length} (${pct}%) reachable [ETA: ~${etaMin}min]`);
  }

  // Fresh results win; hosts we could not sample this time keep what the
  // stats file already had instead of silently dropping to "untested".
  const reachability: Reachability = { ...previous, ...fresh };

  // Update stats
  stats.pdsReachability = reachability;

  // Write updated JSON (exclude bulky fields)
  const { daily: _d, pdsSamples: _s, ...jsonStats } = stats;
  writeFileSync(statsPath, JSON.stringify(jsonStats, null, 2) + '\n');

  // Regenerate CSV
  const csvPath = join(OUTPUT_DIR, 'plc-count-by-pds.csv');
  const sorted = Object.entries(pdsCounts).sort((a, b) => b[1] - a[1]);
  const totalCreatesAll = Object.values(pdsCounts).reduce((sum, c) => sum + c, 0);
  const csvLines = ['pds,creates,percentage,samples_tested,samples_reachable,reachable_pct'];
  for (const [pds, count] of sorted) {
    const pct = ((count / totalCreatesAll) * 100).toFixed(4);
    const reach = reachability[pds];
    const tested = reach?.tested ?? 0;
    const reachableCount = reach?.reachable ?? 0;
    const reachPct = tested > 0 ? Math.round((reachableCount / tested) * 100).toString() : '';
    csvLines.push(`${csvField(pds)},${count},${pct},${tested},${reachableCount},${reachPct}`);
  }
  writeFileSync(csvPath, csvLines.join('\n') + '\n');

  // Summary: scale each host's sampled rate up to its DID count.
  console.log('\n' + '='.repeat(60));
  let totalReachable = 0;
  let totalUnreachable = 0;
  for (const [pds, reach] of Object.entries(reachability)) {
    const count = pdsCounts[pds] ?? 0;
    if (reach.tested > 0) {
      const rate = reach.reachable / reach.tested;
      totalReachable += Math.round(count * rate);
      totalUnreachable += Math.round(count * (1 - rate));
    }
  }
  // Hosts without any result are assumed reachable.
  const testedSet = new Set(Object.keys(reachability));
  const untested = Object.entries(pdsCounts)
    .filter(([pds]) => !testedSet.has(pds))
    .reduce((sum, [, c]) => sum + c, 0);
  totalReachable += untested;

  console.log(`Estimated reachable:   ${fmt(totalReachable)}`);
  console.log(`Estimated unreachable: ${fmt(totalUnreachable)}`);
  console.log(`Untested (tiny PDS):   ${fmt(untested)}`);
  console.log('='.repeat(60));
  console.log(`\nUpdated: ${statsPath}`);
  console.log(`Updated: ${csvPath}`);
}

main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});