AT Protocol PLC directory counter
4
fork

Configure Feed

Select the types of activity you want to include in your feed.

add check-reachability.ts for NAS workflow and update README

- Add check-reachability.ts: reads/writes plc-count-by-pds.csv directly
(NAS production workflow) rather than requiring plc-count-results.json
- Update key findings to April 2026 (~87M DIDs, ~16K PDS hosts)
- Add methodology link to sifa.id/stats#methodology
- Note Backlinko discrepancy: PLC total vs reported active users
- Document all three scripts in a comparison table

+303 -15
+29 -15
README.md
··· 2 2 3 3 Count and analyze AT Protocol accounts by scanning all PDS (Personal Data Store) servers via the PLC directory. 4 4 5 - Used to generate the stats at [sifa.id/stats](https://sifa.id/stats). 5 + Used to generate the stats at [sifa.id/stats](https://sifa.id/stats). See the [methodology](https://sifa.id/stats#methodology) for a full explanation of what is and isn't counted. 6 6 7 7 ## What It Does 8 8 9 - - **Crawls** the entire PLC directory (~80M DID registrations) 9 + - **Crawls** the entire PLC directory (~87M DID registrations as of April 2026) 10 10 - **Counts** accounts per PDS provider 11 - - **Tests reachability** by resolving handles 12 - - **Exports** CSV time-series data and JSON summaries 11 + - **Tests reachability** by sampling handles and resolving them via the AT Protocol identity system 12 + - **Exports** CSV time-series data for daily and all-time views 13 + 14 + ## Scripts 15 + 16 + | Script | Purpose | Runtime | 17 + |--------|---------|---------| 18 + | `count-plc-dids.ts` | Full crawl of plc.directory, builds all CSVs | 2–3 hours | 19 + | `recheck-reachability.ts` | Reachability scan using `plc-count-results.json` (standalone workflow) | ~7 hours | 20 + | `check-reachability.ts` | Reachability scan reading/writing the NAS CSV format (production workflow) | ~7 hours | 13 21 14 22 ## Quick Start 15 23 ··· 28 36 npx tsx count-plc-dids.ts --after 2025-09-02 29 37 ``` 30 38 31 - ### Run Reachability Re-checker (Optional) 39 + ### Run Reachability Check (Optional, ~7 hours) 32 40 33 41 ```bash 34 - # Re-run with larger samples (~7 hours) 42 + # Standalone workflow (reads plc-count-results.json) 35 43 npx tsx recheck-reachability.ts 44 + 45 + # NAS/production workflow (reads/writes plc-count-by-pds.csv directly) 46 + npx tsx check-reachability.ts 36 47 ``` 37 48 38 49 ## Output Files ··· 42 53 | `plc-count-results.json` | Summary stats, PDS breakdown | 43 54 | `plc-count-by-pds.csv` | Per-PDS totals + reachability % | 44 55 | 
`plc-count-by-pds-daily.csv` | Daily time-series by PDS | 56 + | `data/plc-reachability-through.txt` | Date of last reachability scan (NAS workflow) | 45 57 46 - ## Key Findings (March 2026) 58 + ## Key Findings (April 2026) 47 59 48 60 | Metric | Count | 49 61 |--------|-------| 50 - | Total `did:plc` | ~80M | 51 - | Reachable | ~59M (74%) | 52 - | Unreachable | ~20M (mostly spam) | 53 - | Unique PDS hosts | ~15K | 62 + | Total `did:plc` | ~87M | 63 + | Unique PDS hosts | ~16K | 64 + | Bluesky (bsky.social + *.bsky.network) | ~61M | 65 + | Self-hosted / other | ~23M (dominated by ~22M pds.trump.com bot accounts) | 66 + 67 + Note: our total is higher than Bluesky's reported active user count (~33–36M) because we count every identity ever registered in the PLC directory, including bots, spam, and deleted/deactivated accounts. See the [methodology](https://sifa.id/stats#methodology) for details. 54 68 55 69 ## How It Works 56 70 57 71 1. **Crawl**: Pages through `plc.directory/export` (1000 ops/page) 58 72 2. **Filter**: Counts genesis ops (`prev=null`) = new DIDs 59 73 3. **Track**: Extracts PDS endpoint from each operation 60 - 4. **Sample**: Tests 20 handles per PDS for reachability 74 + 4. **Sample**: Tests 3–50 handles per PDS (scaled to host size) for reachability 61 75 5. **Export**: Writes CSVs with daily breakdowns 62 76 63 77 ## Rate Limiting 64 78 65 - - Uses `plc.wtf` mirror by default (10ms delay) 66 - - Falls back to `plc.directory` (75ms delay) 67 - - Handles 429s with exponential backoff 79 + - 75ms delay between pages 80 + - Handles 429s with 10s backoff 81 + - Reachability checks: 50ms between handle resolves 68 82 69 83 ## License 70 84
+274
check-reachability.ts
#!/usr/bin/env npx tsx
/**
 * check-reachability.ts
 *
 * For each PDS with 10+ creates in plc-count-by-pds.csv, samples DIDs from
 * plc.directory and checks if their handles resolve. Updates the
 * samples_tested / samples_reachable / reachable_pct columns in the CSV and
 * writes data/plc-reachability-through.txt with today's date.
 *
 * Takes ~7 hours for ~2400 PDS hosts. Run manually; not part of nightly crawl.
 *
 * Usage: npx tsx scripts/check-reachability.ts
 */

import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = resolve(__dirname, '..');
const CSV_PATH = resolve(ROOT, 'data', 'plc-count-by-pds.csv');
const THROUGH_PATH = resolve(ROOT, 'data', 'plc-reachability-through.txt');

/** Hosts with fewer genesis operations than this are not checked. */
const MIN_CREATES = 10;
/** The CSV is rewritten after every this-many processed hosts. */
const CHECKPOINT_EVERY = 50;
/** Wait applied after an HTTP 429 before the request is retried. */
const RATE_LIMIT_WAIT_MS = 10_000;
/** Upper bound on 429 retries per request, so a persistent limit cannot hang the run. */
const MAX_RATE_LIMIT_RETRIES = 5;
/** Headers sent with every outgoing request. */
const REQUEST_HEADERS = { 'User-Agent': 'sifa-plc-counter/1.0 (contact: hello@sifa.id)' };

/** Number of DIDs to sample for a host with `creates` registrations. */
function samplesForPds(creates: number): number {
  if (creates < 100) return 3;
  if (creates < 10_000) return 5;
  if (creates < 100_000) return 10;
  if (creates < 1_000_000) return 20;
  return 50;
}

/** Maximum number of export pages to scan per start cursor for a host of this size. */
function maxPagesForPds(creates: number): number {
  if (creates < 100) return 2;
  if (creates < 1_000) return 3;
  if (creates < 10_000) return 5;
  if (creates < 100_000) return 10;
  return 20;
}

/** Formats a number with en-US thousands separators for log output. */
function fmt(n: number): string {
  return n.toLocaleString('en-US');
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/** One data row of plc-count-by-pds.csv. */
interface CsvRow {
  pds: string;
  creates: number;
  percentage: string;
  samplesTested: number;
  samplesReachable: number;
  reachablePct: string;
}

/** Sampling outcome for one PDS host. */
interface ReachabilityResult {
  tested: number;
  reachable: number;
}

/** Parses a base-10 integer cell; blank or non-numeric cells become 0 instead of NaN. */
function toInt(raw: string | undefined): number {
  const value = Number.parseInt(raw?.trim() ?? '', 10);
  return Number.isFinite(value) ? value : 0;
}

/**
 * Splits one CSV line into fields. Only the first field (the PDS URL) may be
 * double-quoted; a line with an opening quote but no closing quote falls back
 * to a plain comma split rather than slicing at index -1.
 */
function splitCsvLine(line: string): string[] {
  if (line.startsWith('"')) {
    const closeQuote = line.indexOf('"', 1);
    if (closeQuote !== -1) {
      // +2 skips the closing quote and the comma that follows it.
      return [line.slice(1, closeQuote), ...line.slice(closeQuote + 2).split(',')];
    }
  }
  return line.split(',');
}

/**
 * Reads the per-PDS CSV at `path`, skipping the header line, blank lines and
 * rows without a PDS value.
 */
function parseCsv(path: string): CsvRow[] {
  const lines = readFileSync(path, 'utf-8').split('\n');
  const rows: CsvRow[] = [];
  for (const line of lines.slice(1)) {
    if (!line.trim()) continue;
    const fields = splitCsvLine(line);
    rows.push({
      pds: fields[0]?.trim() ?? '',
      creates: toInt(fields[1]),
      percentage: fields[2]?.trim() ?? '',
      samplesTested: toInt(fields[3]),
      samplesReachable: toInt(fields[4]),
      reachablePct: fields[5]?.trim() ?? '',
    });
  }
  return rows.filter((r) => r.pds);
}

/**
 * Writes `rows` to `path` using the fixed six-column header. The sample
 * columns are left blank for hosts that were never sampled; for sampled hosts
 * both counts are written, including a reachable count of 0.
 */
function writeCsv(path: string, rows: CsvRow[]): void {
  const lines = ['pds,creates,percentage,samples_tested,samples_reachable,reachable_pct'];
  for (const r of rows) {
    const escapedPds = r.pds.includes(',') ? `"${r.pds}"` : r.pds;
    const sampled = r.samplesTested > 0;
    const tested = sampled ? String(r.samplesTested) : '';
    const reachable = sampled ? String(r.samplesReachable) : '';
    lines.push(
      `${escapedPds},${r.creates},${r.percentage},${tested},${reachable},${r.reachablePct}`,
    );
  }
  writeFileSync(path, lines.join('\n') + '\n');
}

/**
 * GETs `url` with the shared headers and a per-attempt timeout. An HTTP 429 is
 * retried after a fixed wait, at most MAX_RATE_LIMIT_RETRIES times; the last
 * response is returned as-is so callers still see a persistent 429.
 */
async function fetchWithRateLimitRetry(url: string, timeoutMs: number): Promise<Response> {
  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      signal: AbortSignal.timeout(timeoutMs),
      headers: REQUEST_HEADERS,
    });
    if (res.status !== 429 || attempt >= MAX_RATE_LIMIT_RETRIES) return res;
    console.log(' rate limited, waiting 10s...');
    await sleep(RATE_LIMIT_WAIT_MS);
  }
}

/**
 * Returns true when the DID document's handle resolves back to the same DID.
 *
 * The handle is taken from the first `at://` entry of `alsoKnownAs`. Any
 * network error, timeout, non-2xx response or malformed body counts as
 * unreachable; this check is deliberately best-effort.
 */
async function checkDidReachable(did: string): Promise<boolean> {
  try {
    const res = await fetchWithRateLimitRetry(`https://plc.directory/${did}`, 10_000);
    if (!res.ok) return false;
    const doc = (await res.json()) as { alsoKnownAs?: string[] };
    const handleUri = (doc.alsoKnownAs ?? []).find((uri) => uri.startsWith('at://'));
    if (!handleUri) return false;

    const handle = handleUri.slice('at://'.length);
    if (!handle || handle.includes('/')) return false;

    const resolveRes = await fetchWithRateLimitRetry(
      `https://public.api.bsky.app/xrpc/com.atproto.identity.resolveHandle?handle=${encodeURIComponent(handle)}`,
      10_000,
    );
    if (!resolveRes.ok) return false;
    const resolved = (await resolveRes.json()) as { did?: string };
    return resolved.did === did;
  } catch {
    return false;
  }
}

/**
 * Collects up to `count` DIDs whose genesis operation points at `pds`.
 *
 * Scans the plc.directory export from several fixed start dates, reading at
 * most `maxPagesForPds(totalCreates)` pages of 1000 operations per start date.
 * May return fewer than `count` DIDs, or none.
 */
async function fetchSamples(pds: string, count: number, totalCreates: number): Promise<string[]> {
  const dids: string[] = [];
  const cursors = [
    '2024-01-01T00:00:00.000Z',
    '2025-01-01T00:00:00.000Z',
    '2025-09-01T00:00:00.000Z',
    '2026-01-01T00:00:00.000Z',
  ];
  const maxPages = maxPagesForPds(totalCreates);

  for (const startCursor of cursors) {
    if (dids.length >= count) break;
    let cursor = startCursor;
    let pagesScanned = 0;

    while (dids.length < count && pagesScanned < maxPages) {
      try {
        const res = await fetchWithRateLimitRetry(
          `https://plc.directory/export?count=1000&after=${encodeURIComponent(cursor)}`,
          30_000,
        );
        // Includes a 429 that survived every retry: give up on this start date.
        if (!res.ok) break;
        const text = await res.text();
        const lines = text.trim().split('\n').filter(Boolean);
        if (lines.length === 0) break;

        let lastTs = '';
        for (const line of lines) {
          const record = JSON.parse(line) as {
            did: string;
            createdAt: string;
            operation?: {
              prev?: unknown;
              services?: { atproto_pds?: { endpoint?: string } };
              service?: string;
            };
          };
          lastTs = record.createdAt;
          const op = record.operation ?? {};
          // Only genesis operations (no previous op) represent a new DID.
          if (op.prev != null) continue;
          const endpoint = (op.services?.atproto_pds?.endpoint ?? op.service ?? '').replace(
            /\/+$/,
            '',
          );
          if (endpoint === pds && dids.length < count && !dids.includes(record.did)) {
            dids.push(record.did);
          }
        }

        // A cursor that does not advance would request the same page again.
        if (!lastTs || lastTs === cursor) break;
        cursor = lastTs;
        pagesScanned++;
        await sleep(75);
      } catch {
        break;
      }
    }
  }
  return dids;
}

/**
 * Returns a copy of `rows` in which every host present in `results` has its
 * three reachability columns replaced; other rows are returned unchanged.
 */
function applyReachability(
  rows: CsvRow[],
  results: ReadonlyMap<string, ReachabilityResult>,
): CsvRow[] {
  return rows.map((r) => {
    const reach = results.get(r.pds);
    if (!reach) return r;
    const reachPct =
      reach.tested > 0 ? Math.round((reach.reachable / reach.tested) * 100).toString() : '';
    return {
      ...r,
      samplesTested: reach.tested,
      samplesReachable: reach.reachable,
      reachablePct: reachPct,
    };
  });
}

/**
 * Checks every host with at least MIN_CREATES creates, checkpoints the CSV
 * periodically, then writes the final CSV and the scan-date file. Exits with
 * status 1 when the CSV does not exist.
 */
async function main(): Promise<void> {
  if (!existsSync(CSV_PATH)) {
    console.error(`CSV not found: ${CSV_PATH}`);
    process.exit(1);
  }

  const rows = parseCsv(CSV_PATH);
  const toCheck = rows.filter((r) => r.creates >= MIN_CREATES);
  console.log(`Checking reachability for ${toCheck.length} PDS hosts (${rows.length} total)\n`);

  const reachabilityMap = new Map<string, ReachabilityResult>();
  let completed = 0;
  const startTime = Date.now();

  for (const row of toCheck) {
    const target = samplesForPds(row.creates);
    console.log(
      ` [${completed + 1}/${toCheck.length}] ${row.pds.slice(0, 45).padEnd(45)} fetching ${target} samples... [${fmt(row.creates)} DIDs]`,
    );

    const samples = await fetchSamples(row.pds, target, row.creates);
    completed++;

    if (samples.length === 0) {
      console.log(' -> 0 samples found, skipping');
    } else {
      let reachable = 0;
      for (const did of samples) {
        if (await checkDidReachable(did)) reachable++;
        await sleep(50);
      }

      const pct = Math.round((reachable / samples.length) * 100);
      reachabilityMap.set(row.pds, { tested: samples.length, reachable });

      const elapsed = (Date.now() - startTime) / 1000;
      const avgPerHost = elapsed / completed;
      const etaMin = Math.round(((toCheck.length - completed) * avgPerHost) / 60);
      console.log(` -> ${reachable}/${samples.length} (${pct}%) reachable [ETA: ~${etaMin}min]`);
    }

    // Write progress to the CSV periodically so an interrupted run keeps its
    // results. This runs for skipped hosts too, so no checkpoint is missed.
    if (completed % CHECKPOINT_EVERY === 0) {
      writeCsv(CSV_PATH, applyReachability(rows, reachabilityMap));
      console.log(` [checkpoint] CSV updated after ${completed} hosts`);
    }
  }

  // Final CSV write
  const today = new Date().toISOString().slice(0, 10);
  writeCsv(CSV_PATH, applyReachability(rows, reachabilityMap));
  writeFileSync(THROUGH_PATH, today + '\n');

  console.log('\n' + '='.repeat(60));
  console.log(`CSV updated: ${CSV_PATH}`);
  console.log(`Reachability date written: ${THROUGH_PATH} (${today})`);
  console.log('='.repeat(60));
}

main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});