Add a new script to scrape trusted verifiers from bskycheck · ricci.io/are-we-decentralized-yet@7a6686e

+1 -1

centralization_stats.py

# Different CSVs use different columns for the hostname
def get_domain(row):
    """Return the value of the first recognised identifier column in *row*.

    Columns are tried in a fixed priority order. The first one that is
    present wins, even when its value is empty. ``None`` is returned when
    the row contains none of the known columns.
    """
    candidate_columns = (
        "domain",
        "hostname",
        "instance",
        "name",
        "org_id",
        "e.id",
        "o.name",
        "a.asn",
        "asn",
        "provider",
        "verifier",
    )
    present = next(
        (column for column in candidate_columns if column in row),
        None,
    )
    if present is None:
        return None
    return row[present]

+77

data-fetchers/bsky-verifiers/fetch-bsky-verifiers.py

#!/usr/bin/env python3
"""Scrape the Trusted Verifiers table from bskycheck.com into a CSV file.

The page at ``URL`` is fetched, the table whose headers include
"Verifier" and "Users Verified" is located, and its rows are written to
``data/bsky-verifiers/<UTC timestamp>.csv`` (relative to the repository
root) with the columns ``verifier`` and ``count``.
"""

import csv
from datetime import datetime, timezone
from pathlib import Path
import socket

import requests
from bs4 import BeautifulSoup
from urllib3.util import connection as urllib3_connection

URL = "https://bskycheck.com/stats.php"


def _force_ipv4():
    """Patch urllib3's address-family hook so connections use IPv4 only."""
    urllib3_connection.allowed_gai_family = lambda: socket.AF_INET


def _normalize_header(text):
    """Collapse whitespace runs to single spaces and lowercase *text*."""
    # " ".join(text.split()) already drops leading/trailing whitespace.
    return " ".join(text.split()).lower()


def _parse_count(text, verifier=""):
    """Convert a count cell such as ``"1,234"`` to an int.

    Commas and any whitespace (including non-breaking spaces used as
    thousands separators) are removed before conversion.

    Raises:
        ValueError: if the remaining text is not a whole number; the
            message names the verifier and the offending cell text.
    """
    digits = "".join(text.split()).replace(",", "")
    try:
        return int(digits)
    except ValueError as err:
        raise ValueError(
            f"Unexpected 'Users Verified' value {text!r} for verifier {verifier!r}"
        ) from err


def find_verifiers_table(soup):
    """Return the first table with "verifier" and "users verified" headers.

    Header text is compared after whitespace normalisation and
    lowercasing. Returns ``None`` when no table matches.
    """
    for table in soup.find_all("table"):
        headers = [
            _normalize_header(th.get_text())
            for th in table.find_all("th")
        ]
        if "verifier" in headers and "users verified" in headers:
            return table
    return None


def extract_rows(table):
    """Return ``(verifier, users_verified)`` tuples for the rows of *table*.

    Rows with fewer than two ``<td>`` cells (such as a ``<th>``-only
    header row) are skipped. The first cell is the verifier name and the
    second is the verified-user count.

    Raises:
        ValueError: if a count cell cannot be parsed as a whole number.
    """
    rows = []
    for tr in table.find_all("tr"):
        cells = tr.find_all("td")
        if len(cells) < 2:
            continue
        verifier = cells[0].get_text(strip=True)
        users_verified = _parse_count(cells[1].get_text(strip=True), verifier)
        rows.append((verifier, users_verified))
    return rows


def main():
    """Fetch the stats page and write the verifier counts to a new CSV.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        RuntimeError: if the expected table is missing or has no data rows.
    """
    _force_ipv4()
    response = requests.get(URL, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    table = find_verifiers_table(soup)
    if table is None:
        raise RuntimeError("Could not find Trusted Verifiers table with expected headers.")

    rows = extract_rows(table)
    if not rows:
        raise RuntimeError("No rows found in Trusted Verifiers table.")

    # This script lives two directories below the repository root.
    repo_root = Path(__file__).resolve().parents[2]
    output_dir = repo_root / "data" / "bsky-verifiers"
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    output_file = output_dir / f"{timestamp}.csv"

    with output_file.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["verifier", "count"])
        writer.writerows(rows)

    print(f"Wrote {len(rows)} rows to {output_file}")


if __name__ == "__main__":
    main()

+166 -19

data-fetchers/geo/fetch-geo-hosts.py

··· 3 3 import argparse 4 4 import asyncio 5 5 import csv 6 + import ipaddress 6 7 import json 7 8 import os 8 9 import socket 9 10 import tarfile 11 + import urllib.request 10 12 from pathlib import Path 11 13 12 14 try: ··· 19 21 20 22 REPO_ROOT = Path(__file__).resolve().parents[2] 21 23 DATA_DIR = REPO_ROOT / "data" 24 + GEO_DIR = DATA_DIR / "geo" 22 25 ICON_DIR = Path("data-static") / "icons" 23 26 24 27 AT_BSKY_STYLE = { ··· 202 205 return None, None 203 206 204 207 205 - def is_cdn(details) -> bool: 208 + ALL_CDN_ORG_MARKERS = [ 209 + "akamai", 210 + "akamaiedge", 211 + "akamaitechnologies", 212 + "cloudflare", 213 + "edgesuite", 214 + "fastly", 215 + ] 216 + 217 + 218 + def fetch_url_text(url: str) -> str: 219 + request = urllib.request.Request( 220 + url, 221 + headers={"User-Agent": "are-we-decentralized-yet/1.0"}, 222 + ) 223 + with urllib.request.urlopen(request, timeout=30) as response: 224 + return response.read().decode("utf-8") 225 + 226 + 227 + def fetch_url_json(url: str) -> dict: 228 + return json.loads(fetch_url_text(url)) 229 + 230 + 231 + def ensure_cdn_ip_files(refresh: bool) -> dict[str, Path]: 232 + cdn_files = { 233 + "cloudflare": GEO_DIR / "cloudflare-ips.txt", 234 + "aws": GEO_DIR / "aws-ip-ranges.json", 235 + "gcp": GEO_DIR / "gcp-ip-ranges.json", 236 + "fastly": GEO_DIR / "fastly-ip-list.json", 237 + } 238 + if not refresh and all(path.exists() for path in cdn_files.values()): 239 + return cdn_files 240 + 241 + GEO_DIR.mkdir(parents=True, exist_ok=True) 242 + 243 + if refresh or not cdn_files["cloudflare"].exists(): 244 + ipv4 = fetch_url_text("https://www.cloudflare.com/ips-v4/") 245 + ipv6 = fetch_url_text("https://www.cloudflare.com/ips-v6/") 246 + combined = "\n".join( 247 + line.strip() 248 + for line in (ipv4 + "\n" + ipv6).splitlines() 249 + if line.strip() 250 + ) 251 + cdn_files["cloudflare"].write_text(combined + "\n", encoding="utf-8") 252 + 253 + if refresh or not cdn_files["aws"].exists(): 254 + aws_data = fetch_url_json( 
255 + "https://ip-ranges.amazonaws.com/ip-ranges.json" 256 + ) 257 + cdn_files["aws"].write_text( 258 + json.dumps(aws_data, ensure_ascii=True, indent=2), encoding="utf-8" 259 + ) 260 + 261 + if refresh or not cdn_files["gcp"].exists(): 262 + gcp_data = fetch_url_json( 263 + "https://www.gstatic.com/ipranges/cloud.json" 264 + ) 265 + cdn_files["gcp"].write_text( 266 + json.dumps(gcp_data, ensure_ascii=True, indent=2), encoding="utf-8" 267 + ) 268 + 269 + if refresh or not cdn_files["fastly"].exists(): 270 + fastly_data = fetch_url_json("https://api.fastly.com/public-ip-list") 271 + cdn_files["fastly"].write_text( 272 + json.dumps(fastly_data, ensure_ascii=True, indent=2), encoding="utf-8" 273 + ) 274 + 275 + return cdn_files 276 + 277 + 278 + def load_cloudflare_prefixes(path: Path) -> list[str]: 279 + if not path.exists(): 280 + return [] 281 + prefixes = [] 282 + for line in path.read_text(encoding="utf-8").splitlines(): 283 + line = line.strip() 284 + if line: 285 + prefixes.append(line) 286 + return prefixes 287 + 288 + 289 + def load_aws_cloudfront_prefixes(path: Path) -> list[str]: 290 + if not path.exists(): 291 + return [] 292 + data = json.loads(path.read_text(encoding="utf-8")) 293 + prefixes = [] 294 + for entry in data.get("prefixes", []): 295 + if entry.get("service") == "CLOUDFRONT": 296 + prefix = entry.get("ip_prefix") 297 + if prefix: 298 + prefixes.append(prefix) 299 + for entry in data.get("ipv6_prefixes", []): 300 + if entry.get("service") == "CLOUDFRONT": 301 + prefix = entry.get("ipv6_prefix") 302 + if prefix: 303 + prefixes.append(prefix) 304 + return prefixes 305 + 306 + 307 + def load_gcp_cdn_prefixes(path: Path) -> list[str]: 308 + if not path.exists(): 309 + return [] 310 + data = json.loads(path.read_text(encoding="utf-8")) 311 + prefixes = [] 312 + for entry in data.get("prefixes", []): 313 + if entry.get("service") != "Google Cloud CDN": 314 + continue 315 + prefix = entry.get("ipv4Prefix") or entry.get("ipv6Prefix") 316 + if prefix: 
317 + prefixes.append(prefix) 318 + return prefixes 319 + 320 + 321 + def load_fastly_prefixes(path: Path) -> list[str]: 322 + if not path.exists(): 323 + return [] 324 + data = json.loads(path.read_text(encoding="utf-8")) 325 + prefixes = [] 326 + for key in ("addresses", "ipv6_addresses"): 327 + for entry in data.get(key, []): 328 + if entry: 329 + prefixes.append(entry) 330 + return prefixes 331 + 332 + 333 + def build_cdn_networks(refresh: bool) -> list[ipaddress._BaseNetwork]: 334 + cdn_files = ensure_cdn_ip_files(refresh) 335 + prefix_lists = [ 336 + load_cloudflare_prefixes(cdn_files["cloudflare"]), 337 + load_aws_cloudfront_prefixes(cdn_files["aws"]), 338 + load_gcp_cdn_prefixes(cdn_files["gcp"]), 339 + load_fastly_prefixes(cdn_files["fastly"]), 340 + ] 341 + networks = [] 342 + for prefixes in prefix_lists: 343 + for prefix in prefixes: 344 + try: 345 + networks.append(ipaddress.ip_network(prefix, strict=False)) 346 + except ValueError: 347 + continue 348 + return networks 349 + 350 + 351 + def is_cdn(details, ip: str | None, cdn_networks: list[ipaddress._BaseNetwork]) -> bool: 206 352 org = get_detail_field(details, "org") 207 353 asn = get_detail_field(details, "asn") 208 354 asn_domain = None ··· 221 367 ] 222 368 if s 223 369 ] 224 - cdn_markers = [ 225 - "akamai", 226 - "akamaiedge", 227 - "akamaitechnologies", 228 - "amazonaws", 229 - "cdn", 230 - "cloudflare", 231 - "cloudfront", 232 - "edgesuite", 233 - "fastly", 234 - "gcore", 235 - "googleusercontent", 236 - "google", 237 - "stackpath", 238 - "stackpathdns", 239 - ] 240 - return any(any(marker in s for marker in cdn_markers) for s in haystacks) 370 + if any( 371 + any(marker in s for marker in ALL_CDN_ORG_MARKERS) for s in haystacks 372 + ): 373 + return True 374 + 375 + if not ip: 376 + return False 377 + try: 378 + ip_value = ipaddress.ip_address(ip) 379 + except ValueError: 380 + return False 381 + return any(ip_value in network for network in cdn_networks) 241 382 242 383 243 384 async def 
resolve_hostnames( ··· 338 479 default=str(DATA_DIR / "cache/dns-cache.json"), 339 480 help="Path to DNS cache JSON (default: data/cache/dns-cache.json)", 340 481 ) 482 + parser.add_argument( 483 + "--refresh-cdn-ips", 484 + action="store_true", 485 + help="Refetch CDN IP range data even if cached files exist", 486 + ) 341 487 args = parser.parse_args() 342 488 343 489 csv_path, hosts = load_hosts(args.source) ··· 370 516 cache = load_cache(cache_path) 371 517 dns_cache_path = Path(args.dns_cache) 372 518 dns_cache = load_cache(dns_cache_path) 519 + cdn_networks = build_cdn_networks(args.refresh_cdn_ips) 373 520 374 521 hostnames = [entry["hostname"] for entry in hosts] 375 522 details_by_ip: dict[str, object] = {} ··· 496 643 "lat": lat, 497 644 "lon": lon, 498 645 "network": extract_network(details), 499 - "cdn": is_cdn(details), 646 + "cdn": is_cdn(details, ip, cdn_networks), 500 647 "anycast": bool(get_detail_field(details, "anycast")), 501 648 "color": style.get("color"), 502 649 "icon": style.get("icon"),

+10 -1

helpers/update-datafile.py

··· 180 180 fedi_csv = find_newest_file(REPO_ROOT / "data" / "fedi-mau") 181 181 at_csv = find_newest_file(REPO_ROOT / "data" / "at-mau") 182 182 git_csv = find_newest_file(REPO_ROOT / "data" / "git") 183 + bsky_verifiers_csv = find_newest_file(REPO_ROOT / "data" / "bsky-verifiers") 183 184 184 185 fedi_dt = parse_timestamp_from_name(fedi_csv.name) 185 186 at_dt = parse_timestamp_from_name(at_csv.name) 186 187 git_dt = parse_timestamp_from_name(git_csv.name) 187 - if fedi_dt is None or at_dt is None or git_dt is None: 188 + bsky_verifiers_dt = parse_timestamp_from_name(bsky_verifiers_csv.name) 189 + if fedi_dt is None or at_dt is None or git_dt is None or bsky_verifiers_dt is None: 188 190 raise RuntimeError("Unable to parse timestamps for latest files") 189 191 190 192 update_network( ··· 207 209 git_csv, 208 210 git_dt.strftime("%m-%d-%Y"), 209 211 data_file=str(git_csv.relative_to(REPO_ROOT)), 212 + ) 213 + update_network( 214 + data, 215 + "bsky_verifiers", 216 + bsky_verifiers_csv, 217 + bsky_verifiers_dt.strftime("%m-%d-%Y"), 218 + data_file=str(bsky_verifiers_csv.relative_to(REPO_ROOT)), 210 219 ) 211 220 212 221 week_target = datetime.now(timezone.utc) - timedelta(days=7)

Configure Feed

Configure Feed