web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.

add site_archive.py: recursive website archiver

BFS-crawls a site, archives each page as a bundle via web_archive.py, and
creates a site manifest record (systems.witchcraft.archive.site) linking
all pages together. internal links are rewritten to point to sibling captures.

features:
- configurable depth and max pages
- dry-run mode to preview crawl
- polite crawl delay between requests
- skips binary files (images, fonts, etc)
- site manifest with page list and link map
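
for reference, a rough sketch of the manifest record shape written by create_site_manifest() below. field names come from the code; every value here is an illustrative placeholder, not a real capture:

```python
# illustrative systems.witchcraft.archive.site record, matching the fields
# create_site_manifest() writes; all values below are placeholders
record = {
    '$type': 'systems.witchcraft.archive.site',
    'name': 'example.com',
    'rootUrl': 'https://example.com',
    'capturedAt': '2025-01-01T00:00:00Z',
    'pageCount': 2,
    'pages': [
        {'url': 'https://example.com/', 'title': 'home', 'depth': 0,
         'bundleUri': 'at://<did>/systems.witchcraft.archive.bundle/<rkey-0>',
         'bundleRkey': '<rkey-0>'},
        {'url': 'https://example.com/about', 'title': 'about', 'depth': 1,
         'bundleUri': 'at://<did>/systems.witchcraft.archive.bundle/<rkey-1>',
         'bundleRkey': '<rkey-1>'},
    ],
    # linkMap: normalized page URL -> sibling bundle rkey
    'linkMap': {
        'https://example.com/': '<rkey-0>',
        'https://example.com/about': '<rkey-1>',
    },
}
```

the linkMap is the same mapping rewrite_links() uses to point internal hrefs at the sibling bundle's viewer URL.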

Kira 6a1b6b63 cd8b632d

+533 -2
+25 -2
README.md
···
  # web-archive

- web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.
+ web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning. includes a recursive site crawler for archiving entire websites.

  ## what it does

  - **single mode**: archives a single HTML page with its CID stored as a `systems.witchcraft.archive.capture` ATProto record
- - **bundle mode**: archives a page + all subresources (CSS, JS, images, fonts) as a MASL bundle (`ing.dasl.masl` record) — each resource gets its own content-addressed blob on your PDS
+ - **bundle mode**: archives a page + all subresources (CSS, JS, images, fonts) as a MASL bundle — each resource gets its own content-addressed blob on your PDS
+ - **site mode** (`site_archive.py`): recursively crawls a website (BFS), archives each page as a bundle, creates a site manifest linking them all together. internal links are rewritten to point to sibling archive captures.
  - **CSS url() scanning**: follows `@import` and `url()` references in stylesheets to capture fonts, background images, etc.
  - **IPFS pinning**: optionally pins the HTML to IPFS via a local kubo node
  - **PDS blob storage**: uploads all resources as PDS blobs with proper content-type headers
···
  python web_archive.py --verify <rkey>
  ```

+ ### site archiver
+
+ ```bash
+ # dry-run: show what would be archived
+ python site_archive.py https://example.com --dry-run
+
+ # archive a site (default: depth 2, max 30 pages)
+ python site_archive.py https://example.com
+
+ # customize crawl depth and page limit
+ python site_archive.py https://example.com --depth 3 --max-pages 50
+
+ # list site archives
+ python site_archive.py --list
+
+ # show status of a site archive
+ python site_archive.py --status <rkey>
+ ```
+
  ## auth

  set these environment variables:
···
  - `blobs`: path → ATProto blob ref (for content retrieval from PDS)
  - archive metadata (url, title, capturedAt, etc) at top level
  - see [MASL spec](https://dasl.ing/masl.html) for the manifest format
+ - `systems.witchcraft.archive.site` — site manifests linking multiple page bundles
+   - page list with URLs, titles, depths, and bundle rkeys
+   - link map for internal link rewriting between archived pages

  ## viewer
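to make the link rewriting concrete, a small sketch of what rewrite_links() in site_archive.py (below) does to an internal anchor. it assumes you run it next to site_archive.py with bs4 installed; the rkey is a placeholder:

```python
# sketch: rewrite_links() swaps an internal href for the sibling archive's
# pdsls viewer URL and records the original target; rkey is a placeholder
from site_archive import rewrite_links

html = '<p><a href="/about">about us</a></p>'
link_map = {'https://example.com/about': '3kexamplerkey22'}
rewritten = rewrite_links(html, link_map, 'https://example.com/', 'example.com')
# the <a> now points at .../systems.witchcraft.archive.bundle/3kexamplerkey22
# and carries data-archive-original="/about" and data-archive-rkey attributes
print(rewritten)
```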
+508
site_archive.py
#!/usr/bin/env python3
"""
site_archive.py - Recursively archive an entire website to ATProto.

Crawls a site starting from a root URL, archives each page as a bundle
using web_archive.py's bundle mode, and rewrites internal links to point
to sibling archive captures.

Usage:
    python site_archive.py <url>                  # Archive site (default depth 2)
    python site_archive.py <url> --depth 3        # Max crawl depth
    python site_archive.py <url> --max-pages 20   # Max pages to archive
    python site_archive.py <url> --dry-run        # Show what would be archived
    python site_archive.py <url> --no-ipfs        # Skip IPFS pinning
    python site_archive.py --list                 # List site archives
    python site_archive.py --status <site-rkey>   # Show archive status

Each page becomes a bundle record. A site manifest record ties them together:
- systems.witchcraft.archive.site (manifest with page list + link map)
- systems.witchcraft.archive.bundle (one per page, as before)

Internal links are rewritten to reference sibling bundle rkeys, so the
archive is self-contained and navigable without the live site.
"""

import argparse
import json
import os
import re
import sys
import time
from collections import deque
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup


# --- config ---

PDS = os.environ.get("ATP_PDS_URL", "https://pds.witchcraft.systems")
COLLECTION_BUNDLE = "systems.witchcraft.archive.bundle"
COLLECTION_SITE = "systems.witchcraft.archive.site"

MAX_PAGES_DEFAULT = 30
MAX_DEPTH_DEFAULT = 2
MAX_PAGE_SIZE = 5 * 1024 * 1024  # 5MB per page
CRAWL_DELAY = 1.0  # seconds between requests (be polite)

SKIP_EXTENSIONS = {
    '.pdf', '.zip', '.tar', '.gz', '.bz2', '.xz',
    '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.ico',
    '.mp3', '.mp4', '.wav', '.ogg', '.webm', '.avi',
    '.woff', '.woff2', '.ttf', '.eot',
    '.exe', '.dmg', '.deb', '.rpm',
}


# --- auth ---

def get_session():
    """Get an ATProto session using KeePass credentials."""
    try:
        from pykeepass import PyKeePass
        kp = PyKeePass('/home/astra/clawd/.kira.kdbx',
                       keyfile='/home/astra/clawd/.keyfile')
        entry = kp.find_entries(title='Bluesky - kira.pds.witchcraft.systems',
                                first=True)
        resp = requests.post(f'{PDS}/xrpc/com.atproto.server.createSession',
                             json={'identifier': entry.username, 'password': entry.password})
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        print(f"auth error: {e}")
        sys.exit(1)


# --- crawling ---

def normalize_url(url, base_domain):
    """Normalize a URL for deduplication. Returns None if the URL should be skipped."""
    parsed = urlparse(url)

    # Only http/https
    if parsed.scheme not in ('http', 'https', ''):
        return None

    # Must be same domain
    if parsed.netloc and parsed.netloc != base_domain:
        return None

    # Skip binary files
    path_lower = parsed.path.lower()
    for ext in SKIP_EXTENSIONS:
        if path_lower.endswith(ext):
            return None

    # Skip fragment-only links
    if not parsed.path and not parsed.netloc:
        return None

    # Remove fragment, normalize trailing slash
    clean = urlunparse((
        parsed.scheme or 'https',
        parsed.netloc or base_domain,
        parsed.path.rstrip('/') or '/',
        parsed.params,
        parsed.query,
        ''  # no fragment
    ))
    return clean


def extract_page_links(html, base_url, base_domain):
    """Extract all same-domain page links from HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    links = set()

    for tag in soup.find_all('a', href=True):
        href = tag['href'].strip()
        if not href or href.startswith('#') or href.startswith('javascript:'):
            continue

        abs_url = urljoin(base_url, href)
        normalized = normalize_url(abs_url, base_domain)
        if normalized:
            links.add(normalized)

    return links


def fetch_page(url):
    """Fetch a page. Returns (html_str, title, final_url, status) or None."""
    try:
        resp = requests.get(url, timeout=30, headers={
            'User-Agent': 'SiteArchive/1.0 (ATProto recursive web archiver)'
        }, allow_redirects=True)

        content_type = resp.headers.get('content-type', '')
        if 'text/html' not in content_type and 'application/xhtml' not in content_type:
            return None  # not a page

        if len(resp.content) > MAX_PAGE_SIZE:
            print(f"  SKIP (too large: {len(resp.content):,}b): {url}")
            return None

        html = resp.text
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else ''

        return (html, title, resp.url, resp.status_code)
    except Exception as e:
        print(f"  FAIL: {url} ({e})")
        return None


def crawl_site(root_url, max_depth=MAX_DEPTH_DEFAULT, max_pages=MAX_PAGES_DEFAULT):
    """BFS crawl a site. Returns a list of page dicts (url, final_url, title, html, depth, status)."""
    parsed_root = urlparse(root_url)
    base_domain = parsed_root.netloc

    visited = set()
    pages = []
    queue = deque([(root_url, 0)])  # (url, depth)

    print(f"crawling {base_domain} (max depth: {max_depth}, max pages: {max_pages})")
    print()

    while queue and len(pages) < max_pages:
        url, depth = queue.popleft()

        if url in visited:
            continue
        visited.add(url)

        if depth > max_depth:
            continue

        result = fetch_page(url)
        if result is None:
            continue

        html, title, final_url, status = result
        pages.append({
            'url': url,
            'final_url': final_url,
            'title': title,
            'html': html,
            'depth': depth,
            'status': status,
        })

        prefix = "  " * depth
        print(f"{prefix}[d{depth}] {title[:60] or url}")

        # Extract links for next level
        if depth < max_depth:
            links = extract_page_links(html, final_url, base_domain)
            new_links = links - visited
            for link in sorted(new_links):
                queue.append((link, depth + 1))

        # Be polite
        if len(pages) < max_pages and queue:
            time.sleep(CRAWL_DELAY)

    print(f"\ncrawled {len(pages)} pages ({len(visited)} URLs visited)")
    return pages


# --- archiving ---

def archive_page_as_bundle(url, no_ipfs=False):
    """Archive a single page using web_archive.py's bundle mode.
    Returns (record URI, rkey), or None on failure."""
    import subprocess
    cmd = ['python', '/home/astra/clawd/scripts/web_archive.py', url, '--bundle']
    if no_ipfs:
        cmd.append('--no-ipfs')

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        output = result.stdout

        # Parse the URI from output
        for line in output.split('\n'):
            if line.startswith('bundled!'):
                uri = line.split('bundled!')[1].strip()
                rkey = uri.split('/')[-1]
                return uri, rkey

        # Fallback: look for "at://" in output
        for line in output.split('\n'):
            if 'at://' in line and COLLECTION_BUNDLE in line:
                match = re.search(r'(at://[^\s]+)', line)
                if match:
                    uri = match.group(1)
                    rkey = uri.split('/')[-1]
                    return uri, rkey

        print("  archive failed (no URI in output)")
        if result.stderr:
            print(f"  stderr: {result.stderr[:200]}")
        return None
    except subprocess.TimeoutExpired:
        print(f"  archive timed out for {url}")
        return None
    except Exception as e:
        print(f"  archive error: {e}")
        return None


def rewrite_links(html, link_map, base_url, base_domain):
    """Rewrite internal links in HTML to point to archived versions.

    link_map: {original_url: bundle_rkey}
    Returns the modified HTML string.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for tag in soup.find_all('a', href=True):
        href = tag['href'].strip()
        if not href or href.startswith('#') or href.startswith('javascript:'):
            continue

        abs_url = urljoin(base_url, href)
        normalized = normalize_url(abs_url, base_domain)

        if normalized and normalized in link_map:
            rkey = link_map[normalized]
            # Rewrite to pdsls viewer URL
            tag['href'] = f'https://pdsls.dev/at/did:plc:2tqqxubv2lu4ahj35ysjer2r/{COLLECTION_BUNDLE}/{rkey}'
            tag['data-archive-original'] = href
            tag['data-archive-rkey'] = rkey

    return str(soup)


def create_site_manifest(session, root_url, pages_info, link_map):
    """Create a site manifest record that ties all page bundles together."""
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    parsed = urlparse(root_url)

    record = {
        '$type': COLLECTION_SITE,
        'name': parsed.netloc,
        'rootUrl': root_url,
        'capturedAt': now,
        'pageCount': len(pages_info),
        'pages': [],
        'linkMap': {},
    }

    for page in pages_info:
        page_entry = {
            'url': page['url'],
            'title': page.get('title', ''),
            'bundleUri': page.get('bundle_uri', ''),
            'bundleRkey': page.get('bundle_rkey', ''),
            'depth': page['depth'],
        }
        record['pages'].append(page_entry)

    # Include link rewriting map
    for url, rkey in link_map.items():
        record['linkMap'][url] = rkey

    # Generate a TID for the record key
    import random
    us = int(time.time() * 1_000_000)
    clock_id = random.randint(0, 1023)
    tid_int = (us << 10) | clock_id
    CHARSET = "234567abcdefghijklmnopqrstuvwxyz"
    tid = ""
    v = tid_int
    for _ in range(13):
        tid = CHARSET[v & 0x1F] + tid
        v >>= 5

    resp = requests.post(f'{PDS}/xrpc/com.atproto.repo.putRecord',
                         headers={'Authorization': f'Bearer {session["accessJwt"]}'},
                         json={
                             'repo': session['did'],
                             'collection': COLLECTION_SITE,
                             'rkey': tid,
                             'record': record,
                         })

    if resp.status_code == 200:
        result = resp.json()
        print(f"\nsite manifest: {result['uri']}")
        print(f"  view: https://pdsls.dev/at/{session['did']}/{COLLECTION_SITE}/{tid}")
        return result
    else:
        print(f"manifest creation failed: {resp.status_code} {resp.text[:200]}")
        return None


# --- commands ---

def cmd_archive_site(root_url, max_depth=MAX_DEPTH_DEFAULT,
                     max_pages=MAX_PAGES_DEFAULT, no_ipfs=False,
                     dry_run=False):
    """Main command: crawl and archive a site."""

    # Phase 1: Crawl
    pages = crawl_site(root_url, max_depth=max_depth, max_pages=max_pages)

    if not pages:
        print("no pages found!")
        return

    if dry_run:
        print("\n--- DRY RUN ---")
        print(f"would archive {len(pages)} pages:")
        for p in pages:
            prefix = "  " * p['depth']
            print(f"  {prefix}{p['url']}")
            print(f"  {prefix}  title: {p['title'][:60]}")
        return

    # Phase 2: Archive each page as a bundle
    print(f"\n--- archiving {len(pages)} pages ---\n")
    link_map = {}  # normalized_url -> bundle_rkey
    pages_info = []

    for i, page in enumerate(pages):
        print(f"\n[{i+1}/{len(pages)}] {page['url']}")
        result = archive_page_as_bundle(page['url'], no_ipfs=no_ipfs)

        if result:
            uri, rkey = result
            page['bundle_uri'] = uri
            page['bundle_rkey'] = rkey

            # Normalize URL for link map
            parsed_root = urlparse(root_url)
            normalized = normalize_url(page['url'], parsed_root.netloc)
            if normalized:
                link_map[normalized] = rkey
            # Also map final_url if different
            if page.get('final_url') and page['final_url'] != page['url']:
                norm_final = normalize_url(page['final_url'], parsed_root.netloc)
                if norm_final:
                    link_map[norm_final] = rkey

            pages_info.append(page)
            print(f"  archived: {rkey}")
        else:
            print("  FAILED to archive")

        # Rate limit between archives
        if i < len(pages) - 1:
            time.sleep(2)

    print(f"\n--- archived {len(pages_info)}/{len(pages)} pages ---")

    # Phase 3: Create site manifest
    if pages_info:
        session = get_session()
        create_site_manifest(session, root_url, pages_info, link_map)

    # Summary
    print("\n=== site archive complete ===")
    print(f"  root: {root_url}")
    print(f"  pages: {len(pages_info)}")
    print(f"  link map entries: {len(link_map)}")
    for p in pages_info:
        depth_prefix = "  " * p['depth']
        print(f"  {depth_prefix}{p['url']}")
        print(f"  {depth_prefix}  -> {p.get('bundle_rkey', '?')}")


def cmd_list_sites():
    """List site archive manifests."""
    session = get_session()
    resp = requests.get(f'{PDS}/xrpc/com.atproto.repo.listRecords',
                        headers={'Authorization': f'Bearer {session["accessJwt"]}'},
                        params={
                            'repo': session['did'],
                            'collection': COLLECTION_SITE,
                            'limit': 20,
                        })

    if resp.status_code != 200:
        print(f"error: {resp.status_code}")
        return

    records = resp.json().get('records', [])
    if not records:
        print("no site archives found")
        return

    print(f"site archives ({len(records)}):\n")
    for rec in records:
        val = rec['value']
        rkey = rec['uri'].split('/')[-1]
        print(f"  {val.get('name', '?')} ({val.get('pageCount', '?')} pages)")
        print(f"    root: {val.get('rootUrl', '?')}")
        print(f"    captured: {val.get('capturedAt', '?')}")
        print(f"    rkey: {rkey}")
        print()


def cmd_status(site_rkey):
    """Show status of a site archive."""
    session = get_session()
    resp = requests.get(f'{PDS}/xrpc/com.atproto.repo.getRecord',
                        headers={'Authorization': f'Bearer {session["accessJwt"]}'},
                        params={
                            'repo': session['did'],
                            'collection': COLLECTION_SITE,
                            'rkey': site_rkey,
                        })

    if resp.status_code != 200:
        print(f"not found: {site_rkey}")
        return

    val = resp.json()['value']
    print(f"site: {val.get('name', '?')}")
    print(f"root: {val.get('rootUrl', '?')}")
    print(f"captured: {val.get('capturedAt', '?')}")
    print(f"pages: {val.get('pageCount', '?')}")
    print()

    for page in val.get('pages', []):
        depth_prefix = "  " * page.get('depth', 0)
        print(f"  {depth_prefix}{page.get('url', '?')}")
        print(f"  {depth_prefix}  title: {page.get('title', '')[:60]}")
        print(f"  {depth_prefix}  bundle: {page.get('bundleRkey', '?')}")


def main():
    parser = argparse.ArgumentParser(
        description='Recursively archive a website to ATProto')
    parser.add_argument('url', nargs='?', help='Root URL to archive')
    parser.add_argument('--depth', type=int, default=MAX_DEPTH_DEFAULT,
                        help=f'Max crawl depth (default: {MAX_DEPTH_DEFAULT})')
    parser.add_argument('--max-pages', type=int, default=MAX_PAGES_DEFAULT,
                        help=f'Max pages to archive (default: {MAX_PAGES_DEFAULT})')
    parser.add_argument('--no-ipfs', action='store_true',
                        help='Skip IPFS pinning')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be archived without doing it')
    parser.add_argument('--list', action='store_true',
                        help='List site archives')
    parser.add_argument('--status', metavar='RKEY',
                        help='Show status of a site archive')

    args = parser.parse_args()

    if args.list:
        cmd_list_sites()
    elif args.status:
        cmd_status(args.status)
    elif args.url:
        cmd_archive_site(args.url, max_depth=args.depth,
                         max_pages=args.max_pages, no_ipfs=args.no_ipfs,
                         dry_run=args.dry_run)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()
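
usage note: a site manifest can be read back straight from the PDS with com.atproto.repo.getRecord, which is the same call cmd_status() makes. a minimal sketch, assuming you already have the repo DID, the site rkey from --list, and an access JWT:

```python
# minimal sketch: fetch a site manifest record from the PDS by rkey
# (did, rkey, and access_jwt are assumed inputs; same endpoint cmd_status() uses)
import os
import requests

PDS = os.environ.get("ATP_PDS_URL", "https://pds.witchcraft.systems")

def fetch_site_manifest(did, rkey, access_jwt):
    resp = requests.get(
        f"{PDS}/xrpc/com.atproto.repo.getRecord",
        headers={"Authorization": f"Bearer {access_jwt}"},
        params={"repo": did,
                "collection": "systems.witchcraft.archive.site",
                "rkey": rkey},
    )
    resp.raise_for_status()
    return resp.json()["value"]  # name, rootUrl, capturedAt, pages, linkMap
```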