web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.
6
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 758 lines 28 kB view raw
1#!/usr/bin/env python3 2""" 3web_archive.py - Capture and archive web pages to ATProto with IPFS pinning. 4 5Creates signed, timestamped records of web page captures on your PDS. 6Each capture includes an IPFS CID (content-addressed identifier) and a PDS blob. 7 8Supports two modes: 9 - Single capture: archive one URL as a `systems.witchcraft.archive.capture` record 10 - Bundle capture: archive a page + all its subresources (CSS, JS, images) as a 11 `systems.witchcraft.archive.bundle` record containing a MASL-shaped manifest 12 with CID-addressed resources (MASL spec: https://dasl.ing/masl.html) 13 14Usage: 15 python web_archive.py <url> # Archive a URL (single capture) 16 python web_archive.py <url> --bundle # Archive URL + subresources as MASL bundle 17 python web_archive.py --list [--limit N] # List recent captures 18 python web_archive.py --verify <rkey> # Re-fetch and verify CID 19 python web_archive.py --search <query> # Search captures by URL/title 20""" 21 22import argparse 23import hashlib 24import json 25import mimetypes 26import os 27import re 28import sys 29from datetime import datetime, timezone 30from urllib.parse import urljoin, urlparse 31 32import requests 33from multiformats import CID, multihash 34 35 36# --- config --- 37 38PDS = os.environ.get("ATP_PDS_URL", "https://bsky.social") 39ATP_HANDLE = os.environ.get("ATP_HANDLE", "") 40ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "") 41COLLECTION_CAPTURE = "systems.witchcraft.archive.capture" 42COLLECTION_BUNDLE = "ing.dasl.masl" 43COLLECTION_MASL_LEGACY = COLLECTION_BUNDLE # consolidated 44 45# Max subresources to fetch per bundle (safety limit) 46MAX_SUBRESOURCES = 100 47# Max size per individual resource (10MB) 48MAX_RESOURCE_SIZE = 10 * 1024 * 1024 49# Allowed subresource schemes 50ALLOWED_SCHEMES = {"http", "https"} 51 52 53# --- auth --- 54 55def get_session(): 56 handle = ATP_HANDLE 57 password = ATP_PASSWORD 58 59 if not handle or not password: 60 print("error: set ATP_HANDLE and ATP_PASSWORD environment variables") 61 print(" export ATP_PDS_URL=https://your.pds.example.com") 62 print(" export ATP_HANDLE=your.handle") 63 print(" export ATP_PASSWORD=your-app-password") 64 sys.exit(1) 65 66 resp = requests.post(f"{PDS}/xrpc/com.atproto.server.createSession", 67 json={"identifier": handle, "password": password}) 68 resp.raise_for_status() 69 return resp.json() 70 71 72# --- fetch --- 73 74def fetch_page(url): 75 """Fetch a URL and return (html_bytes, title, final_url, word_count, status_code).""" 76 headers = { 77 "User-Agent": "WebArchive/1.0 (ATProto web archiver)" 78 } 79 resp = requests.get(url, headers=headers, timeout=30, allow_redirects=True) 80 resp.raise_for_status() 81 html_bytes = resp.content 82 html = html_bytes.decode("utf-8", errors="replace") 83 final_url = resp.url 84 85 # Extract title 86 title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL) 87 title = title_match.group(1).strip() if title_match else "" 88 title = title.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">") 89 title = title.replace("&#39;", "'").replace("&quot;", '"') 90 title = title[:256] 91 92 # Word count from text extraction 93 text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) 94 text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) 95 text = re.sub(r"<[^>]+>", " ", text) 96 text = re.sub(r"\s+", " ", text).strip() 97 word_count = len(text.split()) 98 99 return html_bytes, title, final_url, word_count, resp.status_code 100 101 102def fetch_resource(url): 103 """Fetch a subresource. Returns (bytes, content_type, final_url) or None on failure.""" 104 headers = { 105 "User-Agent": "WebArchive/1.0 (ATProto web archiver)" 106 } 107 try: 108 resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True, 109 stream=True) 110 resp.raise_for_status() 111 112 # Check size before downloading full body 113 content_length = resp.headers.get("content-length") 114 if content_length and int(content_length) > MAX_RESOURCE_SIZE: 115 print(f" skip (too large: {int(content_length):,} bytes): {url[:80]}") 116 return None 117 118 content = resp.content 119 if len(content) > MAX_RESOURCE_SIZE: 120 print(f" skip (too large: {len(content):,} bytes): {url[:80]}") 121 return None 122 123 content_type = resp.headers.get("content-type", "application/octet-stream") 124 # Strip charset/params from content-type for MASL 125 content_type = content_type.split(";")[0].strip() 126 return content, content_type, resp.url 127 except Exception as e: 128 print(f" skip (error: {e}): {url[:80]}") 129 return None 130 131 132def extract_subresource_urls(html, base_url): 133 """Extract URLs of subresources (CSS, JS, images, fonts) from HTML.""" 134 urls = {} # url -> expected type hint 135 136 # CSS: <link rel="stylesheet" href="..."> 137 for m in re.finditer(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE): 138 urls[m.group(1)] = "text/css" 139 for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\']stylesheet["\']', html, re.IGNORECASE): 140 urls[m.group(1)] = "text/css" 141 142 # JS: <script src="..."> 143 for m in re.finditer(r'<script[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE): 144 urls[m.group(1)] = "application/javascript" 145 146 # Images: <img src="...">, <source srcset="..."> 147 for m in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE): 148 urls[m.group(1)] = "image/*" 149 for m in re.finditer(r'<source[^>]+srcset=["\']([^"\']+)["\']', html, re.IGNORECASE): 150 # srcset can have multiple URLs with widths 151 for part in m.group(1).split(","): 152 src = part.strip().split()[0] 153 if src: 154 urls[src] = "image/*" 155 156 # Favicons and other <link> with href 157 for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]*>', html, re.IGNORECASE): 158 href = m.group(1) 159 if href not in urls: 160 # Check if it's an icon or other resource 161 tag = m.group(0).lower() 162 if 'rel="icon"' in tag or "rel='icon'" in tag or 'rel="apple-touch-icon"' in tag: 163 urls[href] = "image/*" 164 165 # CSS @import and url() in inline <style> 166 for style_match in re.finditer(r'<style[^>]*>(.*?)</style>', html, re.DOTALL | re.IGNORECASE): 167 style_content = style_match.group(1) 168 for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', style_content): 169 url_val = m.group(1) 170 if not url_val.startswith("data:"): 171 urls[url_val] = "application/octet-stream" 172 for m in re.finditer(r'@import\s+["\']([^"\']+)["\']', style_content): 173 urls[m.group(1)] = "text/css" 174 175 # Resolve relative URLs and filter 176 resolved = {} 177 base_parsed = urlparse(base_url) 178 for url, type_hint in urls.items(): 179 if url.startswith("data:") or url.startswith("#") or url.startswith("javascript:"): 180 continue 181 absolute = urljoin(base_url, url) 182 parsed = urlparse(absolute) 183 # Only same-origin resources (MASL paths must start with /) 184 if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc: 185 path = parsed.path or "/" 186 if path not in resolved: 187 resolved[path] = (absolute, type_hint) 188 189 return resolved 190 191 192def compute_cid(content_bytes): 193 """Compute IPFS CIDv1 (raw codec, sha2-256) for content bytes.""" 194 digest = hashlib.sha256(content_bytes).digest() 195 mh = multihash.wrap(digest, "sha2-256") 196 cid = CID("base32", 1, "raw", mh) 197 return str(cid) 198 199 200def hash_content(content_bytes): 201 """SHA-256 hash of content bytes.""" 202 return hashlib.sha256(content_bytes).hexdigest() 203 204 205def pin_to_ipfs(content_bytes, filename="capture.html", content_type="text/html"): 206 """Pin content to local IPFS node. Returns (cid, pinned).""" 207 try: 208 resp = requests.post("http://127.0.0.1:5001/api/v0/add", 209 files={"file": (filename, content_bytes, content_type)}, 210 params={"pin": "true", "cid-version": "1", "raw-leaves": "false"}, 211 timeout=30) 212 resp.raise_for_status() 213 data = resp.json() 214 return data["Hash"], True 215 except Exception as e: 216 return compute_cid(content_bytes), False 217 218 219def upload_blob(session, content_bytes, content_type="text/html"): 220 """Upload content as a blob to the PDS. Returns blob ref.""" 221 try: 222 resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.uploadBlob", 223 headers={ 224 "Authorization": f"Bearer {session['accessJwt']}", 225 "Content-Type": content_type, 226 }, 227 data=content_bytes, 228 timeout=60) 229 resp.raise_for_status() 230 return resp.json().get("blob") 231 except Exception as e: 232 print(f" warning: blob upload failed ({e})") 233 return None 234 235 236def announce_ipfs_content(cid): 237 """Announce content to IPFS DHT so it's discoverable on public gateways.""" 238 try: 239 resp = requests.post("http://127.0.0.1:5001/api/v0/routing/provide", 240 params={"arg": cid}, 241 timeout=60) 242 return resp.status_code == 200 243 except: 244 return False 245 246 247# --- atproto records --- 248 249def create_capture(session, url, final_url, title, cid, content_hash, word_count, 250 html_size, pinned=False, blob_ref=None): 251 """Create a single-page archive capture record.""" 252 now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") 253 254 record = { 255 "$type": COLLECTION_CAPTURE, 256 "url": url, 257 "capturedAt": now, 258 "ipfsCid": cid, 259 "contentHash": f"sha256:{content_hash}", 260 "title": title, 261 "wordCount": word_count, 262 "htmlSize": html_size, 263 "pinned": pinned, 264 } 265 266 if final_url != url: 267 record["finalUrl"] = final_url 268 if blob_ref: 269 record["htmlBlob"] = blob_ref 270 271 resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord", 272 headers={"Authorization": f"Bearer {session['accessJwt']}"}, 273 json={ 274 "repo": session["did"], 275 "collection": COLLECTION_CAPTURE, 276 "record": record, 277 }) 278 resp.raise_for_status() 279 return resp.json() 280 281 282def create_bundle_record(session, name, url, resources_map, blobs_map, 283 captured_at, archive_meta=None): 284 """Create an ing.dasl.masl record on ATProto (Web Tiles format). 285 286 Uses MASL bundle format with archive metadata namespaced under 287 systems.witchcraft.archive. 288 289 resources_map: dict of path -> {src: blob_ref, content-type: str} 290 blobs_map: dict of path -> blob_ref (kept for compat, merged into resources) 291 archive_meta: dict with url, capturedAt, title, etc. 292 """ 293 # Merge blob refs into resources (MASL format: resources[path].src = blob ref) 294 merged_resources = {} 295 for path, res_data in resources_map.items(): 296 entry = dict(res_data) if isinstance(res_data, dict) else {"src": res_data} 297 if path in blobs_map: 298 entry["src"] = blobs_map[path] 299 merged_resources[path] = entry 300 301 record = { 302 "$type": COLLECTION_BUNDLE, 303 # MASL required fields at top level 304 "name": name, 305 "resources": merged_resources, 306 } 307 308 # Archive metadata namespaced 309 archive_ns = { 310 "url": archive_meta.get("url", url) if archive_meta else url, 311 "title": archive_meta.get("title", name) if archive_meta else name, 312 "capturedAt": captured_at, 313 } 314 if archive_meta: 315 for key in ["wordCount", "totalSize", "resourceCount", 316 "rootIpfsCid", "contentHash", "pinned", "finalUrl"]: 317 if key in archive_meta: 318 archive_ns[key] = archive_meta[key] 319 record["systems.witchcraft.archive"] = archive_ns 320 321 resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord", 322 headers={"Authorization": f"Bearer {session['accessJwt']}"}, 323 json={ 324 "repo": session["did"], 325 "collection": COLLECTION_BUNDLE, 326 "record": record, 327 }) 328 resp.raise_for_status() 329 return resp.json() 330 331 332# --- commands --- 333 334def cmd_archive(url, no_ipfs=False, no_blob=False): 335 """Archive a single URL.""" 336 print(f"fetching {url}...") 337 html_bytes, title, final_url, word_count, status = fetch_page(url) 338 content_hash = hash_content(html_bytes) 339 html_size = len(html_bytes) 340 341 if not no_ipfs: 342 print(f" pinning to IPFS...") 343 cid, pinned = pin_to_ipfs(html_bytes) 344 if pinned: 345 announce_ipfs_content(cid) 346 else: 347 cid = compute_cid(html_bytes) 348 pinned = False 349 350 print(f" title: {title[:80]}") 351 print(f" final url: {final_url}") 352 print(f" cid: {cid}") 353 print(f" pinned: {'yes' if pinned else 'no (local CID only)'}") 354 print(f" hash: sha256:{content_hash[:16]}...") 355 print(f" size: {html_size:,} bytes, ~{word_count:,} words") 356 357 session = get_session() 358 359 blob_ref = None 360 if not no_blob: 361 print(f" uploading blob to PDS...") 362 blob_ref = upload_blob(session, html_bytes) 363 if blob_ref: 364 print(f" blob uploaded!") 365 366 result = create_capture(session, url, final_url, title, cid, content_hash, 367 word_count, html_size, pinned=pinned, blob_ref=blob_ref) 368 uri = result.get("uri", "") 369 rkey = uri.split("/")[-1] if uri else "?" 370 371 print(f"\ncaptured! {uri}") 372 print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_CAPTURE}/{rkey}") 373 return uri 374 375 376def cmd_bundle(url, no_ipfs=False, max_resources=MAX_SUBRESOURCES): 377 """Archive a URL + all its subresources as a MASL bundle.""" 378 print(f"fetching {url}...") 379 html_bytes, title, final_url, word_count, status = fetch_page(url) 380 html = html_bytes.decode("utf-8", errors="replace") 381 html_size = len(html_bytes) 382 383 print(f" title: {title[:80]}") 384 print(f" size: {html_size:,} bytes, ~{word_count:,} words") 385 386 # Extract subresource URLs 387 print(f"\nscanning for subresources...") 388 subresource_urls = extract_subresource_urls(html, final_url) 389 print(f" found {len(subresource_urls)} same-origin subresources") 390 391 if len(subresource_urls) > max_resources: 392 print(f" capping at {max_resources} (use --max-resources to change)") 393 # Keep only the first N 394 subresource_urls = dict(list(subresource_urls.items())[:max_resources]) 395 396 # Fetch all subresources 397 print(f"\nfetching subresources...") 398 resources = {} # path -> (bytes, content_type, ipfs_cid) 399 400 for path, (abs_url, type_hint) in subresource_urls.items(): 401 result = fetch_resource(abs_url) 402 if result: 403 res_bytes, content_type, _ = result 404 print(f" ok ({len(res_bytes):,}b {content_type}): {path}") 405 resources[path] = (res_bytes, content_type) 406 407 # Phase 2: Scan fetched CSS files for url() references (fonts, images, etc) 408 css_extra_urls = {} 409 for path, (res_bytes, content_type) in list(resources.items()): 410 if "css" in content_type: 411 css_text = res_bytes.decode("utf-8", errors="replace") 412 css_base = urljoin(final_url, path) 413 for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', css_text): 414 url_val = m.group(1) 415 if url_val.startswith("data:"): 416 continue 417 abs_url = urljoin(css_base, url_val) 418 parsed = urlparse(abs_url) 419 base_parsed = urlparse(final_url) 420 if parsed.scheme in ALLOWED_SCHEMES: 421 css_path = parsed.path or "/" 422 if css_path not in resources and css_path not in css_extra_urls: 423 css_extra_urls[css_path] = (abs_url, "application/octet-stream") 424 425 if css_extra_urls: 426 print(f"\n found {len(css_extra_urls)} extra resources from CSS url() refs") 427 for path, (abs_url, _) in css_extra_urls.items(): 428 if len(resources) >= max_resources: 429 break 430 result = fetch_resource(abs_url) 431 if result: 432 res_bytes, content_type, _ = result 433 print(f" ok ({len(res_bytes):,}b {content_type}): {path}") 434 resources[path] = (res_bytes, content_type) 435 436 print(f"\n fetched {len(resources)} total subresources") 437 438 # Get session for blob uploads 439 session = get_session() 440 441 # Process root page 442 print(f"\nprocessing root page...") 443 if not no_ipfs: 444 root_cid, root_pinned = pin_to_ipfs(html_bytes) 445 if root_pinned: 446 announce_ipfs_content(root_cid) 447 else: 448 root_cid = compute_cid(html_bytes) 449 root_pinned = False 450 451 root_blob = upload_blob(session, html_bytes, "text/html") 452 if not root_blob: 453 print(" ERROR: failed to upload root page blob, aborting") 454 sys.exit(1) 455 456 print(f" root cid: {root_cid} (pinned: {root_pinned})") 457 458 # Build MASL resources map (CID strings) and blobs map (ATProto blob refs) 459 # MASL spec: src should be a CID link, not a blob ref 460 resources_map = { 461 "/": { 462 "src": root_cid, 463 "content-type": "text/html", 464 } 465 } 466 blobs_map = { 467 "/": root_blob, 468 } 469 470 # Process subresources 471 total_pinned = 1 if root_pinned else 0 472 total_blobs = 1 473 total_bytes = html_size 474 failed_blobs = 0 475 476 for path, (res_bytes, content_type) in resources.items(): 477 total_bytes += len(res_bytes) 478 479 # Pin to IPFS 480 if not no_ipfs: 481 res_cid, res_pinned = pin_to_ipfs(res_bytes, 482 filename=path.split("/")[-1] or "resource", 483 content_type=content_type) 484 if res_pinned: 485 total_pinned += 1 486 announce_ipfs_content(res_cid) 487 else: 488 res_cid = compute_cid(res_bytes) 489 490 # Upload blob to PDS 491 blob_ref = upload_blob(session, res_bytes, content_type) 492 if blob_ref: 493 total_blobs += 1 494 resources_map[path] = { 495 "src": res_cid, 496 "content-type": content_type, 497 } 498 blobs_map[path] = blob_ref 499 else: 500 failed_blobs += 1 501 print(f" blob failed: {path}") 502 503 print(f"\n resources: {len(resources_map)} ({total_blobs} blobs, {total_pinned} pinned)") 504 print(f" total size: {total_bytes:,} bytes") 505 if failed_blobs: 506 print(f" failed blobs: {failed_blobs}") 507 508 # Build archive metadata (namespaced per MASL spec) 509 now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") 510 archive_meta = { 511 "url": url, 512 "capturedAt": now, 513 "title": title, 514 "wordCount": word_count, 515 "totalSize": total_bytes, 516 "resourceCount": len(resources_map), 517 "rootIpfsCid": root_cid, 518 "contentHash": f"sha256:{hash_content(html_bytes)}", 519 "pinned": total_pinned > 0, 520 } 521 if final_url != url: 522 archive_meta["finalUrl"] = final_url 523 524 # Create bundle record 525 print(f"\ncreating archive bundle record...") 526 bundle_name = title or urlparse(url).netloc 527 result = create_bundle_record(session, bundle_name, url, 528 resources_map, blobs_map, 529 now, archive_meta=archive_meta) 530 uri = result.get("uri", "") 531 rkey = uri.split("/")[-1] if uri else "?" 532 533 print(f"\nbundled! {uri}") 534 print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_BUNDLE}/{rkey}") 535 print(f" resources: {len(resources_map)} files, {total_bytes:,} bytes total") 536 print(f" root IPFS: ipfs://{root_cid}") 537 return uri 538 539 540def cmd_list(limit, collection=None): 541 """List recent captures from either or both collections.""" 542 session = get_session() 543 all_records = [] 544 545 collections = ([COLLECTION_CAPTURE, COLLECTION_BUNDLE] 546 if not collection else [collection]) 547 548 for coll in collections: 549 try: 550 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords", 551 headers={"Authorization": f"Bearer {session['accessJwt']}"}, 552 params={ 553 "repo": session["did"], 554 "collection": coll, 555 "limit": limit, 556 "reverse": True, 557 }) 558 resp.raise_for_status() 559 for rec in resp.json().get("records", []): 560 rec["_collection"] = coll 561 all_records.append(rec) 562 except: 563 pass 564 565 if not all_records: 566 print("no captures yet") 567 return 568 569 # Sort by captured time (newest first) 570 def sort_key(r): 571 val = r.get("value", {}) 572 meta = val.get("systems.witchcraft.archive", {}) 573 return meta.get("capturedAt", val.get("capturedAt", "")) 574 all_records.sort(key=sort_key, reverse=True) 575 576 for rec in all_records[:limit]: 577 val = rec.get("value", {}) 578 rkey = rec["uri"].split("/")[-1] 579 coll = rec["_collection"] 580 is_bundle = coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY) 581 582 if is_bundle: 583 meta = val.get("systems.witchcraft.archive", {}) 584 title = val.get("name", meta.get("title", "(untitled)"))[:60] 585 url = meta.get("url", "")[:60] 586 captured = meta.get("capturedAt", "")[:19] 587 cid = meta.get("rootIpfsCid", "") 588 res_count = meta.get("resourceCount", len(val.get("resources", {}))) 589 total_size = meta.get("totalSize", 0) 590 tag = f"[BUNDLE {res_count} files, {total_size:,}b]" 591 else: 592 title = val.get("title", "(untitled)")[:60] 593 url = val.get("url", "")[:60] 594 captured = val.get("capturedAt", "")[:19] 595 cid = val.get("ipfsCid", val.get("cid", "")) 596 tag = "[single]" 597 598 print(f" [{rkey}] {captured} {tag}") 599 print(f" {title}") 600 print(f" {url}") 601 if cid: 602 print(f" ipfs://{cid}") 603 print() 604 605 606def cmd_verify(rkey): 607 """Verify a capture's hash against current page content.""" 608 session = get_session() 609 610 # Try all collections 611 rec = None 612 for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]: 613 try: 614 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord", 615 headers={"Authorization": f"Bearer {session['accessJwt']}"}, 616 params={"repo": session["did"], "collection": coll, "rkey": rkey}) 617 resp.raise_for_status() 618 rec = resp.json() 619 rec["_collection"] = coll 620 break 621 except: 622 continue 623 624 if not rec: 625 print(f"capture {rkey} not found in any collection") 626 return 627 628 val = rec.get("value", {}) 629 is_bundle = rec["_collection"] in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY) 630 631 if is_bundle: 632 meta = val.get("systems.witchcraft.archive", {}) 633 url = meta.get("finalUrl", meta.get("url", "")) 634 stored_cid = meta.get("rootIpfsCid", "") 635 stored_hash = meta.get("contentHash", "") 636 captured_at = meta.get("capturedAt", "") 637 else: 638 url = val.get("finalUrl", val.get("url", "")) 639 stored_cid = val.get("ipfsCid", val.get("cid", "")) 640 stored_hash = val.get("contentHash", "") 641 captured_at = val.get("capturedAt", "") 642 643 print(f"verifying capture {rkey} ({'bundle' if is_bundle else 'single'})...") 644 print(f" url: {url}") 645 print(f" captured: {captured_at}") 646 if stored_cid: 647 print(f" stored cid: {stored_cid}") 648 print(f" stored hash: {stored_hash}") 649 650 print(f"\nre-fetching {url}...") 651 html_bytes, _, _, _, _ = fetch_page(url) 652 current_hash = f"sha256:{hash_content(html_bytes)}" 653 current_cid = compute_cid(html_bytes) 654 655 if stored_cid: 656 print(f" current cid: {current_cid}") 657 print(f" current hash: {current_hash}") 658 659 match = (stored_cid == current_cid) if stored_cid else (stored_hash == current_hash) 660 if match: 661 print("\n MATCH - page content unchanged since capture") 662 else: 663 print("\n MISMATCH - page has changed since capture!") 664 print(" (this is expected for dynamic pages)") 665 666 667def cmd_search(query): 668 """Search captures by URL or title.""" 669 session = get_session() 670 query_lower = query.lower() 671 matches = [] 672 673 for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]: 674 try: 675 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords", 676 headers={"Authorization": f"Bearer {session['accessJwt']}"}, 677 params={"repo": session["did"], "collection": coll, "limit": 100}) 678 resp.raise_for_status() 679 for rec in resp.json().get("records", []): 680 val = rec.get("value", {}) 681 if coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY): 682 meta = val.get("systems.witchcraft.archive", {}) 683 url = meta.get("url", "").lower() 684 title = val.get("name", meta.get("title", "")).lower() 685 else: 686 url = val.get("url", "").lower() 687 title = val.get("title", "").lower() 688 if query_lower in url or query_lower in title: 689 rec["_collection"] = coll 690 matches.append(rec) 691 except: 692 pass 693 694 if not matches: 695 print(f"no captures matching '{query}'") 696 return 697 698 print(f"found {len(matches)} capture(s):") 699 for rec in matches: 700 val = rec.get("value", {}) 701 rkey = rec["uri"].split("/")[-1] 702 coll = rec["_collection"] 703 is_bundle = coll == COLLECTION_BUNDLE 704 705 if is_bundle: 706 meta = val.get("systems.witchcraft.archive", {}) 707 title = val.get("name", meta.get("title", ""))[:60] 708 url = meta.get("url", "") 709 res_count = meta.get("resourceCount", 0) 710 print(f" [{rkey}] [BUNDLE {res_count} files] {title}") 711 else: 712 title = val.get("title", "")[:60] 713 url = val.get("url", "") 714 print(f" [{rkey}] {title}") 715 print(f" {url}") 716 print() 717 718 719# --- main --- 720 721def main(): 722 parser = argparse.ArgumentParser( 723 description="Archive web pages to ATProto with IPFS pinning and MASL bundles") 724 parser.add_argument("url", nargs="?", help="URL to archive") 725 parser.add_argument("--bundle", action="store_true", 726 help="Archive URL + subresources as MASL bundle") 727 parser.add_argument("--list", action="store_true", help="List recent captures") 728 parser.add_argument("--limit", type=int, default=20, help="Number of captures to list") 729 parser.add_argument("--verify", metavar="RKEY", 730 help="Verify a capture's hash against current page") 731 parser.add_argument("--search", metavar="QUERY", 732 help="Search captures by URL or title") 733 parser.add_argument("--no-ipfs", action="store_true", 734 help="Skip IPFS pinning (compute CID locally only)") 735 parser.add_argument("--no-blob", action="store_true", 736 help="Skip PDS blob upload (single mode only)") 737 parser.add_argument("--max-resources", type=int, default=MAX_SUBRESOURCES, 738 help=f"Max subresources to fetch for bundles (default: {MAX_SUBRESOURCES})") 739 740 args = parser.parse_args() 741 742 if args.list: 743 cmd_list(args.limit) 744 elif args.verify: 745 cmd_verify(args.verify) 746 elif args.search: 747 cmd_search(args.search) 748 elif args.url: 749 if args.bundle: 750 cmd_bundle(args.url, no_ipfs=args.no_ipfs, max_resources=args.max_resources) 751 else: 752 cmd_archive(args.url, no_ipfs=args.no_ipfs, no_blob=args.no_blob) 753 else: 754 parser.print_help() 755 756 757if __name__ == "__main__": 758 main()