web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.

initial commit: web_archive.py with MASL bundle mode

Kira · 47d64f07 · +797

README.md (+67)
# web-archive

web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.

## what it does

- **single mode**: archives a single HTML page with its CID stored as a `systems.witchcraft.archive.capture` ATProto record
- **bundle mode**: archives a page + all subresources (CSS, JS, images, fonts) as a MASL bundle (`ing.dasl.masl` record) — each resource gets its own content-addressed blob on your PDS
- **CSS url() scanning**: follows `@import` and `url()` references in stylesheets to capture fonts, background images, etc.
- **IPFS pinning**: optionally pins the HTML to IPFS via a local kubo node
- **PDS blob storage**: uploads all resources as PDS blobs with proper content-type headers

## usage

```bash
# single page archive
python web_archive.py https://example.com

# bundle mode (page + all subresources)
python web_archive.py https://example.com --bundle

# bundle with resource limit
python web_archive.py https://example.com --bundle --max-resources 50

# skip IPFS pinning
python web_archive.py https://example.com --no-ipfs

# list all archives
python web_archive.py --list

# search archives
python web_archive.py --search "example"

# verify archive integrity
python web_archive.py --verify <rkey>
```

## auth

set these environment variables:

```bash
export ATP_PDS_URL=https://your.pds.example.com
export ATP_HANDLE=your.handle
export ATP_PASSWORD=your-app-password
```

## dependencies

```bash
pip install requests multiformats
```

optional: a local kubo (IPFS) node with its HTTP API on 127.0.0.1:5001, used for pinning

## record types

- `systems.witchcraft.archive.capture` — single page captures
- `ing.dasl.masl` — MASL bundle mode records (see [dasl.ing/masl.html](https://dasl.ing/masl.html))

an illustrative capture record is shown at the end of this README.

## viewer

archived pages can be viewed with the [archive viewer](https://sites.wisp.place/kira.pds.witchcraft.systems/archive-viewer/) hosted on wisp.place.

## license

MIT
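## example record

a single-mode capture, shown as the Python dict that `create_capture` builds. field names are exactly those written by `web_archive.py`; the values here are placeholders, not real CIDs:

```python
# illustrative single-mode capture record (placeholder values)
{
    "$type": "systems.witchcraft.archive.capture",
    "url": "https://example.com",
    "capturedAt": "2024-01-01T00:00:00Z",
    "ipfsCid": "bafkrei...",
    "contentHash": "sha256:9f86d081...",
    "title": "Example Domain",
    "wordCount": 28,
    "htmlSize": 1256,
    "pinned": True,
    "htmlBlob": {"$type": "blob", "ref": {"$link": "bafkrei..."}, "mimeType": "text/html", "size": 1256},
}
# finalUrl is added only when the fetch was redirected;
# htmlBlob only when the PDS blob upload succeeded
```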
web_archive.py (+730)
```python
#!/usr/bin/env python3
"""
web_archive.py - Capture and archive web pages to ATProto with IPFS pinning.

Creates signed, timestamped records of web page captures on your PDS.
Each capture includes an IPFS CID (content-addressed identifier) and a PDS blob.

Supports two modes:
- Single capture: archive one URL as a `systems.witchcraft.archive.capture` record
- Bundle capture: archive a page + all its subresources (CSS, JS, images) as an
  `ing.dasl.masl` Bundle Mode record (MASL spec: https://dasl.ing/masl.html)

Usage:
    python web_archive.py <url>               # Archive a URL (single capture)
    python web_archive.py <url> --bundle      # Archive URL + subresources as MASL bundle
    python web_archive.py --list [--limit N]  # List recent captures
    python web_archive.py --verify <rkey>     # Re-fetch and verify CID
    python web_archive.py --search <query>    # Search captures by URL/title
"""

import argparse
import hashlib
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urljoin, urlparse

import requests
from multiformats import CID, multihash


# --- config ---

PDS = os.environ.get("ATP_PDS_URL", "https://bsky.social")
ATP_HANDLE = os.environ.get("ATP_HANDLE", "")
ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "")
COLLECTION_CAPTURE = "systems.witchcraft.archive.capture"
COLLECTION_MASL = "ing.dasl.masl"

# Max subresources to fetch per bundle (safety limit)
MAX_SUBRESOURCES = 100
# Max size per individual resource (10MB)
MAX_RESOURCE_SIZE = 10 * 1024 * 1024
# Allowed subresource schemes
ALLOWED_SCHEMES = {"http", "https"}


# --- auth ---

def get_session():
    handle = ATP_HANDLE
    password = ATP_PASSWORD

    if not handle or not password:
        print("error: set ATP_HANDLE and ATP_PASSWORD environment variables")
        print("  export ATP_PDS_URL=https://your.pds.example.com")
        print("  export ATP_HANDLE=your.handle")
        print("  export ATP_PASSWORD=your-app-password")
        sys.exit(1)

    resp = requests.post(f"{PDS}/xrpc/com.atproto.server.createSession",
                         json={"identifier": handle, "password": password},
                         timeout=30)
    resp.raise_for_status()
    return resp.json()


# --- fetch ---

def fetch_page(url):
    """Fetch a URL and return (html_bytes, title, final_url, word_count, status_code)."""
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    resp = requests.get(url, headers=headers, timeout=30, allow_redirects=True)
    resp.raise_for_status()
    html_bytes = resp.content
    html = html_bytes.decode("utf-8", errors="replace")
    final_url = resp.url

    # Extract title and decode HTML entities
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    title = title_match.group(1).strip() if title_match else ""
    title = unescape(title)[:256]

    # Word count from text extraction
    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    word_count = len(text.split())

    return html_bytes, title, final_url, word_count, resp.status_code

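# Illustrative only (hypothetical values): fetch_page returns a 5-tuple, e.g.
#   html_bytes, title, final_url, words, status = fetch_page("https://example.com")
#   # -> (b"<!doctype html>...", "Example Domain", "https://example.com/", 28, 200)
# Redirects change final_url, which is then used as the base for resolving
# relative subresource URLs.
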
def fetch_resource(url):
    """Fetch a subresource. Returns (bytes, content_type, final_url) or None on failure."""
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True,
                            stream=True)
        resp.raise_for_status()

        # Check the declared size before reading the body; re-check after,
        # since content-length is optional and unverified
        content_length = resp.headers.get("content-length")
        if content_length and int(content_length) > MAX_RESOURCE_SIZE:
            print(f"  skip (too large: {int(content_length):,} bytes): {url[:80]}")
            return None

        content = resp.content
        if len(content) > MAX_RESOURCE_SIZE:
            print(f"  skip (too large: {len(content):,} bytes): {url[:80]}")
            return None

        content_type = resp.headers.get("content-type", "application/octet-stream")
        # Strip charset/params from content-type for MASL
        content_type = content_type.split(";")[0].strip()
        return content, content_type, resp.url
    except Exception as e:
        print(f"  skip (error: {e}): {url[:80]}")
        return None


def extract_subresource_urls(html, base_url):
    """Extract URLs of subresources (CSS, JS, images, fonts) from HTML."""
    urls = {}  # url -> expected type hint

    # CSS: <link rel="stylesheet" href="..."> (attribute order varies, so try both)
    for m in re.finditer(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "text/css"
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\']stylesheet["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "text/css"

    # JS: <script src="...">
    for m in re.finditer(r'<script[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "application/javascript"

    # Images: <img src="...">, <source srcset="...">
    for m in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "image/*"
    for m in re.finditer(r'<source[^>]+srcset=["\']([^"\']+)["\']', html, re.IGNORECASE):
        # srcset can have multiple URLs with widths; guard against empty entries
        for part in m.group(1).split(","):
            tokens = part.strip().split()
            if tokens:
                urls[tokens[0]] = "image/*"

    # Favicons and other <link> with href
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]*>', html, re.IGNORECASE):
        href = m.group(1)
        if href not in urls:
            # Check if it's an icon or other resource
            tag = m.group(0).lower()
            if 'rel="icon"' in tag or "rel='icon'" in tag or 'rel="apple-touch-icon"' in tag:
                urls[href] = "image/*"

    # CSS @import and url() in inline <style>
    for style_match in re.finditer(r'<style[^>]*>(.*?)</style>', html, re.DOTALL | re.IGNORECASE):
        style_content = style_match.group(1)
        for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', style_content):
            url_val = m.group(1)
            if not url_val.startswith("data:"):
                urls[url_val] = "application/octet-stream"
        for m in re.finditer(r'@import\s+["\']([^"\']+)["\']', style_content):
            urls[m.group(1)] = "text/css"

    # Resolve relative URLs and filter
    resolved = {}
    base_parsed = urlparse(base_url)
    for url, type_hint in urls.items():
        if url.startswith("data:") or url.startswith("#") or url.startswith("javascript:"):
            continue
        absolute = urljoin(base_url, url)
        parsed = urlparse(absolute)
        # Only same-origin resources (MASL paths must start with /)
        if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc:
            path = parsed.path or "/"
            if path not in resolved:
                resolved[path] = (absolute, type_hint)

    return resolved

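# Illustrative only: extract_subresource_urls returns same-origin paths mapped
# to (absolute_url, type_hint) pairs, e.g.
#   {"/css/site.css": ("https://example.com/css/site.css", "text/css"),
#    "/logo.png":     ("https://example.com/logo.png", "image/*")}
# The path keys double as MASL resource keys later on.
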
def compute_cid(content_bytes):
    """Compute IPFS CIDv1 (raw codec, sha2-256) for content bytes."""
    digest = hashlib.sha256(content_bytes).digest()
    mh = multihash.wrap(digest, "sha2-256")
    cid = CID("base32", 1, "raw", mh)
    return str(cid)


def hash_content(content_bytes):
    """SHA-256 hash of content bytes."""
    return hashlib.sha256(content_bytes).hexdigest()


def pin_to_ipfs(content_bytes, filename="capture.html", content_type="text/html"):
    """Pin content to local IPFS node. Returns (cid, pinned)."""
    try:
        # raw-leaves=true so kubo's CID matches the raw CIDv1 from compute_cid()
        # for single-block content
        resp = requests.post("http://127.0.0.1:5001/api/v0/add",
                             files={"file": (filename, content_bytes, content_type)},
                             params={"pin": "true", "cid-version": "1", "raw-leaves": "true"},
                             timeout=30)
        resp.raise_for_status()
        data = resp.json()
        return data["Hash"], True
    except Exception:
        # No local node (or add failed): fall back to a locally computed CID
        return compute_cid(content_bytes), False


def upload_blob(session, content_bytes, content_type="text/html"):
    """Upload content as a blob to the PDS. Returns blob ref."""
    try:
        resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.uploadBlob",
                             headers={
                                 "Authorization": f"Bearer {session['accessJwt']}",
                                 "Content-Type": content_type,
                             },
                             data=content_bytes,
                             timeout=60)
        resp.raise_for_status()
        return resp.json().get("blob")
    except Exception as e:
        print(f"  warning: blob upload failed ({e})")
        return None


def announce_ipfs_content(cid):
    """Announce content to the IPFS DHT so it's discoverable on public gateways."""
    try:
        resp = requests.post("http://127.0.0.1:5001/api/v0/routing/provide",
                             params={"arg": cid},
                             timeout=60)
        return resp.status_code == 200
    except Exception:
        return False

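# Illustrative only: uploadBlob responds with a blob ref roughly shaped like
#   {"$type": "blob",
#    "ref": {"$link": "bafkrei..."},
#    "mimeType": "text/html",
#    "size": 12345}
# The ref is stored verbatim in record fields such as htmlBlob and MASL "src".
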
# --- atproto records ---

def create_capture(session, url, final_url, title, cid, content_hash, word_count,
                   html_size, pinned=False, blob_ref=None):
    """Create a single-page archive capture record."""
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    record = {
        "$type": COLLECTION_CAPTURE,
        "url": url,
        "capturedAt": now,
        "ipfsCid": cid,
        "contentHash": f"sha256:{content_hash}",
        "title": title,
        "wordCount": word_count,
        "htmlSize": html_size,
        "pinned": pinned,
    }

    if final_url != url:
        record["finalUrl"] = final_url
    if blob_ref:
        record["htmlBlob"] = blob_ref

    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
                         headers={"Authorization": f"Bearer {session['accessJwt']}"},
                         json={
                             "repo": session["did"],
                             "collection": COLLECTION_CAPTURE,
                             "record": record,
                         },
                         timeout=30)
    resp.raise_for_status()
    return resp.json()

{uri}") 348 + print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_CAPTURE}/{rkey}") 349 + return uri 350 + 351 + 352 + def cmd_bundle(url, no_ipfs=False, max_resources=MAX_SUBRESOURCES): 353 + """Archive a URL + all its subresources as a MASL bundle.""" 354 + print(f"fetching {url}...") 355 + html_bytes, title, final_url, word_count, status = fetch_page(url) 356 + html = html_bytes.decode("utf-8", errors="replace") 357 + html_size = len(html_bytes) 358 + 359 + print(f" title: {title[:80]}") 360 + print(f" size: {html_size:,} bytes, ~{word_count:,} words") 361 + 362 + # Extract subresource URLs 363 + print(f"\nscanning for subresources...") 364 + subresource_urls = extract_subresource_urls(html, final_url) 365 + print(f" found {len(subresource_urls)} same-origin subresources") 366 + 367 + if len(subresource_urls) > max_resources: 368 + print(f" capping at {max_resources} (use --max-resources to change)") 369 + # Keep only the first N 370 + subresource_urls = dict(list(subresource_urls.items())[:max_resources]) 371 + 372 + # Fetch all subresources 373 + print(f"\nfetching subresources...") 374 + resources = {} # path -> (bytes, content_type, ipfs_cid) 375 + 376 + for path, (abs_url, type_hint) in subresource_urls.items(): 377 + result = fetch_resource(abs_url) 378 + if result: 379 + res_bytes, content_type, _ = result 380 + print(f" ok ({len(res_bytes):,}b {content_type}): {path}") 381 + resources[path] = (res_bytes, content_type) 382 + 383 + # Phase 2: Scan fetched CSS files for url() references (fonts, images, etc) 384 + css_extra_urls = {} 385 + for path, (res_bytes, content_type) in list(resources.items()): 386 + if "css" in content_type: 387 + css_text = res_bytes.decode("utf-8", errors="replace") 388 + css_base = urljoin(final_url, path) 389 + for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', css_text): 390 + url_val = m.group(1) 391 + if url_val.startswith("data:"): 392 + continue 393 + abs_url = urljoin(css_base, url_val) 394 + parsed = urlparse(abs_url) 395 + base_parsed = urlparse(final_url) 396 + if parsed.scheme in ALLOWED_SCHEMES: 397 + css_path = parsed.path or "/" 398 + if css_path not in resources and css_path not in css_extra_urls: 399 + css_extra_urls[css_path] = (abs_url, "application/octet-stream") 400 + 401 + if css_extra_urls: 402 + print(f"\n found {len(css_extra_urls)} extra resources from CSS url() refs") 403 + for path, (abs_url, _) in css_extra_urls.items(): 404 + if len(resources) >= max_resources: 405 + break 406 + result = fetch_resource(abs_url) 407 + if result: 408 + res_bytes, content_type, _ = result 409 + print(f" ok ({len(res_bytes):,}b {content_type}): {path}") 410 + resources[path] = (res_bytes, content_type) 411 + 412 + print(f"\n fetched {len(resources)} total subresources") 413 + 414 + # Get session for blob uploads 415 + session = get_session() 416 + 417 + # Process root page 418 + print(f"\nprocessing root page...") 419 + if not no_ipfs: 420 + root_cid, root_pinned = pin_to_ipfs(html_bytes) 421 + if root_pinned: 422 + announce_ipfs_content(root_cid) 423 + else: 424 + root_cid = compute_cid(html_bytes) 425 + root_pinned = False 426 + 427 + root_blob = upload_blob(session, html_bytes, "text/html") 428 + if not root_blob: 429 + print(" ERROR: failed to upload root page blob, aborting") 430 + sys.exit(1) 431 + 432 + print(f" root cid: {root_cid} (pinned: {root_pinned})") 433 + 434 + # Build MASL resources map 435 + # Per MASL spec: keys are paths starting with /, values have src (blob ref) + content-type 436 + resources_map = { 437 + 
"/": { 438 + "src": root_blob, 439 + "content-type": "text/html", 440 + } 441 + } 442 + 443 + # Process subresources 444 + total_pinned = 1 if root_pinned else 0 445 + total_blobs = 1 446 + total_bytes = html_size 447 + failed_blobs = 0 448 + 449 + for path, (res_bytes, content_type) in resources.items(): 450 + total_bytes += len(res_bytes) 451 + 452 + # Pin to IPFS 453 + if not no_ipfs: 454 + res_cid, res_pinned = pin_to_ipfs(res_bytes, 455 + filename=path.split("/")[-1] or "resource", 456 + content_type=content_type) 457 + if res_pinned: 458 + total_pinned += 1 459 + announce_ipfs_content(res_cid) 460 + else: 461 + res_cid = compute_cid(res_bytes) 462 + 463 + # Upload blob to PDS 464 + blob_ref = upload_blob(session, res_bytes, content_type) 465 + if blob_ref: 466 + total_blobs += 1 467 + entry = { 468 + "src": blob_ref, 469 + "content-type": content_type, 470 + } 471 + resources_map[path] = entry 472 + else: 473 + failed_blobs += 1 474 + print(f" blob failed: {path}") 475 + 476 + print(f"\n resources: {len(resources_map)} ({total_blobs} blobs, {total_pinned} pinned)") 477 + print(f" total size: {total_bytes:,} bytes") 478 + if failed_blobs: 479 + print(f" failed blobs: {failed_blobs}") 480 + 481 + # Build archive metadata (namespaced per MASL spec) 482 + now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") 483 + archive_meta = { 484 + "url": url, 485 + "capturedAt": now, 486 + "title": title, 487 + "wordCount": word_count, 488 + "totalSize": total_bytes, 489 + "resourceCount": len(resources_map), 490 + "rootIpfsCid": root_cid, 491 + "contentHash": f"sha256:{hash_content(html_bytes)}", 492 + "pinned": total_pinned > 0, 493 + } 494 + if final_url != url: 495 + archive_meta["finalUrl"] = final_url 496 + 497 + # Create MASL bundle record 498 + print(f"\ncreating MASL bundle record...") 499 + bundle_name = title or urlparse(url).netloc 500 + result = create_masl_bundle(session, bundle_name, url, resources_map, 501 + now, archive_meta=archive_meta) 502 + uri = result.get("uri", "") 503 + rkey = uri.split("/")[-1] if uri else "?" 504 + 505 + print(f"\nbundled! 
{uri}") 506 + print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_MASL}/{rkey}") 507 + print(f" resources: {len(resources_map)} files, {total_bytes:,} bytes total") 508 + print(f" root IPFS: ipfs://{root_cid}") 509 + return uri 510 + 511 + 512 + def cmd_list(limit, collection=None): 513 + """List recent captures from either or both collections.""" 514 + session = get_session() 515 + all_records = [] 516 + 517 + collections = [COLLECTION_CAPTURE, COLLECTION_MASL] if not collection else [collection] 518 + 519 + for coll in collections: 520 + try: 521 + resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords", 522 + headers={"Authorization": f"Bearer {session['accessJwt']}"}, 523 + params={ 524 + "repo": session["did"], 525 + "collection": coll, 526 + "limit": limit, 527 + "reverse": True, 528 + }) 529 + resp.raise_for_status() 530 + for rec in resp.json().get("records", []): 531 + rec["_collection"] = coll 532 + all_records.append(rec) 533 + except: 534 + pass 535 + 536 + if not all_records: 537 + print("no captures yet") 538 + return 539 + 540 + # Sort by captured time (newest first) 541 + def sort_key(r): 542 + val = r.get("value", {}) 543 + # MASL bundles store capturedAt in namespaced metadata 544 + meta = val.get("systems.witchcraft.archive", {}) 545 + return meta.get("capturedAt", val.get("capturedAt", "")) 546 + all_records.sort(key=sort_key, reverse=True) 547 + 548 + for rec in all_records[:limit]: 549 + val = rec.get("value", {}) 550 + rkey = rec["uri"].split("/")[-1] 551 + coll = rec["_collection"] 552 + is_bundle = coll == COLLECTION_MASL 553 + 554 + if is_bundle: 555 + meta = val.get("systems.witchcraft.archive", {}) 556 + title = val.get("name", meta.get("title", "(untitled)"))[:60] 557 + url = meta.get("url", "")[:60] 558 + captured = meta.get("capturedAt", "")[:19] 559 + cid = meta.get("rootIpfsCid", "") 560 + res_count = meta.get("resourceCount", len(val.get("resources", {}))) 561 + total_size = meta.get("totalSize", 0) 562 + tag = f"[BUNDLE {res_count} files, {total_size:,}b]" 563 + else: 564 + title = val.get("title", "(untitled)")[:60] 565 + url = val.get("url", "")[:60] 566 + captured = val.get("capturedAt", "")[:19] 567 + cid = val.get("ipfsCid", val.get("cid", "")) 568 + tag = "[single]" 569 + 570 + print(f" [{rkey}] {captured} {tag}") 571 + print(f" {title}") 572 + print(f" {url}") 573 + if cid: 574 + print(f" ipfs://{cid}") 575 + print() 576 + 577 + 578 + def cmd_verify(rkey): 579 + """Verify a capture's hash against current page content.""" 580 + session = get_session() 581 + 582 + # Try capture collection first, then MASL 583 + rec = None 584 + for coll in [COLLECTION_CAPTURE, COLLECTION_MASL]: 585 + try: 586 + resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord", 587 + headers={"Authorization": f"Bearer {session['accessJwt']}"}, 588 + params={"repo": session["did"], "collection": coll, "rkey": rkey}) 589 + resp.raise_for_status() 590 + rec = resp.json() 591 + rec["_collection"] = coll 592 + break 593 + except: 594 + continue 595 + 596 + if not rec: 597 + print(f"capture {rkey} not found in any collection") 598 + return 599 + 600 + val = rec.get("value", {}) 601 + is_bundle = rec["_collection"] == COLLECTION_MASL 602 + 603 + if is_bundle: 604 + meta = val.get("systems.witchcraft.archive", {}) 605 + url = meta.get("finalUrl", meta.get("url", "")) 606 + stored_cid = meta.get("rootIpfsCid", "") 607 + stored_hash = meta.get("contentHash", "") 608 + captured_at = meta.get("capturedAt", "") 609 + else: 610 + url = val.get("finalUrl", 
val.get("url", "")) 611 + stored_cid = val.get("ipfsCid", val.get("cid", "")) 612 + stored_hash = val.get("contentHash", "") 613 + captured_at = val.get("capturedAt", "") 614 + 615 + print(f"verifying capture {rkey} ({'bundle' if is_bundle else 'single'})...") 616 + print(f" url: {url}") 617 + print(f" captured: {captured_at}") 618 + if stored_cid: 619 + print(f" stored cid: {stored_cid}") 620 + print(f" stored hash: {stored_hash}") 621 + 622 + print(f"\nre-fetching {url}...") 623 + html_bytes, _, _, _, _ = fetch_page(url) 624 + current_hash = f"sha256:{hash_content(html_bytes)}" 625 + current_cid = compute_cid(html_bytes) 626 + 627 + if stored_cid: 628 + print(f" current cid: {current_cid}") 629 + print(f" current hash: {current_hash}") 630 + 631 + match = (stored_cid == current_cid) if stored_cid else (stored_hash == current_hash) 632 + if match: 633 + print("\n MATCH - page content unchanged since capture") 634 + else: 635 + print("\n MISMATCH - page has changed since capture!") 636 + print(" (this is expected for dynamic pages)") 637 + 638 + 639 + def cmd_search(query): 640 + """Search captures by URL or title.""" 641 + session = get_session() 642 + query_lower = query.lower() 643 + matches = [] 644 + 645 + for coll in [COLLECTION_CAPTURE, COLLECTION_MASL]: 646 + try: 647 + resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords", 648 + headers={"Authorization": f"Bearer {session['accessJwt']}"}, 649 + params={"repo": session["did"], "collection": coll, "limit": 100}) 650 + resp.raise_for_status() 651 + for rec in resp.json().get("records", []): 652 + val = rec.get("value", {}) 653 + if coll == COLLECTION_MASL: 654 + meta = val.get("systems.witchcraft.archive", {}) 655 + url = meta.get("url", "").lower() 656 + title = val.get("name", meta.get("title", "")).lower() 657 + else: 658 + url = val.get("url", "").lower() 659 + title = val.get("title", "").lower() 660 + if query_lower in url or query_lower in title: 661 + rec["_collection"] = coll 662 + matches.append(rec) 663 + except: 664 + pass 665 + 666 + if not matches: 667 + print(f"no captures matching '{query}'") 668 + return 669 + 670 + print(f"found {len(matches)} capture(s):") 671 + for rec in matches: 672 + val = rec.get("value", {}) 673 + rkey = rec["uri"].split("/")[-1] 674 + coll = rec["_collection"] 675 + is_bundle = coll == COLLECTION_MASL 676 + 677 + if is_bundle: 678 + meta = val.get("systems.witchcraft.archive", {}) 679 + title = val.get("name", meta.get("title", ""))[:60] 680 + url = meta.get("url", "") 681 + res_count = meta.get("resourceCount", 0) 682 + print(f" [{rkey}] [BUNDLE {res_count} files] {title}") 683 + else: 684 + title = val.get("title", "")[:60] 685 + url = val.get("url", "") 686 + print(f" [{rkey}] {title}") 687 + print(f" {url}") 688 + print() 689 + 690 + 691 + # --- main --- 692 + 693 + def main(): 694 + parser = argparse.ArgumentParser( 695 + description="Archive web pages to ATProto with IPFS pinning and MASL bundles") 696 + parser.add_argument("url", nargs="?", help="URL to archive") 697 + parser.add_argument("--bundle", action="store_true", 698 + help="Archive URL + subresources as MASL bundle") 699 + parser.add_argument("--list", action="store_true", help="List recent captures") 700 + parser.add_argument("--limit", type=int, default=20, help="Number of captures to list") 701 + parser.add_argument("--verify", metavar="RKEY", 702 + help="Verify a capture's hash against current page") 703 + parser.add_argument("--search", metavar="QUERY", 704 + help="Search captures by URL or title") 705 + 
parser.add_argument("--no-ipfs", action="store_true", 706 + help="Skip IPFS pinning (compute CID locally only)") 707 + parser.add_argument("--no-blob", action="store_true", 708 + help="Skip PDS blob upload (single mode only)") 709 + parser.add_argument("--max-resources", type=int, default=MAX_SUBRESOURCES, 710 + help=f"Max subresources to fetch for bundles (default: {MAX_SUBRESOURCES})") 711 + 712 + args = parser.parse_args() 713 + 714 + if args.list: 715 + cmd_list(args.limit) 716 + elif args.verify: 717 + cmd_verify(args.verify) 718 + elif args.search: 719 + cmd_search(args.search) 720 + elif args.url: 721 + if args.bundle: 722 + cmd_bundle(args.url, no_ipfs=args.no_ipfs, max_resources=args.max_resources) 723 + else: 724 + cmd_archive(args.url, no_ipfs=args.no_ipfs, no_blob=args.no_blob) 725 + else: 726 + parser.print_help() 727 + 728 + 729 + if __name__ == "__main__": 730 + main()