Web archiver with MASL bundle mode for ATProto. Captures web pages as content-addressed bundles stored on your PDS, with optional IPFS pinning.
1#!/usr/bin/env python3
2"""
3web_archive.py - Capture and archive web pages to ATProto with IPFS pinning.
4
5Creates signed, timestamped records of web page captures on your PDS.
6Each capture includes an IPFS CID (content-addressed identifier) and a PDS blob.
7
8Supports two modes:
9 - Single capture: archive one URL as a `systems.witchcraft.archive.capture` record
10 - Bundle capture: archive a page + all its subresources (CSS, JS, images) as a
11 `systems.witchcraft.archive.bundle` record containing a MASL-shaped manifest
12 with CID-addressed resources (MASL spec: https://dasl.ing/masl.html)
13
14Usage:
15 python web_archive.py <url> # Archive a URL (single capture)
16 python web_archive.py <url> --bundle # Archive URL + subresources as MASL bundle
17 python web_archive.py --list [--limit N] # List recent captures
18 python web_archive.py --verify <rkey> # Re-fetch and verify CID
19 python web_archive.py --search <query> # Search captures by URL/title
20"""
21
import argparse
import hashlib
import html
import json
import mimetypes
import os
import re
import sys
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

import requests
from multiformats import CID, multihash
34
35
# --- config ---

# Connection settings come from the environment so no credentials live in
# source control. ATP_PDS_URL defaults to the main Bluesky PDS.
PDS = os.environ.get("ATP_PDS_URL", "https://bsky.social")
ATP_HANDLE = os.environ.get("ATP_HANDLE", "")
ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "")
# Record collection NSIDs: single-page captures vs. MASL bundle records.
COLLECTION_CAPTURE = "systems.witchcraft.archive.capture"
COLLECTION_BUNDLE = "ing.dasl.masl"
# Older records used a separate MASL collection; it is now the same NSID.
COLLECTION_MASL_LEGACY = COLLECTION_BUNDLE # consolidated

# Max subresources to fetch per bundle (safety limit)
MAX_SUBRESOURCES = 100
# Max size per individual resource (10MB)
MAX_RESOURCE_SIZE = 10 * 1024 * 1024
# Allowed subresource schemes
ALLOWED_SCHEMES = {"http", "https"}
51
52
53# --- auth ---
54
def get_session():
    """Authenticate against the PDS and return the createSession JSON.

    Reads ATP_HANDLE / ATP_PASSWORD from module config; prints setup help
    and exits when either is missing. Raises on HTTP/auth failure.
    """
    if not (ATP_HANDLE and ATP_PASSWORD):
        print("error: set ATP_HANDLE and ATP_PASSWORD environment variables")
        print(" export ATP_PDS_URL=https://your.pds.example.com")
        print(" export ATP_HANDLE=your.handle")
        print(" export ATP_PASSWORD=your-app-password")
        sys.exit(1)

    response = requests.post(
        f"{PDS}/xrpc/com.atproto.server.createSession",
        json={"identifier": ATP_HANDLE, "password": ATP_PASSWORD})
    response.raise_for_status()
    return response.json()
70
71
72# --- fetch ---
73
def fetch_page(url):
    """Fetch a URL and return (html_bytes, title, final_url, word_count, status_code).

    Follows redirects. The body is decoded as UTF-8 with replacement so the
    title/word-count extraction never raises on odd encodings.
    """
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    resp = requests.get(url, headers=headers, timeout=30, allow_redirects=True)
    resp.raise_for_status()
    html_bytes = resp.content
    # Note: local renamed from `html` so it doesn't shadow the stdlib module
    html_text = html_bytes.decode("utf-8", errors="replace")
    final_url = resp.url

    # Extract <title>; decode HTML entities (&amp; etc.) properly — the old
    # hand-rolled replace() chain was a mangled no-op — then cap length
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html_text, re.IGNORECASE | re.DOTALL)
    title = title_match.group(1).strip() if title_match else ""
    title = html.unescape(title)[:256]

    # Word count from a crude text extraction: drop scripts, styles, tags
    text = re.sub(r"<script[^>]*>.*?</script>", "", html_text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    word_count = len(text.split())

    return html_bytes, title, final_url, word_count, resp.status_code
100
101
def fetch_resource(url):
    """Fetch a subresource. Returns (bytes, content_type, final_url) or None on failure.

    Enforces MAX_RESOURCE_SIZE twice: via the declared Content-Length header,
    and while streaming the body — so a server that lies about (or omits) the
    length can't make us buffer more than the cap. The original read
    resp.content despite stream=True, which downloaded the full body first.
    """
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True,
                            stream=True)
        resp.raise_for_status()

        # Fast reject when the server declares an oversized body
        content_length = resp.headers.get("content-length")
        if content_length and int(content_length) > MAX_RESOURCE_SIZE:
            print(f" skip (too large: {int(content_length):,} bytes): {url[:80]}")
            return None

        # Stream the body, bailing out as soon as the cap is exceeded
        chunks = []
        received = 0
        for chunk in resp.iter_content(chunk_size=65536):
            received += len(chunk)
            if received > MAX_RESOURCE_SIZE:
                print(f" skip (too large: {received:,} bytes): {url[:80]}")
                return None
            chunks.append(chunk)
        content = b"".join(chunks)

        content_type = resp.headers.get("content-type", "application/octet-stream")
        # Strip charset/params from content-type for MASL
        content_type = content_type.split(";")[0].strip()
        return content, content_type, resp.url
    except Exception as e:
        # Best-effort: a failed subresource is skipped, not fatal to the bundle
        print(f" skip (error: {e}): {url[:80]}")
        return None
130
131
def extract_subresource_urls(html, base_url):
    """Extract URLs of subresources (CSS, JS, images, fonts) from HTML.

    Scans link/script/img/source tags plus inline <style> blocks, resolves
    relative URLs against base_url, and keeps only same-origin resources
    (MASL paths are origin-relative, so they must start with /).

    Returns a dict: path -> (absolute_url, content_type_hint); for duplicate
    paths the first hint encountered wins.
    """
    candidates = {}  # raw url -> expected content-type hint

    # CSS: <link rel="stylesheet" href="..."> (both attribute orders)
    for m in re.finditer(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        candidates[m.group(1)] = "text/css"
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\']stylesheet["\']', html, re.IGNORECASE):
        candidates[m.group(1)] = "text/css"

    # JS: <script src="...">
    for m in re.finditer(r'<script[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        candidates[m.group(1)] = "application/javascript"

    # Images: <img src="...">, <source srcset="...">
    for m in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        candidates[m.group(1)] = "image/*"
    for m in re.finditer(r'<source[^>]+srcset=["\']([^"\']+)["\']', html, re.IGNORECASE):
        # srcset is comma-separated "url [descriptor]" entries; guard empty
        # entries (trailing comma) — the old code indexed [0] and crashed
        for part in m.group(1).split(","):
            pieces = part.strip().split()
            if pieces:
                candidates[pieces[0]] = "image/*"

    # Favicons: any other <link href=...> whose rel marks it as an icon
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]*>', html, re.IGNORECASE):
        href = m.group(1)
        if href not in candidates:
            tag = m.group(0).lower()
            if 'rel="icon"' in tag or "rel='icon'" in tag or 'rel="apple-touch-icon"' in tag:
                candidates[href] = "image/*"

    # CSS url() and @import references inside inline <style> blocks
    for style_match in re.finditer(r'<style[^>]*>(.*?)</style>', html, re.DOTALL | re.IGNORECASE):
        style_content = style_match.group(1)
        for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', style_content):
            url_val = m.group(1)
            if not url_val.startswith("data:"):
                candidates[url_val] = "application/octet-stream"
        for m in re.finditer(r'@import\s+["\']([^"\']+)["\']', style_content):
            candidates[m.group(1)] = "text/css"

    # Resolve relative URLs and filter to same-origin http(s) resources
    resolved = {}
    base_parsed = urlparse(base_url)
    for raw_url, type_hint in candidates.items():
        if raw_url.startswith(("data:", "#", "javascript:")):
            continue
        absolute = urljoin(base_url, raw_url)
        parsed = urlparse(absolute)
        if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc:
            path = parsed.path or "/"
            resolved.setdefault(path, (absolute, type_hint))

    return resolved
190
191
def compute_cid(content_bytes):
    """Compute IPFS CIDv1 (raw codec, sha2-256) for content bytes."""
    sha256_digest = hashlib.sha256(content_bytes).digest()
    wrapped = multihash.wrap(sha256_digest, "sha2-256")
    return str(CID("base32", 1, "raw", wrapped))
198
199
def hash_content(content_bytes):
    """Return the hex-encoded SHA-256 digest of content_bytes."""
    hasher = hashlib.sha256()
    hasher.update(content_bytes)
    return hasher.hexdigest()
203
204
def pin_to_ipfs(content_bytes, filename="capture.html", content_type="text/html"):
    """Pin content to local IPFS node. Returns (cid, pinned).

    Talks to a Kubo daemon on 127.0.0.1:5001; when it is unreachable or
    errors, falls back to a locally computed CID with pinned=False.

    NOTE(review): the daemon is invoked with raw-leaves=false (dag-pb CID)
    while the local fallback computes a raw-codec CID, so the two CIDs for
    the same bytes can differ — confirm which form downstream expects.
    """
    try:
        resp = requests.post("http://127.0.0.1:5001/api/v0/add",
                             files={"file": (filename, content_bytes, content_type)},
                             params={"pin": "true", "cid-version": "1", "raw-leaves": "false"},
                             timeout=30)
        resp.raise_for_status()
        data = resp.json()
        return data["Hash"], True
    except Exception:
        # Unused `as e` binding removed; best-effort fallback, not an error
        return compute_cid(content_bytes), False
217
218
def upload_blob(session, content_bytes, content_type="text/html"):
    """Upload content as a blob to the PDS. Returns blob ref (None on failure)."""
    endpoint = f"{PDS}/xrpc/com.atproto.repo.uploadBlob"
    request_headers = {
        "Authorization": f"Bearer {session['accessJwt']}",
        "Content-Type": content_type,
    }
    try:
        resp = requests.post(endpoint,
                             headers=request_headers,
                             data=content_bytes,
                             timeout=60)
        resp.raise_for_status()
        return resp.json().get("blob")
    except Exception as e:
        # Non-fatal: callers treat a missing blob ref as "upload failed"
        print(f" warning: blob upload failed ({e})")
        return None
234
235
def announce_ipfs_content(cid):
    """Announce content to IPFS DHT so it's discoverable on public gateways.

    Best-effort: returns True only when the local daemon accepted the
    routing/provide call; any failure returns False rather than raising.
    """
    try:
        resp = requests.post("http://127.0.0.1:5001/api/v0/routing/provide",
                             params={"arg": cid},
                             timeout=60)
        return resp.status_code == 200
    except Exception:
        # Narrowed from a bare except: so Ctrl-C / SystemExit still propagate
        return False
245
246
247# --- atproto records ---
248
def create_capture(session, url, final_url, title, cid, content_hash, word_count,
                   html_size, pinned=False, blob_ref=None):
    """Create a single-page archive capture record on the PDS.

    Optional fields (finalUrl, htmlBlob) are only written when meaningful:
    finalUrl when a redirect occurred, htmlBlob when the upload succeeded.
    """
    captured_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    record = {
        "$type": COLLECTION_CAPTURE,
        "url": url,
        "capturedAt": captured_at,
        "ipfsCid": cid,
        "contentHash": f"sha256:{content_hash}",
        "title": title,
        "wordCount": word_count,
        "htmlSize": html_size,
        "pinned": pinned,
    }
    if final_url != url:
        record["finalUrl"] = final_url
    if blob_ref:
        record["htmlBlob"] = blob_ref

    payload = {
        "repo": session["did"],
        "collection": COLLECTION_CAPTURE,
        "record": record,
    }
    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
                         headers={"Authorization": f"Bearer {session['accessJwt']}"},
                         json=payload)
    resp.raise_for_status()
    return resp.json()
280
281
def create_bundle_record(session, name, url, resources_map, blobs_map,
                         captured_at, archive_meta=None):
    """Create an ing.dasl.masl record on ATProto (Web Tiles format).

    Uses the MASL bundle shape with archive metadata namespaced under
    systems.witchcraft.archive.

    resources_map: dict of path -> {src: ..., content-type: str}
    blobs_map: dict of path -> blob_ref; when a path has a blob ref it
        replaces the src entry (MASL: resources[path].src)
    archive_meta: optional dict with url, capturedAt, title, etc.
    """
    # Merge blob refs into resources; blob ref wins over the CID src
    merged_resources = {}
    for path, res_data in resources_map.items():
        if isinstance(res_data, dict):
            entry = dict(res_data)
        else:
            entry = {"src": res_data}
        if path in blobs_map:
            entry["src"] = blobs_map[path]
        merged_resources[path] = entry

    record = {
        "$type": COLLECTION_BUNDLE,
        # MASL required fields at top level
        "name": name,
        "resources": merged_resources,
    }

    # Archive metadata, namespaced; defaults fall back to the call arguments
    meta = archive_meta or {}
    archive_ns = {
        "url": meta.get("url", url),
        "title": meta.get("title", name),
        "capturedAt": captured_at,
    }
    for key in ("wordCount", "totalSize", "resourceCount",
                "rootIpfsCid", "contentHash", "pinned", "finalUrl"):
        if key in meta:
            archive_ns[key] = meta[key]
    record["systems.witchcraft.archive"] = archive_ns

    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
                         headers={"Authorization": f"Bearer {session['accessJwt']}"},
                         json={
                             "repo": session["did"],
                             "collection": COLLECTION_BUNDLE,
                             "record": record,
                         })
    resp.raise_for_status()
    return resp.json()
330
331
332# --- commands ---
333
def cmd_archive(url, no_ipfs=False, no_blob=False):
    """Archive a single URL as a capture record (optionally IPFS-pinned)."""
    print(f"fetching {url}...")
    html_bytes, title, final_url, word_count, _status = fetch_page(url)
    content_hash = hash_content(html_bytes)
    html_size = len(html_bytes)

    # CID comes from the daemon when pinning, or is computed locally
    pinned = False
    if no_ipfs:
        cid = compute_cid(html_bytes)
    else:
        print(f" pinning to IPFS...")
        cid, pinned = pin_to_ipfs(html_bytes)
        if pinned:
            announce_ipfs_content(cid)

    print(f" title: {title[:80]}")
    print(f" final url: {final_url}")
    print(f" cid: {cid}")
    print(f" pinned: {'yes' if pinned else 'no (local CID only)'}")
    print(f" hash: sha256:{content_hash[:16]}...")
    print(f" size: {html_size:,} bytes, ~{word_count:,} words")

    session = get_session()

    blob_ref = None
    if not no_blob:
        print(f" uploading blob to PDS...")
        blob_ref = upload_blob(session, html_bytes)
        if blob_ref:
            print(f" blob uploaded!")

    created = create_capture(session, url, final_url, title, cid, content_hash,
                             word_count, html_size, pinned=pinned, blob_ref=blob_ref)
    uri = created.get("uri", "")
    rkey = uri.split("/")[-1] if uri else "?"

    print(f"\ncaptured! {uri}")
    print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_CAPTURE}/{rkey}")
    return uri
374
375
def cmd_bundle(url, no_ipfs=False, max_resources=MAX_SUBRESOURCES):
    """Archive a URL + all its subresources as a MASL bundle.

    Pipeline: fetch page -> discover same-origin subresources (HTML scan,
    then a second pass over fetched CSS for url() refs) -> pin/upload each
    resource -> write one MASL bundle record to the PDS.
    """
    print(f"fetching {url}...")
    html_bytes, title, final_url, word_count, _status = fetch_page(url)
    html = html_bytes.decode("utf-8", errors="replace")
    html_size = len(html_bytes)

    print(f" title: {title[:80]}")
    print(f" size: {html_size:,} bytes, ~{word_count:,} words")

    # Extract subresource URLs from the HTML
    print(f"\nscanning for subresources...")
    subresource_urls = extract_subresource_urls(html, final_url)
    print(f" found {len(subresource_urls)} same-origin subresources")

    if len(subresource_urls) > max_resources:
        print(f" capping at {max_resources} (use --max-resources to change)")
        # Keep only the first N
        subresource_urls = dict(list(subresource_urls.items())[:max_resources])

    # Fetch all subresources
    print(f"\nfetching subresources...")
    resources = {}  # path -> (bytes, content_type)

    for path, (abs_url, type_hint) in subresource_urls.items():
        result = fetch_resource(abs_url)
        if result:
            res_bytes, content_type, _ = result
            print(f" ok ({len(res_bytes):,}b {content_type}): {path}")
            resources[path] = (res_bytes, content_type)

    # Phase 2: scan fetched CSS for url() references (fonts, images, etc).
    # FIX: restrict to same-origin like extract_subresource_urls does — the
    # old code only checked the scheme, so a cross-origin URL could be keyed
    # by its bare path and collide with a same-origin MASL path.
    base_parsed = urlparse(final_url)  # loop-invariant, hoisted
    css_extra_urls = {}
    for path, (res_bytes, content_type) in list(resources.items()):
        if "css" not in content_type:
            continue
        css_text = res_bytes.decode("utf-8", errors="replace")
        css_base = urljoin(final_url, path)
        for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', css_text):
            url_val = m.group(1)
            if url_val.startswith("data:"):
                continue
            abs_url = urljoin(css_base, url_val)
            parsed = urlparse(abs_url)
            if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc:
                css_path = parsed.path or "/"
                if css_path not in resources and css_path not in css_extra_urls:
                    css_extra_urls[css_path] = (abs_url, "application/octet-stream")

    if css_extra_urls:
        print(f"\n found {len(css_extra_urls)} extra resources from CSS url() refs")
        for path, (abs_url, _) in css_extra_urls.items():
            if len(resources) >= max_resources:
                break
            result = fetch_resource(abs_url)
            if result:
                res_bytes, content_type, _ = result
                print(f" ok ({len(res_bytes):,}b {content_type}): {path}")
                resources[path] = (res_bytes, content_type)

    print(f"\n fetched {len(resources)} total subresources")

    # Session is needed for blob uploads from here on
    session = get_session()

    # Process root page: pin (or compute CID locally) and upload the blob
    print(f"\nprocessing root page...")
    if not no_ipfs:
        root_cid, root_pinned = pin_to_ipfs(html_bytes)
        if root_pinned:
            announce_ipfs_content(root_cid)
    else:
        root_cid = compute_cid(html_bytes)
        root_pinned = False

    root_blob = upload_blob(session, html_bytes, "text/html")
    if not root_blob:
        # Without the root blob the bundle is useless — hard abort
        print(" ERROR: failed to upload root page blob, aborting")
        sys.exit(1)

    print(f" root cid: {root_cid} (pinned: {root_pinned})")

    # MASL resources map carries CID strings; blobs map carries PDS blob refs
    resources_map = {
        "/": {
            "src": root_cid,
            "content-type": "text/html",
        }
    }
    blobs_map = {
        "/": root_blob,
    }

    # Process subresources
    total_pinned = 1 if root_pinned else 0
    total_blobs = 1
    total_bytes = html_size
    failed_blobs = 0

    for path, (res_bytes, content_type) in resources.items():
        total_bytes += len(res_bytes)

        # Pin to IPFS (or compute CID locally)
        if not no_ipfs:
            res_cid, res_pinned = pin_to_ipfs(res_bytes,
                                              filename=path.split("/")[-1] or "resource",
                                              content_type=content_type)
            if res_pinned:
                total_pinned += 1
                announce_ipfs_content(res_cid)
        else:
            res_cid = compute_cid(res_bytes)

        # Upload blob to PDS; resources without a blob are dropped from the map
        blob_ref = upload_blob(session, res_bytes, content_type)
        if blob_ref:
            total_blobs += 1
            resources_map[path] = {
                "src": res_cid,
                "content-type": content_type,
            }
            blobs_map[path] = blob_ref
        else:
            failed_blobs += 1
            print(f" blob failed: {path}")

    print(f"\n resources: {len(resources_map)} ({total_blobs} blobs, {total_pinned} pinned)")
    print(f" total size: {total_bytes:,} bytes")
    if failed_blobs:
        print(f" failed blobs: {failed_blobs}")

    # Archive metadata (namespaced under systems.witchcraft.archive)
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    archive_meta = {
        "url": url,
        "capturedAt": now,
        "title": title,
        "wordCount": word_count,
        "totalSize": total_bytes,
        "resourceCount": len(resources_map),
        "rootIpfsCid": root_cid,
        "contentHash": f"sha256:{hash_content(html_bytes)}",
        "pinned": total_pinned > 0,
    }
    if final_url != url:
        archive_meta["finalUrl"] = final_url

    # Create bundle record
    print(f"\ncreating archive bundle record...")
    bundle_name = title or urlparse(url).netloc
    result = create_bundle_record(session, bundle_name, url,
                                  resources_map, blobs_map,
                                  now, archive_meta=archive_meta)
    uri = result.get("uri", "")
    rkey = uri.split("/")[-1] if uri else "?"

    print(f"\nbundled! {uri}")
    print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_BUNDLE}/{rkey}")
    print(f" resources: {len(resources_map)} files, {total_bytes:,} bytes total")
    print(f" root IPFS: ipfs://{root_cid}")
    return uri
538
539
def cmd_list(limit, collection=None):
    """List recent captures from either or both collections.

    Collections that fail to list (e.g. none created yet) are skipped;
    records are merged and printed newest-first by capture time.
    """
    session = get_session()
    all_records = []

    collections = ([COLLECTION_CAPTURE, COLLECTION_BUNDLE]
                   if not collection else [collection])

    for coll in collections:
        try:
            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords",
                                headers={"Authorization": f"Bearer {session['accessJwt']}"},
                                params={
                                    "repo": session["did"],
                                    "collection": coll,
                                    "limit": limit,
                                    "reverse": True,
                                })
            resp.raise_for_status()
            for rec in resp.json().get("records", []):
                rec["_collection"] = coll
                all_records.append(rec)
        except Exception:
            # Best-effort per collection; narrowed from a bare except so
            # KeyboardInterrupt/SystemExit still propagate
            pass

    if not all_records:
        print("no captures yet")
        return

    # Sort by captured time (newest first); bundle records keep their
    # timestamp under the systems.witchcraft.archive namespace
    def sort_key(r):
        val = r.get("value", {})
        meta = val.get("systems.witchcraft.archive", {})
        return meta.get("capturedAt", val.get("capturedAt", ""))
    all_records.sort(key=sort_key, reverse=True)

    for rec in all_records[:limit]:
        val = rec.get("value", {})
        rkey = rec["uri"].split("/")[-1]
        coll = rec["_collection"]
        is_bundle = coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY)

        if is_bundle:
            meta = val.get("systems.witchcraft.archive", {})
            title = val.get("name", meta.get("title", "(untitled)"))[:60]
            url = meta.get("url", "")[:60]
            captured = meta.get("capturedAt", "")[:19]
            cid = meta.get("rootIpfsCid", "")
            res_count = meta.get("resourceCount", len(val.get("resources", {})))
            total_size = meta.get("totalSize", 0)
            tag = f"[BUNDLE {res_count} files, {total_size:,}b]"
        else:
            title = val.get("title", "(untitled)")[:60]
            url = val.get("url", "")[:60]
            captured = val.get("capturedAt", "")[:19]
            cid = val.get("ipfsCid", val.get("cid", ""))
            tag = "[single]"

        print(f" [{rkey}] {captured} {tag}")
        print(f" {title}")
        print(f" {url}")
        if cid:
            print(f" ipfs://{cid}")
        print()
604
605
def cmd_verify(rkey):
    """Verify a capture's hash against current page content.

    Looks the rkey up in each collection, re-fetches the archived URL, and
    compares CIDs (or content hashes when no CID was stored).
    """
    session = get_session()

    # Try each collection until the rkey resolves
    rec = None
    for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]:
        try:
            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord",
                                headers={"Authorization": f"Bearer {session['accessJwt']}"},
                                params={"repo": session["did"], "collection": coll, "rkey": rkey})
            resp.raise_for_status()
            rec = resp.json()
            rec["_collection"] = coll
            break
        except Exception:
            # Not in this collection (or transient error) — try the next;
            # narrowed from a bare except so Ctrl-C still works
            continue

    if not rec:
        print(f"capture {rkey} not found in any collection")
        return

    val = rec.get("value", {})
    is_bundle = rec["_collection"] in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY)

    # Bundles keep their metadata under the namespaced key
    if is_bundle:
        meta = val.get("systems.witchcraft.archive", {})
        url = meta.get("finalUrl", meta.get("url", ""))
        stored_cid = meta.get("rootIpfsCid", "")
        stored_hash = meta.get("contentHash", "")
        captured_at = meta.get("capturedAt", "")
    else:
        url = val.get("finalUrl", val.get("url", ""))
        stored_cid = val.get("ipfsCid", val.get("cid", ""))
        stored_hash = val.get("contentHash", "")
        captured_at = val.get("capturedAt", "")

    print(f"verifying capture {rkey} ({'bundle' if is_bundle else 'single'})...")
    print(f" url: {url}")
    print(f" captured: {captured_at}")
    if stored_cid:
        print(f" stored cid: {stored_cid}")
    print(f" stored hash: {stored_hash}")

    print(f"\nre-fetching {url}...")
    html_bytes, _, _, _, _ = fetch_page(url)
    current_hash = f"sha256:{hash_content(html_bytes)}"
    current_cid = compute_cid(html_bytes)

    if stored_cid:
        print(f" current cid: {current_cid}")
    print(f" current hash: {current_hash}")

    # Prefer CID comparison; fall back to the hash for CID-less records
    match = (stored_cid == current_cid) if stored_cid else (stored_hash == current_hash)
    if match:
        print("\n MATCH - page content unchanged since capture")
    else:
        print("\n MISMATCH - page has changed since capture!")
        print(" (this is expected for dynamic pages)")
665
666
def cmd_search(query):
    """Search captures by URL or title (case-insensitive substring match)."""
    session = get_session()
    query_lower = query.lower()
    matches = []

    for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]:
        try:
            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords",
                                headers={"Authorization": f"Bearer {session['accessJwt']}"},
                                params={"repo": session["did"], "collection": coll, "limit": 100})
            resp.raise_for_status()
            for rec in resp.json().get("records", []):
                val = rec.get("value", {})
                if coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY):
                    meta = val.get("systems.witchcraft.archive", {})
                    url = meta.get("url", "").lower()
                    title = val.get("name", meta.get("title", "")).lower()
                else:
                    url = val.get("url", "").lower()
                    title = val.get("title", "").lower()
                if query_lower in url or query_lower in title:
                    rec["_collection"] = coll
                    matches.append(rec)
        except Exception:
            # Best-effort per collection; narrowed from a bare except so
            # KeyboardInterrupt/SystemExit still propagate
            pass

    if not matches:
        print(f"no captures matching '{query}'")
        return

    print(f"found {len(matches)} capture(s):")
    for rec in matches:
        val = rec.get("value", {})
        rkey = rec["uri"].split("/")[-1]
        coll = rec["_collection"]
        # Consistent with cmd_list/cmd_verify: include the legacy alias
        is_bundle = coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY)

        if is_bundle:
            meta = val.get("systems.witchcraft.archive", {})
            title = val.get("name", meta.get("title", ""))[:60]
            url = meta.get("url", "")
            res_count = meta.get("resourceCount", 0)
            print(f" [{rkey}] [BUNDLE {res_count} files] {title}")
        else:
            title = val.get("title", "")[:60]
            url = val.get("url", "")
            print(f" [{rkey}] {title}")
        print(f" {url}")
        print()
717
718
719# --- main ---
720
def main():
    """Parse CLI arguments and dispatch to the matching command."""
    parser = argparse.ArgumentParser(
        description="Archive web pages to ATProto with IPFS pinning and MASL bundles")
    parser.add_argument("url", nargs="?", help="URL to archive")
    parser.add_argument("--bundle", action="store_true",
                        help="Archive URL + subresources as MASL bundle")
    parser.add_argument("--list", action="store_true", help="List recent captures")
    parser.add_argument("--limit", type=int, default=20, help="Number of captures to list")
    parser.add_argument("--verify", metavar="RKEY",
                        help="Verify a capture's hash against current page")
    parser.add_argument("--search", metavar="QUERY",
                        help="Search captures by URL or title")
    parser.add_argument("--no-ipfs", action="store_true",
                        help="Skip IPFS pinning (compute CID locally only)")
    parser.add_argument("--no-blob", action="store_true",
                        help="Skip PDS blob upload (single mode only)")
    parser.add_argument("--max-resources", type=int, default=MAX_SUBRESOURCES,
                        help=f"Max subresources to fetch for bundles (default: {MAX_SUBRESOURCES})")

    args = parser.parse_args()

    # Guard-clause dispatch: first matching mode wins, same precedence as before
    if args.list:
        cmd_list(args.limit)
        return
    if args.verify:
        cmd_verify(args.verify)
        return
    if args.search:
        cmd_search(args.search)
        return
    if not args.url:
        parser.print_help()
        return
    if args.bundle:
        cmd_bundle(args.url, no_ipfs=args.no_ipfs, max_resources=args.max_resources)
    else:
        cmd_archive(args.url, no_ipfs=args.no_ipfs, no_blob=args.no_blob)
755
756
757if __name__ == "__main__":
758 main()