# web-archive

web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.

## what it does

- **single mode**: archives a single HTML page with its CID stored as a `systems.witchcraft.archive.capture` ATProto record
- **bundle mode**: archives a page + all subresources (CSS, JS, images, fonts) as a MASL bundle (`ing.dasl.masl` record) — each resource gets its own content-addressed blob on your PDS
- **CSS url() scanning**: follows `@import` and `url()` references in stylesheets to capture fonts, background images, etc.
- **IPFS pinning**: optionally pins the HTML to IPFS via a local kubo node
- **PDS blob storage**: uploads all resources as PDS blobs with proper content-type headers

## usage

```bash
# single page archive
python web_archive.py https://example.com

# bundle mode (page + all subresources)
python web_archive.py https://example.com --bundle

# bundle with resource limit
python web_archive.py https://example.com --bundle --max-resources 50

# skip IPFS pinning
python web_archive.py https://example.com --no-ipfs

# list recent archives (default 20, use --limit to change)
python web_archive.py --list

# search archives by URL or title
python web_archive.py --search "example"

# verify archive integrity
python web_archive.py --verify <rkey>
```

## auth

set these environment variables:

```bash
export ATP_PDS_URL=https://your.pds.example.com
export ATP_HANDLE=your.handle
export ATP_PASSWORD=your-app-password
```

## dependencies

```bash
pip install requests multiformats
```

optional: `ipfs` CLI (kubo) for IPFS pinning
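
pinning talks to kubo's HTTP API at its default address (`http://127.0.0.1:5001`, hardcoded in `pin_to_ipfs`), so a local daemon must be running when you archive:

```bash
ipfs daemon
```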

## record types

- `systems.witchcraft.archive.capture` — single page captures
- `ing.dasl.masl` — MASL bundle mode records (see [dasl.ing/masl.html](https://dasl.ing/masl.html))
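
for reference, here is a sketch of the record shapes this tool writes (fields as built by `create_capture` and `create_masl_bundle` in `web_archive.py`; all values below are placeholders):

```json
{
  "$type": "systems.witchcraft.archive.capture",
  "url": "https://example.com",
  "capturedAt": "2025-01-01T00:00:00Z",
  "ipfsCid": "bafkrei...",
  "contentHash": "sha256:...",
  "title": "...",
  "wordCount": 0,
  "htmlSize": 0,
  "pinned": true,
  "htmlBlob": { "$type": "blob", "ref": { "$link": "..." }, "mimeType": "text/html", "size": 0 }
}
```

bundle records key every resource by root-relative path and carry archival metadata in a namespaced object:

```json
{
  "$type": "ing.dasl.masl",
  "name": "...",
  "resources": {
    "/": { "src": { "$type": "blob", "ref": { "$link": "..." }, "mimeType": "text/html", "size": 0 }, "content-type": "text/html" },
    "/style.css": { "src": { "$type": "blob", "ref": { "$link": "..." }, "mimeType": "text/css", "size": 0 }, "content-type": "text/css" }
  },
  "systems.witchcraft.archive": { "url": "...", "capturedAt": "...", "rootIpfsCid": "..." }
}
```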

## viewer

archived pages can be viewed with the [archive viewer](https://sites.wisp.place/kira.pds.witchcraft.systems/archive-viewer/) hosted on wisp.place.

## license

MIT
web_archive.py
#!/usr/bin/env python3
"""
web_archive.py - Capture and archive web pages to ATProto with IPFS pinning.

Creates signed, timestamped records of web page captures on your PDS.
Each capture includes an IPFS CID (content-addressed identifier) and a PDS blob.

Supports two modes:
  - Single capture: archive one URL as a `systems.witchcraft.archive.capture` record
  - Bundle capture: archive a page + all its subresources (CSS, JS, images) as an
    `ing.dasl.masl` Bundle Mode record (MASL spec: https://dasl.ing/masl.html)

Usage:
  python web_archive.py <url>               # Archive a URL (single capture)
  python web_archive.py <url> --bundle      # Archive URL + subresources as MASL bundle
  python web_archive.py --list [--limit N]  # List recent captures
  python web_archive.py --verify <rkey>     # Re-fetch and verify CID
  python web_archive.py --search <query>    # Search captures by URL/title
"""

import argparse
import hashlib
import os
import re
import sys
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

import requests
from multiformats import CID, multihash


# --- config ---

PDS = os.environ.get("ATP_PDS_URL", "https://bsky.social")
ATP_HANDLE = os.environ.get("ATP_HANDLE", "")
ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "")
COLLECTION_CAPTURE = "systems.witchcraft.archive.capture"
COLLECTION_MASL = "ing.dasl.masl"

# Max subresources to fetch per bundle (safety limit)
MAX_SUBRESOURCES = 100
# Max size per individual resource (10MB)
MAX_RESOURCE_SIZE = 10 * 1024 * 1024
# Allowed subresource schemes
ALLOWED_SCHEMES = {"http", "https"}


# --- auth ---

def get_session():
    handle = ATP_HANDLE
    password = ATP_PASSWORD

    if not handle or not password:
        print("error: set ATP_HANDLE and ATP_PASSWORD environment variables")
        print("  export ATP_PDS_URL=https://your.pds.example.com")
        print("  export ATP_HANDLE=your.handle")
        print("  export ATP_PASSWORD=your-app-password")
        sys.exit(1)

    resp = requests.post(f"{PDS}/xrpc/com.atproto.server.createSession",
                         json={"identifier": handle, "password": password})
    resp.raise_for_status()
    return resp.json()
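
# Note: createSession returns a JSON session object including "did", "handle",
# "accessJwt", and "refreshJwt"; only "did" and "accessJwt" are used below.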


# --- fetch ---

def fetch_page(url):
    """Fetch a URL and return (html_bytes, title, final_url, word_count, status_code)."""
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    resp = requests.get(url, headers=headers, timeout=30, allow_redirects=True)
    resp.raise_for_status()
    html_bytes = resp.content
    html = html_bytes.decode("utf-8", errors="replace")
    final_url = resp.url

    # Extract title (decode the common HTML entities)
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    title = title_match.group(1).strip() if title_match else ""
    title = title.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    title = title.replace("&#39;", "'").replace("&quot;", '"')
    title = title[:256]

    # Word count from text extraction
    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    word_count = len(text.split())

    return html_bytes, title, final_url, word_count, resp.status_code
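
# fetch_page's tuple is positional: (html_bytes, title, final_url, word_count,
# status_code). final_url reflects any redirects, which matters for resolving
# relative subresource URLs in bundle mode.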


def fetch_resource(url):
    """Fetch a subresource. Returns (bytes, content_type, final_url) or None on failure."""
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True,
                            stream=True)
        resp.raise_for_status()

        # Check size before downloading full body
        content_length = resp.headers.get("content-length")
        if content_length and int(content_length) > MAX_RESOURCE_SIZE:
            print(f"  skip (too large: {int(content_length):,} bytes): {url[:80]}")
            return None

        content = resp.content
        if len(content) > MAX_RESOURCE_SIZE:
            print(f"  skip (too large: {len(content):,} bytes): {url[:80]}")
            return None

        content_type = resp.headers.get("content-type", "application/octet-stream")
        # Strip charset/params from content-type for MASL
        content_type = content_type.split(";")[0].strip()
        return content, content_type, resp.url
    except Exception as e:
        print(f"  skip (error: {e}): {url[:80]}")
        return None


def extract_subresource_urls(html, base_url):
    """Extract URLs of subresources (CSS, JS, images, fonts) from HTML."""
    urls = {}  # url -> expected type hint

    # CSS: <link rel="stylesheet" href="...">
    for m in re.finditer(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "text/css"
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\']stylesheet["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "text/css"

    # JS: <script src="...">
    for m in re.finditer(r'<script[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "application/javascript"

    # Images: <img src="...">, <source srcset="...">
    for m in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        urls[m.group(1)] = "image/*"
    for m in re.finditer(r'<source[^>]+srcset=["\']([^"\']+)["\']', html, re.IGNORECASE):
        # srcset can have multiple URLs with widths; guard against empty
        # entries (e.g. a trailing comma)
        for part in m.group(1).split(","):
            tokens = part.strip().split()
            if tokens:
                urls[tokens[0]] = "image/*"

    # Favicons and other <link> with href
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]*>', html, re.IGNORECASE):
        href = m.group(1)
        if href not in urls:
            # Check if it's an icon or other resource
            tag = m.group(0).lower()
            if 'rel="icon"' in tag or "rel='icon'" in tag or 'rel="apple-touch-icon"' in tag:
                urls[href] = "image/*"

    # CSS @import and url() in inline <style>
    for style_match in re.finditer(r'<style[^>]*>(.*?)</style>', html, re.DOTALL | re.IGNORECASE):
        style_content = style_match.group(1)
        for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', style_content):
            url_val = m.group(1)
            if not url_val.startswith("data:"):
                urls[url_val] = "application/octet-stream"
        for m in re.finditer(r'@import\s+["\']([^"\']+)["\']', style_content):
            urls[m.group(1)] = "text/css"

    # Resolve relative URLs and filter
    resolved = {}
    base_parsed = urlparse(base_url)
    for url, type_hint in urls.items():
        if url.startswith("data:") or url.startswith("#") or url.startswith("javascript:"):
            continue
        absolute = urljoin(base_url, url)
        parsed = urlparse(absolute)
        # Only same-origin resources (MASL paths must start with /)
        if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc:
            path = parsed.path or "/"
            if path not in resolved:
                resolved[path] = (absolute, type_hint)

    return resolved
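
# e.g. extract_subresource_urls('<link rel="stylesheet" href="/a.css">', "https://x.com/page")
# returns {"/a.css": ("https://x.com/a.css", "text/css")}: keys are the root-relative
# paths used as MASL resource keys, values are (absolute URL, content-type hint).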


def compute_cid(content_bytes):
    """Compute IPFS CIDv1 (raw codec, sha2-256) for content bytes."""
    digest = hashlib.sha256(content_bytes).digest()
    mh = multihash.wrap(digest, "sha2-256")
    cid = CID("base32", 1, "raw", mh)
    return str(cid)
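
# Note: this is a raw-codec CIDv1 (base32 "bafkrei..." form). kubo's add with
# raw-leaves disabled wraps content in dag-pb, so the CID it returns for the
# same bytes will generally differ; verification therefore compares hashes.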


def hash_content(content_bytes):
    """SHA-256 hash of content bytes."""
    return hashlib.sha256(content_bytes).hexdigest()


def pin_to_ipfs(content_bytes, filename="capture.html", content_type="text/html"):
    """Pin content to local IPFS node. Returns (cid, pinned)."""
    try:
        resp = requests.post("http://127.0.0.1:5001/api/v0/add",
                             files={"file": (filename, content_bytes, content_type)},
                             params={"pin": "true", "cid-version": "1", "raw-leaves": "false"},
                             timeout=30)
        resp.raise_for_status()
        data = resp.json()
        return data["Hash"], True
    except Exception:
        # Local node unreachable (or add failed): fall back to a locally
        # computed CID and report the content as not pinned.
        return compute_cid(content_bytes), False


def upload_blob(session, content_bytes, content_type="text/html"):
    """Upload content as a blob to the PDS. Returns blob ref."""
    try:
        resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.uploadBlob",
                             headers={
                                 "Authorization": f"Bearer {session['accessJwt']}",
                                 "Content-Type": content_type,
                             },
                             data=content_bytes,
                             timeout=60)
        resp.raise_for_status()
        return resp.json().get("blob")
    except Exception as e:
        print(f"  warning: blob upload failed ({e})")
        return None
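
# uploadBlob returns {"blob": {...}}; the inner object is the blob ref embedded
# into records, shaped like {"$type": "blob", "ref": {"$link": "<cid>"},
# "mimeType": "...", "size": N}. A blob must be referenced by a record or the
# PDS may eventually garbage-collect it.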


def announce_ipfs_content(cid):
    """Announce content to IPFS DHT so it's discoverable on public gateways."""
    try:
        resp = requests.post("http://127.0.0.1:5001/api/v0/routing/provide",
                             params={"arg": cid},
                             timeout=60)
        return resp.status_code == 200
    except Exception:
        return False


# --- atproto records ---

def create_capture(session, url, final_url, title, cid, content_hash, word_count,
                   html_size, pinned=False, blob_ref=None):
    """Create a single-page archive capture record."""
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    record = {
        "$type": COLLECTION_CAPTURE,
        "url": url,
        "capturedAt": now,
        "ipfsCid": cid,
        "contentHash": f"sha256:{content_hash}",
        "title": title,
        "wordCount": word_count,
        "htmlSize": html_size,
        "pinned": pinned,
    }

    if final_url != url:
        record["finalUrl"] = final_url
    if blob_ref:
        record["htmlBlob"] = blob_ref

    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
                         headers={"Authorization": f"Bearer {session['accessJwt']}"},
                         json={
                             "repo": session["did"],
                             "collection": COLLECTION_CAPTURE,
                             "record": record,
                         })
    resp.raise_for_status()
    return resp.json()


def create_masl_bundle(session, name, url, resources_map, captured_at,
                       archive_meta=None):
    """Create a MASL Bundle Mode record on ATProto.

    resources_map: dict of path -> {src: blob_ref, content-type: str, ...}
    archive_meta: optional dict with archival metadata (url, capturedAt, etc)
    """
    record = {
        "$type": COLLECTION_MASL,
        "name": name,
        "resources": resources_map,
    }

    # Add our archival metadata in a namespaced object (per MASL spec recommendation)
    if archive_meta:
        record["systems.witchcraft.archive"] = archive_meta

    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
                         headers={"Authorization": f"Bearer {session['accessJwt']}"},
                         json={
                             "repo": session["did"],
                             "collection": COLLECTION_MASL,
                             "record": record,
                         })
    resp.raise_for_status()
    return resp.json()


# --- commands ---

def cmd_archive(url, no_ipfs=False, no_blob=False):
    """Archive a single URL."""
    print(f"fetching {url}...")
    html_bytes, title, final_url, word_count, status = fetch_page(url)
    content_hash = hash_content(html_bytes)
    html_size = len(html_bytes)

    if not no_ipfs:
        print("  pinning to IPFS...")
        cid, pinned = pin_to_ipfs(html_bytes)
        if pinned:
            announce_ipfs_content(cid)
    else:
        cid = compute_cid(html_bytes)
        pinned = False

    print(f"  title: {title[:80]}")
    print(f"  final url: {final_url}")
    print(f"  cid: {cid}")
    print(f"  pinned: {'yes' if pinned else 'no (local CID only)'}")
    print(f"  hash: sha256:{content_hash[:16]}...")
    print(f"  size: {html_size:,} bytes, ~{word_count:,} words")

    session = get_session()

    blob_ref = None
    if not no_blob:
        print("  uploading blob to PDS...")
        blob_ref = upload_blob(session, html_bytes)
        if blob_ref:
            print("  blob uploaded!")

    result = create_capture(session, url, final_url, title, cid, content_hash,
                            word_count, html_size, pinned=pinned, blob_ref=blob_ref)
    uri = result.get("uri", "")
    rkey = uri.split("/")[-1] if uri else "?"

    print(f"\ncaptured! {uri}")
    print(f"  view: https://pdsls.dev/at/{session['did']}/{COLLECTION_CAPTURE}/{rkey}")
    return uri


def cmd_bundle(url, no_ipfs=False, max_resources=MAX_SUBRESOURCES):
    """Archive a URL + all its subresources as a MASL bundle."""
    print(f"fetching {url}...")
    html_bytes, title, final_url, word_count, status = fetch_page(url)
    html = html_bytes.decode("utf-8", errors="replace")
    html_size = len(html_bytes)

    print(f"  title: {title[:80]}")
    print(f"  size: {html_size:,} bytes, ~{word_count:,} words")

    # Extract subresource URLs
    print("\nscanning for subresources...")
    subresource_urls = extract_subresource_urls(html, final_url)
    print(f"  found {len(subresource_urls)} same-origin subresources")

    if len(subresource_urls) > max_resources:
        print(f"  capping at {max_resources} (use --max-resources to change)")
        # Keep only the first N
        subresource_urls = dict(list(subresource_urls.items())[:max_resources])

    # Fetch all subresources
    print("\nfetching subresources...")
    resources = {}  # path -> (bytes, content_type)

    for path, (abs_url, type_hint) in subresource_urls.items():
        result = fetch_resource(abs_url)
        if result:
            res_bytes, content_type, _ = result
            print(f"  ok ({len(res_bytes):,}b {content_type}): {path}")
            resources[path] = (res_bytes, content_type)

    # Phase 2: Scan fetched CSS files for url() references (fonts, images, etc)
    css_extra_urls = {}
    base_parsed = urlparse(final_url)
    for path, (res_bytes, content_type) in list(resources.items()):
        if "css" in content_type:
            css_text = res_bytes.decode("utf-8", errors="replace")
            css_base = urljoin(final_url, path)
            for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', css_text):
                url_val = m.group(1)
                if url_val.startswith("data:"):
                    continue
                abs_url = urljoin(css_base, url_val)
                parsed = urlparse(abs_url)
                # Same-origin only, consistent with extract_subresource_urls
                # (MASL resource keys are paths rooted at the captured origin)
                if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc:
                    css_path = parsed.path or "/"
                    if css_path not in resources and css_path not in css_extra_urls:
                        css_extra_urls[css_path] = (abs_url, "application/octet-stream")

    if css_extra_urls:
        print(f"\n  found {len(css_extra_urls)} extra resources from CSS url() refs")
        for path, (abs_url, _) in css_extra_urls.items():
            if len(resources) >= max_resources:
                break
            result = fetch_resource(abs_url)
            if result:
                res_bytes, content_type, _ = result
                print(f"  ok ({len(res_bytes):,}b {content_type}): {path}")
                resources[path] = (res_bytes, content_type)

    print(f"\n  fetched {len(resources)} total subresources")

    # Get session for blob uploads
    session = get_session()

    # Process root page
    print("\nprocessing root page...")
    if not no_ipfs:
        root_cid, root_pinned = pin_to_ipfs(html_bytes)
        if root_pinned:
            announce_ipfs_content(root_cid)
    else:
        root_cid = compute_cid(html_bytes)
        root_pinned = False

    root_blob = upload_blob(session, html_bytes, "text/html")
    if not root_blob:
        print("  ERROR: failed to upload root page blob, aborting")
        sys.exit(1)

    print(f"  root cid: {root_cid} (pinned: {root_pinned})")

    # Build MASL resources map
    # Per MASL spec: keys are paths starting with /, values have src (blob ref) + content-type
    resources_map = {
        "/": {
            "src": root_blob,
            "content-type": "text/html",
        }
    }

    # Process subresources
    total_pinned = 1 if root_pinned else 0
    total_blobs = 1
    total_bytes = html_size
    failed_blobs = 0

    for path, (res_bytes, content_type) in resources.items():
        total_bytes += len(res_bytes)

        # Pin to IPFS
        if not no_ipfs:
            res_cid, res_pinned = pin_to_ipfs(res_bytes,
                                              filename=path.split("/")[-1] or "resource",
                                              content_type=content_type)
            if res_pinned:
                total_pinned += 1
                announce_ipfs_content(res_cid)
        else:
            res_cid = compute_cid(res_bytes)

        # Upload blob to PDS
        blob_ref = upload_blob(session, res_bytes, content_type)
        if blob_ref:
            total_blobs += 1
            entry = {
                "src": blob_ref,
                "content-type": content_type,
            }
            resources_map[path] = entry
        else:
            failed_blobs += 1
            print(f"  blob failed: {path}")

    print(f"\n  resources: {len(resources_map)} ({total_blobs} blobs, {total_pinned} pinned)")
    print(f"  total size: {total_bytes:,} bytes")
    if failed_blobs:
        print(f"  failed blobs: {failed_blobs}")

    # Build archive metadata (namespaced per MASL spec)
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    archive_meta = {
        "url": url,
        "capturedAt": now,
        "title": title,
        "wordCount": word_count,
        "totalSize": total_bytes,
        "resourceCount": len(resources_map),
        "rootIpfsCid": root_cid,
        "contentHash": f"sha256:{hash_content(html_bytes)}",
        "pinned": total_pinned > 0,
    }
    if final_url != url:
        archive_meta["finalUrl"] = final_url

    # Create MASL bundle record
    print("\ncreating MASL bundle record...")
    bundle_name = title or urlparse(url).netloc
    result = create_masl_bundle(session, bundle_name, url, resources_map,
                                now, archive_meta=archive_meta)
    uri = result.get("uri", "")
    rkey = uri.split("/")[-1] if uri else "?"

    print(f"\nbundled! {uri}")
    print(f"  view: https://pdsls.dev/at/{session['did']}/{COLLECTION_MASL}/{rkey}")
    print(f"  resources: {len(resources_map)} files, {total_bytes:,} bytes total")
    print(f"  root IPFS: ipfs://{root_cid}")
    return uri


def cmd_list(limit, collection=None):
    """List recent captures from either or both collections."""
    session = get_session()
    all_records = []

    collections = [COLLECTION_CAPTURE, COLLECTION_MASL] if not collection else [collection]

    for coll in collections:
        try:
            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords",
                                headers={"Authorization": f"Bearer {session['accessJwt']}"},
                                params={
                                    "repo": session["did"],
                                    "collection": coll,
                                    "limit": limit,
                                    "reverse": True,
                                })
            resp.raise_for_status()
            for rec in resp.json().get("records", []):
                rec["_collection"] = coll
                all_records.append(rec)
        except Exception:
            pass

    if not all_records:
        print("no captures yet")
        return

    # Sort by captured time (newest first)
    def sort_key(r):
        val = r.get("value", {})
        # MASL bundles store capturedAt in namespaced metadata
        meta = val.get("systems.witchcraft.archive", {})
        return meta.get("capturedAt", val.get("capturedAt", ""))
    all_records.sort(key=sort_key, reverse=True)

    for rec in all_records[:limit]:
        val = rec.get("value", {})
        rkey = rec["uri"].split("/")[-1]
        coll = rec["_collection"]
        is_bundle = coll == COLLECTION_MASL

        if is_bundle:
            meta = val.get("systems.witchcraft.archive", {})
            title = val.get("name", meta.get("title", "(untitled)"))[:60]
            url = meta.get("url", "")[:60]
            captured = meta.get("capturedAt", "")[:19]
            cid = meta.get("rootIpfsCid", "")
            res_count = meta.get("resourceCount", len(val.get("resources", {})))
            total_size = meta.get("totalSize", 0)
            tag = f"[BUNDLE {res_count} files, {total_size:,}b]"
        else:
            title = val.get("title", "(untitled)")[:60]
            url = val.get("url", "")[:60]
            captured = val.get("capturedAt", "")[:19]
            cid = val.get("ipfsCid", val.get("cid", ""))
            tag = "[single]"

        print(f"  [{rkey}] {captured} {tag}")
        print(f"    {title}")
        print(f"    {url}")
        if cid:
            print(f"    ipfs://{cid}")
        print()


def cmd_verify(rkey):
    """Verify a capture's hash against current page content."""
    session = get_session()

    # Try capture collection first, then MASL
    rec = None
    for coll in [COLLECTION_CAPTURE, COLLECTION_MASL]:
        try:
            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord",
                                headers={"Authorization": f"Bearer {session['accessJwt']}"},
                                params={"repo": session["did"], "collection": coll, "rkey": rkey})
            resp.raise_for_status()
            rec = resp.json()
            rec["_collection"] = coll
            break
        except Exception:
            continue

    if not rec:
        print(f"capture {rkey} not found in any collection")
        return

    val = rec.get("value", {})
    is_bundle = rec["_collection"] == COLLECTION_MASL

    if is_bundle:
        meta = val.get("systems.witchcraft.archive", {})
        url = meta.get("finalUrl", meta.get("url", ""))
        stored_cid = meta.get("rootIpfsCid", "")
        stored_hash = meta.get("contentHash", "")
        captured_at = meta.get("capturedAt", "")
    else:
        url = val.get("finalUrl", val.get("url", ""))
        stored_cid = val.get("ipfsCid", val.get("cid", ""))
        stored_hash = val.get("contentHash", "")
        captured_at = val.get("capturedAt", "")

    print(f"verifying capture {rkey} ({'bundle' if is_bundle else 'single'})...")
    print(f"  url: {url}")
    print(f"  captured: {captured_at}")
    if stored_cid:
        print(f"  stored cid: {stored_cid}")
    print(f"  stored hash: {stored_hash}")

    print(f"\nre-fetching {url}...")
    html_bytes, _, _, _, _ = fetch_page(url)
    current_hash = f"sha256:{hash_content(html_bytes)}"
    current_cid = compute_cid(html_bytes)

    if stored_cid:
        print(f"  current cid: {current_cid}")
    print(f"  current hash: {current_hash}")

    # Prefer the hash comparison: a stored CID that came from kubo is dag-pb
    # wrapped, while compute_cid() is raw-codec, so CIDs for identical bytes
    # can legitimately differ.
    match = (stored_hash == current_hash) if stored_hash else (stored_cid == current_cid)
    if match:
        print("\n  MATCH - page content unchanged since capture")
    else:
        print("\n  MISMATCH - page has changed since capture!")
        print("  (this is expected for dynamic pages)")


def cmd_search(query):
    """Search captures by URL or title."""
    session = get_session()
    query_lower = query.lower()
    matches = []

    for coll in [COLLECTION_CAPTURE, COLLECTION_MASL]:
        try:
            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords",
                                headers={"Authorization": f"Bearer {session['accessJwt']}"},
                                params={"repo": session["did"], "collection": coll, "limit": 100})
            resp.raise_for_status()
            for rec in resp.json().get("records", []):
                val = rec.get("value", {})
                if coll == COLLECTION_MASL:
                    meta = val.get("systems.witchcraft.archive", {})
                    url = meta.get("url", "").lower()
                    title = val.get("name", meta.get("title", "")).lower()
                else:
                    url = val.get("url", "").lower()
                    title = val.get("title", "").lower()
                if query_lower in url or query_lower in title:
                    rec["_collection"] = coll
                    matches.append(rec)
        except Exception:
            pass

    if not matches:
        print(f"no captures matching '{query}'")
        return

    print(f"found {len(matches)} capture(s):")
    for rec in matches:
        val = rec.get("value", {})
        rkey = rec["uri"].split("/")[-1]
        coll = rec["_collection"]
        is_bundle = coll == COLLECTION_MASL

        if is_bundle:
            meta = val.get("systems.witchcraft.archive", {})
            title = val.get("name", meta.get("title", ""))[:60]
            url = meta.get("url", "")
            res_count = meta.get("resourceCount", 0)
            print(f"  [{rkey}] [BUNDLE {res_count} files] {title}")
        else:
            title = val.get("title", "")[:60]
            url = val.get("url", "")
            print(f"  [{rkey}] {title}")
        print(f"    {url}")
        print()


# --- main ---

def main():
    parser = argparse.ArgumentParser(
        description="Archive web pages to ATProto with IPFS pinning and MASL bundles")
    parser.add_argument("url", nargs="?", help="URL to archive")
    parser.add_argument("--bundle", action="store_true",
                        help="Archive URL + subresources as MASL bundle")
    parser.add_argument("--list", action="store_true", help="List recent captures")
    parser.add_argument("--limit", type=int, default=20, help="Number of captures to list")
    parser.add_argument("--verify", metavar="RKEY",
                        help="Verify a capture's hash against current page")
    parser.add_argument("--search", metavar="QUERY",
                        help="Search captures by URL or title")
    parser.add_argument("--no-ipfs", action="store_true",
                        help="Skip IPFS pinning (compute CID locally only)")
    parser.add_argument("--no-blob", action="store_true",
                        help="Skip PDS blob upload (single mode only)")
    parser.add_argument("--max-resources", type=int, default=MAX_SUBRESOURCES,
                        help=f"Max subresources to fetch for bundles (default: {MAX_SUBRESOURCES})")

    args = parser.parse_args()

    if args.list:
        cmd_list(args.limit)
    elif args.verify:
        cmd_verify(args.verify)
    elif args.search:
        cmd_search(args.search)
    elif args.url:
        if args.bundle:
            cmd_bundle(args.url, no_ipfs=args.no_ipfs, max_resources=args.max_resources)
        else:
            cmd_archive(args.url, no_ipfs=args.no_ipfs, no_blob=args.no_blob)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()