web_archive.py at main · kira.pds.witchcraft.systems/web-archive

kira.pds.witchcraft.systems / web-archive
web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.
fork
web-archive / web_archive.py
at main 758 lines 28 kB view raw
wrap content
Kira consolidate to ing.dasl.masl collection (web tiles format) 2mo ago
210d569b
  1#!/usr/bin/env python3
  2"""
  3web_archive.py - Capture and archive web pages to ATProto with IPFS pinning.
  4
  5Creates signed, timestamped records of web page captures on your PDS.
  6Each capture includes an IPFS CID (content-addressed identifier) and a PDS blob.
  7
  8Supports two modes:
  9  - Single capture: archive one URL as a `systems.witchcraft.archive.capture` record
 10  - Bundle capture: archive a page + all its subresources (CSS, JS, images) as a
 11    `systems.witchcraft.archive.bundle` record containing a MASL-shaped manifest
 12    with CID-addressed resources (MASL spec: https://dasl.ing/masl.html)
 13
 14Usage:
 15    python web_archive.py <url>                    # Archive a URL (single capture)
 16    python web_archive.py <url> --bundle           # Archive URL + subresources as MASL bundle
 17    python web_archive.py --list [--limit N]       # List recent captures
 18    python web_archive.py --verify <rkey>          # Re-fetch and verify CID
 19    python web_archive.py --search <query>         # Search captures by URL/title
 20"""
 21
 22import argparse
 23import hashlib
 24import json
 25import mimetypes
 26import os
 27import re
 28import sys
 29from datetime import datetime, timezone
 30from urllib.parse import urljoin, urlparse
 31
 32import requests
 33from multiformats import CID, multihash
 34
 35
 36# --- config ---
 37
 38PDS = os.environ.get("ATP_PDS_URL", "https://bsky.social")
 39ATP_HANDLE = os.environ.get("ATP_HANDLE", "")
 40ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "")
 41COLLECTION_CAPTURE = "systems.witchcraft.archive.capture"
 42COLLECTION_BUNDLE = "ing.dasl.masl"
 43COLLECTION_MASL_LEGACY = COLLECTION_BUNDLE  # consolidated
 44
 45# Max subresources to fetch per bundle (safety limit)
 46MAX_SUBRESOURCES = 100
 47# Max size per individual resource (10MB)
 48MAX_RESOURCE_SIZE = 10 * 1024 * 1024
 49# Allowed subresource schemes
 50ALLOWED_SCHEMES = {"http", "https"}
 51
 52
 53# --- auth ---
 54
 55def get_session():
 56    handle = ATP_HANDLE
 57    password = ATP_PASSWORD
 58
 59    if not handle or not password:
 60        print("error: set ATP_HANDLE and ATP_PASSWORD environment variables")
 61        print("  export ATP_PDS_URL=https://your.pds.example.com")
 62        print("  export ATP_HANDLE=your.handle")
 63        print("  export ATP_PASSWORD=your-app-password")
 64        sys.exit(1)
 65
 66    resp = requests.post(f"{PDS}/xrpc/com.atproto.server.createSession",
 67        json={"identifier": handle, "password": password})
 68    resp.raise_for_status()
 69    return resp.json()
 70
 71
 72# --- fetch ---
 73
 74def fetch_page(url):
 75    """Fetch a URL and return (html_bytes, title, final_url, word_count, status_code)."""
 76    headers = {
 77        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
 78    }
 79    resp = requests.get(url, headers=headers, timeout=30, allow_redirects=True)
 80    resp.raise_for_status()
 81    html_bytes = resp.content
 82    html = html_bytes.decode("utf-8", errors="replace")
 83    final_url = resp.url
 84
 85    # Extract title
 86    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
 87    title = title_match.group(1).strip() if title_match else ""
 88    title = title.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
 89    title = title.replace("&#39;", "'").replace("&quot;", '"')
 90    title = title[:256]
 91
 92    # Word count from text extraction
 93    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
 94    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
 95    text = re.sub(r"<[^>]+>", " ", text)
 96    text = re.sub(r"\s+", " ", text).strip()
 97    word_count = len(text.split())
 98
 99    return html_bytes, title, final_url, word_count, resp.status_code
100
101
102def fetch_resource(url):
103    """Fetch a subresource. Returns (bytes, content_type, final_url) or None on failure."""
104    headers = {
105        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
106    }
107    try:
108        resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True,
109                           stream=True)
110        resp.raise_for_status()
111
112        # Check size before downloading full body
113        content_length = resp.headers.get("content-length")
114        if content_length and int(content_length) > MAX_RESOURCE_SIZE:
115            print(f"    skip (too large: {int(content_length):,} bytes): {url[:80]}")
116            return None
117
118        content = resp.content
119        if len(content) > MAX_RESOURCE_SIZE:
120            print(f"    skip (too large: {len(content):,} bytes): {url[:80]}")
121            return None
122
123        content_type = resp.headers.get("content-type", "application/octet-stream")
124        # Strip charset/params from content-type for MASL
125        content_type = content_type.split(";")[0].strip()
126        return content, content_type, resp.url
127    except Exception as e:
128        print(f"    skip (error: {e}): {url[:80]}")
129        return None
130
131
132def extract_subresource_urls(html, base_url):
133    """Extract URLs of subresources (CSS, JS, images, fonts) from HTML."""
134    urls = {}  # url -> expected type hint
135
136    # CSS: <link rel="stylesheet" href="...">
137    for m in re.finditer(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE):
138        urls[m.group(1)] = "text/css"
139    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\']stylesheet["\']', html, re.IGNORECASE):
140        urls[m.group(1)] = "text/css"
141
142    # JS: <script src="...">
143    for m in re.finditer(r'<script[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
144        urls[m.group(1)] = "application/javascript"
145
146    # Images: <img src="...">, <source srcset="...">
147    for m in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
148        urls[m.group(1)] = "image/*"
149    for m in re.finditer(r'<source[^>]+srcset=["\']([^"\']+)["\']', html, re.IGNORECASE):
150        # srcset can have multiple URLs with widths
151        for part in m.group(1).split(","):
152            src = part.strip().split()[0]
153            if src:
154                urls[src] = "image/*"
155
156    # Favicons and other <link> with href
157    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]*>', html, re.IGNORECASE):
158        href = m.group(1)
159        if href not in urls:
160            # Check if it's an icon or other resource
161            tag = m.group(0).lower()
162            if 'rel="icon"' in tag or "rel='icon'" in tag or 'rel="apple-touch-icon"' in tag:
163                urls[href] = "image/*"
164
165    # CSS @import and url() in inline <style>
166    for style_match in re.finditer(r'<style[^>]*>(.*?)</style>', html, re.DOTALL | re.IGNORECASE):
167        style_content = style_match.group(1)
168        for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', style_content):
169            url_val = m.group(1)
170            if not url_val.startswith("data:"):
171                urls[url_val] = "application/octet-stream"
172        for m in re.finditer(r'@import\s+["\']([^"\']+)["\']', style_content):
173            urls[m.group(1)] = "text/css"
174
175    # Resolve relative URLs and filter
176    resolved = {}
177    base_parsed = urlparse(base_url)
178    for url, type_hint in urls.items():
179        if url.startswith("data:") or url.startswith("#") or url.startswith("javascript:"):
180            continue
181        absolute = urljoin(base_url, url)
182        parsed = urlparse(absolute)
183        # Only same-origin resources (MASL paths must start with /)
184        if parsed.scheme in ALLOWED_SCHEMES and parsed.netloc == base_parsed.netloc:
185            path = parsed.path or "/"
186            if path not in resolved:
187                resolved[path] = (absolute, type_hint)
188
189    return resolved
190
191
192def compute_cid(content_bytes):
193    """Compute IPFS CIDv1 (raw codec, sha2-256) for content bytes."""
194    digest = hashlib.sha256(content_bytes).digest()
195    mh = multihash.wrap(digest, "sha2-256")
196    cid = CID("base32", 1, "raw", mh)
197    return str(cid)
198
199
200def hash_content(content_bytes):
201    """SHA-256 hash of content bytes."""
202    return hashlib.sha256(content_bytes).hexdigest()
203
204
205def pin_to_ipfs(content_bytes, filename="capture.html", content_type="text/html"):
206    """Pin content to local IPFS node. Returns (cid, pinned)."""
207    try:
208        resp = requests.post("http://127.0.0.1:5001/api/v0/add",
209            files={"file": (filename, content_bytes, content_type)},
210            params={"pin": "true", "cid-version": "1", "raw-leaves": "false"},
211            timeout=30)
212        resp.raise_for_status()
213        data = resp.json()
214        return data["Hash"], True
215    except Exception as e:
216        return compute_cid(content_bytes), False
217
218
219def upload_blob(session, content_bytes, content_type="text/html"):
220    """Upload content as a blob to the PDS. Returns blob ref."""
221    try:
222        resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.uploadBlob",
223            headers={
224                "Authorization": f"Bearer {session['accessJwt']}",
225                "Content-Type": content_type,
226            },
227            data=content_bytes,
228            timeout=60)
229        resp.raise_for_status()
230        return resp.json().get("blob")
231    except Exception as e:
232        print(f"    warning: blob upload failed ({e})")
233        return None
234
235
236def announce_ipfs_content(cid):
237    """Announce content to IPFS DHT so it's discoverable on public gateways."""
238    try:
239        resp = requests.post("http://127.0.0.1:5001/api/v0/routing/provide",
240            params={"arg": cid},
241            timeout=60)
242        return resp.status_code == 200
243    except:
244        return False
245
246
247# --- atproto records ---
248
249def create_capture(session, url, final_url, title, cid, content_hash, word_count,
250                   html_size, pinned=False, blob_ref=None):
251    """Create a single-page archive capture record."""
252    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
253
254    record = {
255        "$type": COLLECTION_CAPTURE,
256        "url": url,
257        "capturedAt": now,
258        "ipfsCid": cid,
259        "contentHash": f"sha256:{content_hash}",
260        "title": title,
261        "wordCount": word_count,
262        "htmlSize": html_size,
263        "pinned": pinned,
264    }
265
266    if final_url != url:
267        record["finalUrl"] = final_url
268    if blob_ref:
269        record["htmlBlob"] = blob_ref
270
271    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
272        headers={"Authorization": f"Bearer {session['accessJwt']}"},
273        json={
274            "repo": session["did"],
275            "collection": COLLECTION_CAPTURE,
276            "record": record,
277        })
278    resp.raise_for_status()
279    return resp.json()
280
281
282def create_bundle_record(session, name, url, resources_map, blobs_map,
283                         captured_at, archive_meta=None):
284    """Create an ing.dasl.masl record on ATProto (Web Tiles format).
285
286    Uses MASL bundle format with archive metadata namespaced under
287    systems.witchcraft.archive.
288
289    resources_map: dict of path -> {src: blob_ref, content-type: str}
290    blobs_map:     dict of path -> blob_ref (kept for compat, merged into resources)
291    archive_meta:  dict with url, capturedAt, title, etc.
292    """
293    # Merge blob refs into resources (MASL format: resources[path].src = blob ref)
294    merged_resources = {}
295    for path, res_data in resources_map.items():
296        entry = dict(res_data) if isinstance(res_data, dict) else {"src": res_data}
297        if path in blobs_map:
298            entry["src"] = blobs_map[path]
299        merged_resources[path] = entry
300
301    record = {
302        "$type": COLLECTION_BUNDLE,
303        # MASL required fields at top level
304        "name": name,
305        "resources": merged_resources,
306    }
307
308    # Archive metadata namespaced
309    archive_ns = {
310        "url": archive_meta.get("url", url) if archive_meta else url,
311        "title": archive_meta.get("title", name) if archive_meta else name,
312        "capturedAt": captured_at,
313    }
314    if archive_meta:
315        for key in ["wordCount", "totalSize", "resourceCount",
316                     "rootIpfsCid", "contentHash", "pinned", "finalUrl"]:
317            if key in archive_meta:
318                archive_ns[key] = archive_meta[key]
319    record["systems.witchcraft.archive"] = archive_ns
320
321    resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord",
322        headers={"Authorization": f"Bearer {session['accessJwt']}"},
323        json={
324            "repo": session["did"],
325            "collection": COLLECTION_BUNDLE,
326            "record": record,
327        })
328    resp.raise_for_status()
329    return resp.json()
330
331
332# --- commands ---
333
334def cmd_archive(url, no_ipfs=False, no_blob=False):
335    """Archive a single URL."""
336    print(f"fetching {url}...")
337    html_bytes, title, final_url, word_count, status = fetch_page(url)
338    content_hash = hash_content(html_bytes)
339    html_size = len(html_bytes)
340
341    if not no_ipfs:
342        print(f"  pinning to IPFS...")
343        cid, pinned = pin_to_ipfs(html_bytes)
344        if pinned:
345            announce_ipfs_content(cid)
346    else:
347        cid = compute_cid(html_bytes)
348        pinned = False
349
350    print(f"  title: {title[:80]}")
351    print(f"  final url: {final_url}")
352    print(f"  cid: {cid}")
353    print(f"  pinned: {'yes' if pinned else 'no (local CID only)'}")
354    print(f"  hash: sha256:{content_hash[:16]}...")
355    print(f"  size: {html_size:,} bytes, ~{word_count:,} words")
356
357    session = get_session()
358
359    blob_ref = None
360    if not no_blob:
361        print(f"  uploading blob to PDS...")
362        blob_ref = upload_blob(session, html_bytes)
363        if blob_ref:
364            print(f"  blob uploaded!")
365
366    result = create_capture(session, url, final_url, title, cid, content_hash,
367                           word_count, html_size, pinned=pinned, blob_ref=blob_ref)
368    uri = result.get("uri", "")
369    rkey = uri.split("/")[-1] if uri else "?"
370
371    print(f"\ncaptured! {uri}")
372    print(f"  view: https://pdsls.dev/at/{session['did']}/{COLLECTION_CAPTURE}/{rkey}")
373    return uri
374
375
376def cmd_bundle(url, no_ipfs=False, max_resources=MAX_SUBRESOURCES):
377    """Archive a URL + all its subresources as a MASL bundle."""
378    print(f"fetching {url}...")
379    html_bytes, title, final_url, word_count, status = fetch_page(url)
380    html = html_bytes.decode("utf-8", errors="replace")
381    html_size = len(html_bytes)
382
383    print(f"  title: {title[:80]}")
384    print(f"  size: {html_size:,} bytes, ~{word_count:,} words")
385
386    # Extract subresource URLs
387    print(f"\nscanning for subresources...")
388    subresource_urls = extract_subresource_urls(html, final_url)
389    print(f"  found {len(subresource_urls)} same-origin subresources")
390
391    if len(subresource_urls) > max_resources:
392        print(f"  capping at {max_resources} (use --max-resources to change)")
393        # Keep only the first N
394        subresource_urls = dict(list(subresource_urls.items())[:max_resources])
395
396    # Fetch all subresources
397    print(f"\nfetching subresources...")
398    resources = {}  # path -> (bytes, content_type, ipfs_cid)
399
400    for path, (abs_url, type_hint) in subresource_urls.items():
401        result = fetch_resource(abs_url)
402        if result:
403            res_bytes, content_type, _ = result
404            print(f"    ok ({len(res_bytes):,}b {content_type}): {path}")
405            resources[path] = (res_bytes, content_type)
406
407    # Phase 2: Scan fetched CSS files for url() references (fonts, images, etc)
408    css_extra_urls = {}
409    for path, (res_bytes, content_type) in list(resources.items()):
410        if "css" in content_type:
411            css_text = res_bytes.decode("utf-8", errors="replace")
412            css_base = urljoin(final_url, path)
413            for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', css_text):
414                url_val = m.group(1)
415                if url_val.startswith("data:"):
416                    continue
417                abs_url = urljoin(css_base, url_val)
418                parsed = urlparse(abs_url)
419                base_parsed = urlparse(final_url)
420                if parsed.scheme in ALLOWED_SCHEMES:
421                    css_path = parsed.path or "/"
422                    if css_path not in resources and css_path not in css_extra_urls:
423                        css_extra_urls[css_path] = (abs_url, "application/octet-stream")
424
425    if css_extra_urls:
426        print(f"\n  found {len(css_extra_urls)} extra resources from CSS url() refs")
427        for path, (abs_url, _) in css_extra_urls.items():
428            if len(resources) >= max_resources:
429                break
430            result = fetch_resource(abs_url)
431            if result:
432                res_bytes, content_type, _ = result
433                print(f"    ok ({len(res_bytes):,}b {content_type}): {path}")
434                resources[path] = (res_bytes, content_type)
435
436    print(f"\n  fetched {len(resources)} total subresources")
437
438    # Get session for blob uploads
439    session = get_session()
440
441    # Process root page
442    print(f"\nprocessing root page...")
443    if not no_ipfs:
444        root_cid, root_pinned = pin_to_ipfs(html_bytes)
445        if root_pinned:
446            announce_ipfs_content(root_cid)
447    else:
448        root_cid = compute_cid(html_bytes)
449        root_pinned = False
450
451    root_blob = upload_blob(session, html_bytes, "text/html")
452    if not root_blob:
453        print("  ERROR: failed to upload root page blob, aborting")
454        sys.exit(1)
455
456    print(f"  root cid: {root_cid} (pinned: {root_pinned})")
457
458    # Build MASL resources map (CID strings) and blobs map (ATProto blob refs)
459    # MASL spec: src should be a CID link, not a blob ref
460    resources_map = {
461        "/": {
462            "src": root_cid,
463            "content-type": "text/html",
464        }
465    }
466    blobs_map = {
467        "/": root_blob,
468    }
469
470    # Process subresources
471    total_pinned = 1 if root_pinned else 0
472    total_blobs = 1
473    total_bytes = html_size
474    failed_blobs = 0
475
476    for path, (res_bytes, content_type) in resources.items():
477        total_bytes += len(res_bytes)
478
479        # Pin to IPFS
480        if not no_ipfs:
481            res_cid, res_pinned = pin_to_ipfs(res_bytes,
482                filename=path.split("/")[-1] or "resource",
483                content_type=content_type)
484            if res_pinned:
485                total_pinned += 1
486                announce_ipfs_content(res_cid)
487        else:
488            res_cid = compute_cid(res_bytes)
489
490        # Upload blob to PDS
491        blob_ref = upload_blob(session, res_bytes, content_type)
492        if blob_ref:
493            total_blobs += 1
494            resources_map[path] = {
495                "src": res_cid,
496                "content-type": content_type,
497            }
498            blobs_map[path] = blob_ref
499        else:
500            failed_blobs += 1
501            print(f"    blob failed: {path}")
502
503    print(f"\n  resources: {len(resources_map)} ({total_blobs} blobs, {total_pinned} pinned)")
504    print(f"  total size: {total_bytes:,} bytes")
505    if failed_blobs:
506        print(f"  failed blobs: {failed_blobs}")
507
508    # Build archive metadata (namespaced per MASL spec)
509    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
510    archive_meta = {
511        "url": url,
512        "capturedAt": now,
513        "title": title,
514        "wordCount": word_count,
515        "totalSize": total_bytes,
516        "resourceCount": len(resources_map),
517        "rootIpfsCid": root_cid,
518        "contentHash": f"sha256:{hash_content(html_bytes)}",
519        "pinned": total_pinned > 0,
520    }
521    if final_url != url:
522        archive_meta["finalUrl"] = final_url
523
524    # Create bundle record
525    print(f"\ncreating archive bundle record...")
526    bundle_name = title or urlparse(url).netloc
527    result = create_bundle_record(session, bundle_name, url,
528                                  resources_map, blobs_map,
529                                  now, archive_meta=archive_meta)
530    uri = result.get("uri", "")
531    rkey = uri.split("/")[-1] if uri else "?"
532
533    print(f"\nbundled! {uri}")
534    print(f"  view: https://pdsls.dev/at/{session['did']}/{COLLECTION_BUNDLE}/{rkey}")
535    print(f"  resources: {len(resources_map)} files, {total_bytes:,} bytes total")
536    print(f"  root IPFS: ipfs://{root_cid}")
537    return uri
538
539
540def cmd_list(limit, collection=None):
541    """List recent captures from either or both collections."""
542    session = get_session()
543    all_records = []
544
545    collections = ([COLLECTION_CAPTURE, COLLECTION_BUNDLE]
546                   if not collection else [collection])
547
548    for coll in collections:
549        try:
550            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords",
551                headers={"Authorization": f"Bearer {session['accessJwt']}"},
552                params={
553                    "repo": session["did"],
554                    "collection": coll,
555                    "limit": limit,
556                    "reverse": True,
557                })
558            resp.raise_for_status()
559            for rec in resp.json().get("records", []):
560                rec["_collection"] = coll
561                all_records.append(rec)
562        except:
563            pass
564
565    if not all_records:
566        print("no captures yet")
567        return
568
569    # Sort by captured time (newest first)
570    def sort_key(r):
571        val = r.get("value", {})
572        meta = val.get("systems.witchcraft.archive", {})
573        return meta.get("capturedAt", val.get("capturedAt", ""))
574    all_records.sort(key=sort_key, reverse=True)
575
576    for rec in all_records[:limit]:
577        val = rec.get("value", {})
578        rkey = rec["uri"].split("/")[-1]
579        coll = rec["_collection"]
580        is_bundle = coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY)
581
582        if is_bundle:
583            meta = val.get("systems.witchcraft.archive", {})
584            title = val.get("name", meta.get("title", "(untitled)"))[:60]
585            url = meta.get("url", "")[:60]
586            captured = meta.get("capturedAt", "")[:19]
587            cid = meta.get("rootIpfsCid", "")
588            res_count = meta.get("resourceCount", len(val.get("resources", {})))
589            total_size = meta.get("totalSize", 0)
590            tag = f"[BUNDLE {res_count} files, {total_size:,}b]"
591        else:
592            title = val.get("title", "(untitled)")[:60]
593            url = val.get("url", "")[:60]
594            captured = val.get("capturedAt", "")[:19]
595            cid = val.get("ipfsCid", val.get("cid", ""))
596            tag = "[single]"
597
598        print(f"  [{rkey}] {captured} {tag}")
599        print(f"    {title}")
600        print(f"    {url}")
601        if cid:
602            print(f"    ipfs://{cid}")
603        print()
604
605
606def cmd_verify(rkey):
607    """Verify a capture's hash against current page content."""
608    session = get_session()
609
610    # Try all collections
611    rec = None
612    for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]:
613        try:
614            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord",
615                headers={"Authorization": f"Bearer {session['accessJwt']}"},
616                params={"repo": session["did"], "collection": coll, "rkey": rkey})
617            resp.raise_for_status()
618            rec = resp.json()
619            rec["_collection"] = coll
620            break
621        except:
622            continue
623
624    if not rec:
625        print(f"capture {rkey} not found in any collection")
626        return
627
628    val = rec.get("value", {})
629    is_bundle = rec["_collection"] in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY)
630
631    if is_bundle:
632        meta = val.get("systems.witchcraft.archive", {})
633        url = meta.get("finalUrl", meta.get("url", ""))
634        stored_cid = meta.get("rootIpfsCid", "")
635        stored_hash = meta.get("contentHash", "")
636        captured_at = meta.get("capturedAt", "")
637    else:
638        url = val.get("finalUrl", val.get("url", ""))
639        stored_cid = val.get("ipfsCid", val.get("cid", ""))
640        stored_hash = val.get("contentHash", "")
641        captured_at = val.get("capturedAt", "")
642
643    print(f"verifying capture {rkey} ({'bundle' if is_bundle else 'single'})...")
644    print(f"  url: {url}")
645    print(f"  captured: {captured_at}")
646    if stored_cid:
647        print(f"  stored cid: {stored_cid}")
648    print(f"  stored hash: {stored_hash}")
649
650    print(f"\nre-fetching {url}...")
651    html_bytes, _, _, _, _ = fetch_page(url)
652    current_hash = f"sha256:{hash_content(html_bytes)}"
653    current_cid = compute_cid(html_bytes)
654
655    if stored_cid:
656        print(f"  current cid: {current_cid}")
657    print(f"  current hash: {current_hash}")
658
659    match = (stored_cid == current_cid) if stored_cid else (stored_hash == current_hash)
660    if match:
661        print("\n  MATCH - page content unchanged since capture")
662    else:
663        print("\n  MISMATCH - page has changed since capture!")
664        print("  (this is expected for dynamic pages)")
665
666
667def cmd_search(query):
668    """Search captures by URL or title."""
669    session = get_session()
670    query_lower = query.lower()
671    matches = []
672
673    for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]:
674        try:
675            resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords",
676                headers={"Authorization": f"Bearer {session['accessJwt']}"},
677                params={"repo": session["did"], "collection": coll, "limit": 100})
678            resp.raise_for_status()
679            for rec in resp.json().get("records", []):
680                val = rec.get("value", {})
681                if coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY):
682                    meta = val.get("systems.witchcraft.archive", {})
683                    url = meta.get("url", "").lower()
684                    title = val.get("name", meta.get("title", "")).lower()
685                else:
686                    url = val.get("url", "").lower()
687                    title = val.get("title", "").lower()
688                if query_lower in url or query_lower in title:
689                    rec["_collection"] = coll
690                    matches.append(rec)
691        except:
692            pass
693
694    if not matches:
695        print(f"no captures matching '{query}'")
696        return
697
698    print(f"found {len(matches)} capture(s):")
699    for rec in matches:
700        val = rec.get("value", {})
701        rkey = rec["uri"].split("/")[-1]
702        coll = rec["_collection"]
703        is_bundle = coll == COLLECTION_BUNDLE
704
705        if is_bundle:
706            meta = val.get("systems.witchcraft.archive", {})
707            title = val.get("name", meta.get("title", ""))[:60]
708            url = meta.get("url", "")
709            res_count = meta.get("resourceCount", 0)
710            print(f"  [{rkey}] [BUNDLE {res_count} files] {title}")
711        else:
712            title = val.get("title", "")[:60]
713            url = val.get("url", "")
714            print(f"  [{rkey}] {title}")
715        print(f"    {url}")
716        print()
717
718
719# --- main ---
720
721def main():
722    parser = argparse.ArgumentParser(
723        description="Archive web pages to ATProto with IPFS pinning and MASL bundles")
724    parser.add_argument("url", nargs="?", help="URL to archive")
725    parser.add_argument("--bundle", action="store_true",
726        help="Archive URL + subresources as MASL bundle")
727    parser.add_argument("--list", action="store_true", help="List recent captures")
728    parser.add_argument("--limit", type=int, default=20, help="Number of captures to list")
729    parser.add_argument("--verify", metavar="RKEY",
730        help="Verify a capture's hash against current page")
731    parser.add_argument("--search", metavar="QUERY",
732        help="Search captures by URL or title")
733    parser.add_argument("--no-ipfs", action="store_true",
734        help="Skip IPFS pinning (compute CID locally only)")
735    parser.add_argument("--no-blob", action="store_true",
736        help="Skip PDS blob upload (single mode only)")
737    parser.add_argument("--max-resources", type=int, default=MAX_SUBRESOURCES,
738        help=f"Max subresources to fetch for bundles (default: {MAX_SUBRESOURCES})")
739
740    args = parser.parse_args()
741
742    if args.list:
743        cmd_list(args.limit)
744    elif args.verify:
745        cmd_verify(args.verify)
746    elif args.search:
747        cmd_search(args.search)
748    elif args.url:
749        if args.bundle:
750            cmd_bundle(args.url, no_ipfs=args.no_ipfs, max_resources=args.max_resources)
751        else:
752            cmd_archive(args.url, no_ipfs=args.no_ipfs, no_blob=args.no_blob)
753    else:
754        parser.print_help()
755
756
757if __name__ == "__main__":
758    main()
Configure Feed

Configure Feed