#!/usr/bin/env python3
"""
web_archive.py - Capture and archive web pages to ATProto with IPFS pinning.

Creates signed, timestamped records of web page captures on your PDS. Each
capture includes an IPFS CID (content-addressed identifier) and a PDS blob.

Supports two modes:
  - Single capture: archive one URL as a `systems.witchcraft.archive.capture` record
  - Bundle capture: archive a page + all its subresources (CSS, JS, images) as a
    `systems.witchcraft.archive.bundle` record containing a MASL-shaped manifest
    with CID-addressed resources (MASL spec: https://dasl.ing/masl.html)

Usage:
    python web_archive.py <url>               # Archive a URL (single capture)
    python web_archive.py --bundle <url>      # Archive URL + subresources as MASL bundle
    python web_archive.py --list [--limit N]  # List recent captures
    python web_archive.py --verify <url>      # Re-fetch and verify CID
    python web_archive.py --search <query>    # Search captures by URL/title
"""

import argparse
import hashlib
import json
import mimetypes
import os
import re
import sys
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

import requests
from multiformats import CID, multihash

# --- config ---

PDS = os.environ.get("ATP_PDS_URL", "https://bsky.social")
ATP_HANDLE = os.environ.get("ATP_HANDLE", "")
ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "")

COLLECTION_CAPTURE = "systems.witchcraft.archive.capture"
COLLECTION_BUNDLE = "ing.dasl.masl"
COLLECTION_MASL_LEGACY = COLLECTION_BUNDLE  # consolidated

# Max subresources to fetch per bundle (safety limit)
MAX_SUBRESOURCES = 100
# Max size per individual resource (10MB)
MAX_RESOURCE_SIZE = 10 * 1024 * 1024
# Allowed subresource schemes
ALLOWED_SCHEMES = {"http", "https"}


# --- auth ---

def get_session():
    handle = ATP_HANDLE
    password = ATP_PASSWORD
    if not handle or not password:
        print("error: set ATP_HANDLE and ATP_PASSWORD environment variables")
        print(" export ATP_PDS_URL=https://your.pds.example.com")
        print(" export ATP_HANDLE=your.handle")
        print(" export ATP_PASSWORD=your-app-password")
        sys.exit(1)
    resp = requests.post(f"{PDS}/xrpc/com.atproto.server.createSession",
                         json={"identifier": handle, "password": password})
    resp.raise_for_status()
    return resp.json()


# --- fetch ---

def fetch_page(url):
    """Fetch a URL and return (html_bytes, title, final_url, word_count, status_code)."""
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    resp = requests.get(url, headers=headers, timeout=30, allow_redirects=True)
    resp.raise_for_status()

    html_bytes = resp.content
    html = html_bytes.decode("utf-8", errors="replace")
    final_url = resp.url

    # Extract title
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    title = title_match.group(1).strip() if title_match else ""
    title = title.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    title = title.replace("&#39;", "'").replace("&quot;", '"')
    title = title[:256]

    # Word count from text extraction
    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    word_count = len(text.split())

    return html_bytes, title, final_url, word_count, resp.status_code
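
# --- cid ---

# A minimal sketch of deriving a CIDv1 (raw codec, sha2-256) for fetched bytes
# using the `multiformats` package imported above. The helper name `compute_cid`
# is an illustrative assumption, not part of the original script; the record-
# writing code later may construct its CIDs differently.
def compute_cid(data: bytes) -> str:
    digest = multihash.digest(data, "sha2-256")   # multihash-prefixed sha2-256 digest
    return str(CID("base32", 1, "raw", digest))   # base32 CIDv1, e.g. "bafkrei..."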
def fetch_resource(url):
    """Fetch a subresource.

    Returns (bytes, content_type, final_url) or None on failure."""
    headers = {
        "User-Agent": "WebArchive/1.0 (ATProto web archiver)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=20,
                            allow_redirects=True, stream=True)
        resp.raise_for_status()

        # Check size before downloading full body
        content_length = resp.headers.get("content-length")
        if content_length and int(content_length) > MAX_RESOURCE_SIZE:
            print(f" skip (too large: {int(content_length):,} bytes): {url[:80]}")
            return None

        content = resp.content
        if len(content) > MAX_RESOURCE_SIZE:
            print(f" skip (too large: {len(content):,} bytes): {url[:80]}")
            return None

        content_type = resp.headers.get("content-type", "application/octet-stream")
        # Strip charset/params from content-type for MASL
        content_type = content_type.split(";")[0].strip()
        return content, content_type, resp.url
    except Exception as e:
        print(f" skip (error: {e}): {url[:80]}")
        return None


def extract_subresource_urls(html, base_url):
    """Extract URLs of subresources (CSS, JS, images, fonts) from HTML."""
    urls = {}  # url -> expected type hint

    # CSS: <link rel="stylesheet" href="..."> (either attribute order)
    for m in re.finditer(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']',
                         html, re.IGNORECASE):
        urls[m.group(1)] = "text/css"
    for m in re.finditer(r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\']stylesheet["\']',
                         html, re.IGNORECASE):
        urls[m.group(1)] = "text/css"

    # JS: