web archiver with MASL bundle mode for ATProto. captures web pages as content-addressed bundles stored on your PDS with optional IPFS pinning.
6
fork

Configure Feed

Select the types of activity you want to include in your feed.

refactor: proper record type + MASL spec conformance

- new collection: systems.witchcraft.archive.bundle (was ing.dasl.masl)
- MASL data in .masl field with CID strings in src (spec-conformant)
- ATProto blob refs in separate .blobs map for content retrieval
- archive metadata (url, title, capturedAt) at record top level
- backwards-compatible: list/search/verify still handle legacy `ing.dasl.masl` records
- thanks to nel.pet for the spec feedback!

Kira cd8b632d 47d64f07

+101 -48
+5 -1
README.md
··· 56 56 ## record types 57 57 58 58 - `systems.witchcraft.archive.capture` — single page captures 59 - - `ing.dasl.masl` — MASL bundle mode records (see [dasl.ing/masl.html](https://dasl.ing/masl.html)) 59 + - `systems.witchcraft.archive.bundle` — bundle archives with MASL-shaped manifest 60 + - `masl.resources`: path → {src: CID, content-type} (spec-conformant CID strings) 61 + - `blobs`: path → ATProto blob ref (for content retrieval from PDS) 62 + - archive metadata (url, title, capturedAt, etc) at top level 63 + - see [MASL spec](https://dasl.ing/masl.html) for the manifest format 60 64 61 65 ## viewer 62 66
+96 -47
web_archive.py
··· 7 7 8 8 Supports two modes: 9 9 - Single capture: archive one URL as a `systems.witchcraft.archive.capture` record 10 - - Bundle capture: archive a page + all its subresources (CSS, JS, images) as an 11 - `ing.dasl.masl` Bundle Mode record (MASL spec: https://dasl.ing/masl.html) 10 + - Bundle capture: archive a page + all its subresources (CSS, JS, images) as a 11 + `systems.witchcraft.archive.bundle` record containing a MASL-shaped manifest 12 + with CID-addressed resources (MASL spec: https://dasl.ing/masl.html) 12 13 13 14 Usage: 14 15 python web_archive.py <url> # Archive a URL (single capture) ··· 38 39 ATP_HANDLE = os.environ.get("ATP_HANDLE", "") 39 40 ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "") 40 41 COLLECTION_CAPTURE = "systems.witchcraft.archive.capture" 41 - COLLECTION_MASL = "ing.dasl.masl" 42 + COLLECTION_BUNDLE = "systems.witchcraft.archive.bundle" 43 + # Legacy collection for backwards compat with old records 44 + COLLECTION_MASL_LEGACY = "ing.dasl.masl" 42 45 43 46 # Max subresources to fetch per bundle (safety limit) 44 47 MAX_SUBRESOURCES = 100 ··· 277 280 return resp.json() 278 281 279 282 280 - def create_masl_bundle(session, name, url, resources_map, captured_at, 281 - archive_meta=None): 282 - """Create a MASL Bundle Mode record on ATProto. 283 - 284 - resources_map: dict of path -> {src: blob_ref, content-type: str, ...} 285 - archive_meta: optional dict with archival metadata (url, capturedAt, etc) 283 + def create_bundle_record(session, name, url, resources_map, blobs_map, 284 + captured_at, archive_meta=None): 285 + """Create a systems.witchcraft.archive.bundle record on ATProto. 286 + 287 + The record wraps a MASL-shaped bundle (with CID strings in src fields) 288 + alongside ATProto blob refs for actual content retrieval. 
289 + 290 + resources_map: dict of path -> {src: "cid_string", content-type: str} 291 + (MASL-conformant: src is a CID, not a blob ref) 292 + blobs_map: dict of path -> blob_ref (ATProto blob refs for fetching) 293 + archive_meta: dict with url, capturedAt, title, etc. 286 294 """ 287 295 record = { 288 - "$type": COLLECTION_MASL, 289 - "name": name, 290 - "resources": resources_map, 296 + "$type": COLLECTION_BUNDLE, 297 + # Archive metadata at top level 298 + "url": archive_meta.get("url", url) if archive_meta else url, 299 + "capturedAt": captured_at, 300 + "title": archive_meta.get("title", name) if archive_meta else name, 301 + # MASL-shaped bundle data 302 + "masl": { 303 + "name": name, 304 + "resources": resources_map, 305 + }, 306 + # ATProto blob refs keyed by path (for content retrieval from PDS) 307 + "blobs": blobs_map, 291 308 } 292 309 293 - # Add our archival metadata in a namespaced object (per MASL spec recommendation) 310 + # Add extra archive metadata 294 311 if archive_meta: 295 - record["systems.witchcraft.archive"] = archive_meta 312 + for key in ["wordCount", "totalSize", "resourceCount", 313 + "rootIpfsCid", "contentHash", "pinned", "finalUrl"]: 314 + if key in archive_meta: 315 + record[key] = archive_meta[key] 296 316 297 317 resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord", 298 318 headers={"Authorization": f"Bearer {session['accessJwt']}"}, 299 319 json={ 300 320 "repo": session["did"], 301 - "collection": COLLECTION_MASL, 321 + "collection": COLLECTION_BUNDLE, 302 322 "record": record, 303 323 }) 304 324 resp.raise_for_status() ··· 431 451 432 452 print(f" root cid: {root_cid} (pinned: {root_pinned})") 433 453 434 - # Build MASL resources map 435 - # Per MASL spec: keys are paths starting with /, values have src (blob ref) + content-type 454 + # Build MASL resources map (CID strings) and blobs map (ATProto blob refs) 455 + # MASL spec: src should be a CID link, not a blob ref 436 456 resources_map = { 437 457 "/": { 438 
- "src": root_blob, 458 + "src": root_cid, 439 459 "content-type": "text/html", 440 460 } 461 + } 462 + blobs_map = { 463 + "/": root_blob, 441 464 } 442 465 443 466 # Process subresources ··· 464 487 blob_ref = upload_blob(session, res_bytes, content_type) 465 488 if blob_ref: 466 489 total_blobs += 1 467 - entry = { 468 - "src": blob_ref, 490 + resources_map[path] = { 491 + "src": res_cid, 469 492 "content-type": content_type, 470 493 } 471 - resources_map[path] = entry 494 + blobs_map[path] = blob_ref 472 495 else: 473 496 failed_blobs += 1 474 497 print(f" blob failed: {path}") ··· 494 517 if final_url != url: 495 518 archive_meta["finalUrl"] = final_url 496 519 497 - # Create MASL bundle record 498 - print(f"\ncreating MASL bundle record...") 520 + # Create bundle record 521 + print(f"\ncreating archive bundle record...") 499 522 bundle_name = title or urlparse(url).netloc 500 - result = create_masl_bundle(session, bundle_name, url, resources_map, 501 - now, archive_meta=archive_meta) 523 + result = create_bundle_record(session, bundle_name, url, 524 + resources_map, blobs_map, 525 + now, archive_meta=archive_meta) 502 526 uri = result.get("uri", "") 503 527 rkey = uri.split("/")[-1] if uri else "?" 504 528 505 529 print(f"\nbundled! 
{uri}") 506 - print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_MASL}/{rkey}") 530 + print(f" view: https://pdsls.dev/at/{session['did']}/{COLLECTION_BUNDLE}/{rkey}") 507 531 print(f" resources: {len(resources_map)} files, {total_bytes:,} bytes total") 508 532 print(f" root IPFS: ipfs://{root_cid}") 509 533 return uri ··· 514 538 session = get_session() 515 539 all_records = [] 516 540 517 - collections = [COLLECTION_CAPTURE, COLLECTION_MASL] if not collection else [collection] 541 + collections = ([COLLECTION_CAPTURE, COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY] 542 + if not collection else [collection]) 518 543 519 544 for coll in collections: 520 545 try: ··· 540 565 # Sort by captured time (newest first) 541 566 def sort_key(r): 542 567 val = r.get("value", {}) 543 - # MASL bundles store capturedAt in namespaced metadata 568 + # New bundles: capturedAt at top level 569 + # Legacy MASL: capturedAt in systems.witchcraft.archive namespace 544 570 meta = val.get("systems.witchcraft.archive", {}) 545 - return meta.get("capturedAt", val.get("capturedAt", "")) 571 + return val.get("capturedAt", meta.get("capturedAt", "")) 546 572 all_records.sort(key=sort_key, reverse=True) 547 573 548 574 for rec in all_records[:limit]: 549 575 val = rec.get("value", {}) 550 576 rkey = rec["uri"].split("/")[-1] 551 577 coll = rec["_collection"] 552 - is_bundle = coll == COLLECTION_MASL 578 + is_bundle = coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY) 553 579 554 580 if is_bundle: 555 - meta = val.get("systems.witchcraft.archive", {}) 556 - title = val.get("name", meta.get("title", "(untitled)"))[:60] 557 - url = meta.get("url", "")[:60] 558 - captured = meta.get("capturedAt", "")[:19] 559 - cid = meta.get("rootIpfsCid", "") 560 - res_count = meta.get("resourceCount", len(val.get("resources", {}))) 561 - total_size = meta.get("totalSize", 0) 581 + if coll == COLLECTION_BUNDLE: 582 + # New format: metadata at top level, MASL in .masl field 583 + title = 
val.get("title", "(untitled)")[:60] 584 + url = val.get("url", "")[:60] 585 + captured = val.get("capturedAt", "")[:19] 586 + cid = val.get("rootIpfsCid", "") 587 + res_count = val.get("resourceCount", 588 + len(val.get("masl", {}).get("resources", {}))) 589 + total_size = val.get("totalSize", 0) 590 + else: 591 + # Legacy MASL format 592 + meta = val.get("systems.witchcraft.archive", {}) 593 + title = val.get("name", meta.get("title", "(untitled)"))[:60] 594 + url = meta.get("url", "")[:60] 595 + captured = meta.get("capturedAt", "")[:19] 596 + cid = meta.get("rootIpfsCid", "") 597 + res_count = meta.get("resourceCount", 598 + len(val.get("resources", {}))) 599 + total_size = meta.get("totalSize", 0) 562 600 tag = f"[BUNDLE {res_count} files, {total_size:,}b]" 563 601 else: 564 602 title = val.get("title", "(untitled)")[:60] ··· 579 617 """Verify a capture's hash against current page content.""" 580 618 session = get_session() 581 619 582 - # Try capture collection first, then MASL 620 + # Try all collections 583 621 rec = None 584 - for coll in [COLLECTION_CAPTURE, COLLECTION_MASL]: 622 + for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY]: 585 623 try: 586 624 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord", 587 625 headers={"Authorization": f"Bearer {session['accessJwt']}"}, ··· 598 636 return 599 637 600 638 val = rec.get("value", {}) 601 - is_bundle = rec["_collection"] == COLLECTION_MASL 639 + is_bundle = rec["_collection"] in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY) 602 640 603 641 if is_bundle: 604 - meta = val.get("systems.witchcraft.archive", {}) 605 - url = meta.get("finalUrl", meta.get("url", "")) 606 - stored_cid = meta.get("rootIpfsCid", "") 607 - stored_hash = meta.get("contentHash", "") 608 - captured_at = meta.get("capturedAt", "") 642 + if rec["_collection"] == COLLECTION_BUNDLE: 643 + # New format 644 + url = val.get("finalUrl", val.get("url", "")) 645 + stored_cid = val.get("rootIpfsCid", "") 646 + 
stored_hash = val.get("contentHash", "") 647 + captured_at = val.get("capturedAt", "") 648 + else: 649 + # Legacy MASL format 650 + meta = val.get("systems.witchcraft.archive", {}) 651 + url = meta.get("finalUrl", meta.get("url", "")) 652 + stored_cid = meta.get("rootIpfsCid", "") 653 + stored_hash = meta.get("contentHash", "") 654 + captured_at = meta.get("capturedAt", "") 609 655 else: 610 656 url = val.get("finalUrl", val.get("url", "")) 611 657 stored_cid = val.get("ipfsCid", val.get("cid", "")) ··· 642 688 query_lower = query.lower() 643 689 matches = [] 644 690 645 - for coll in [COLLECTION_CAPTURE, COLLECTION_MASL]: 691 + for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY]: 646 692 try: 647 693 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords", 648 694 headers={"Authorization": f"Bearer {session['accessJwt']}"}, ··· 650 696 resp.raise_for_status() 651 697 for rec in resp.json().get("records", []): 652 698 val = rec.get("value", {}) 653 - if coll == COLLECTION_MASL: 699 + if coll == COLLECTION_BUNDLE: 700 + url = val.get("url", "").lower() 701 + title = val.get("title", "").lower() 702 + elif coll == COLLECTION_MASL_LEGACY: 654 703 meta = val.get("systems.witchcraft.archive", {}) 655 704 url = meta.get("url", "").lower() 656 705 title = val.get("name", meta.get("title", "")).lower()