feat: detect leaflet platform from content.$type for custom domains

+7

backend/src/extractor.zig

··· 43 43 platform: Platform, 44 44 source_collection: []const u8, 45 45 path: ?[]const u8, // URL path from record (e.g., "/001" for zat.dev) 46 + content_type: ?[]const u8, // content.$type (e.g., "pub.leaflet.content") for platform detection 46 47 47 48 pub fn deinit(self: *ExtractedDocument) void { 48 49 self.allocator.free(self.content); ··· 103 104 // extract URL path (site.standard.document uses "path" field like "/001") 104 105 const path = zat.json.getString(record_val, "path"); 105 106 107 + // extract content.$type for platform detection (e.g., "pub.leaflet.content") 108 + const content_type = zat.json.getString(record_val, "content.$type"); 109 + 106 110 // extract tags - allocate owned slice 107 111 const tags = try extractTags(allocator, record_val); 108 112 errdefer allocator.free(tags); ··· 120 124 .platform = platform, 121 125 .source_collection = collection, 122 126 .path = path, 127 + .content_type = content_type, 123 128 }; 124 129 } 125 130 ··· 291 296 292 297 try std.testing.expectEqualStrings("Test Post", doc.title); 293 298 try std.testing.expectEqualStrings("Hello world", doc.content); 299 + // content_type should be extracted for platform detection (custom domain support) 300 + try std.testing.expectEqualStrings("pub.leaflet.content", doc.content_type.?); 294 301 }

+7

backend/src/indexer.zig

··· 13 13 platform: []const u8, 14 14 source_collection: []const u8, 15 15 path: ?[]const u8, 16 + content_type: ?[]const u8, 16 17 ) !void { 17 18 const c = db.getClient() orelse return error.NotInitialized; 18 19 ··· 108 109 actual_platform = "offprint"; 109 110 } else if (std.mem.indexOf(u8, base_path, "greengale.app") != null) { 110 111 actual_platform = "greengale"; 112 + } else if (content_type) |ct| { 113 + // fallback: detect platform from content.$type for custom domains 114 + // e.g., "pub.leaflet.content" indicates leaflet even with custom domain 115 + if (std.mem.startsWith(u8, ct, "pub.leaflet.")) { 116 + actual_platform = "leaflet"; 117 + } 111 118 } 112 119 } 113 120

+1

backend/src/tap.zig

··· 240 240 doc.platformName(), 241 241 doc.source_collection, 242 242 doc.path, 243 + doc.content_type, 243 244 ); 244 245 logfire.counter("tap.documents_indexed", 1); 245 246 }

+14 -5

docs/content-extraction.md

··· 67 67 68 68 ## platform detection 69 69 70 - collection name doesn't indicate platform for `site.standard.*` records. infer from publication `basePath`: 70 + collection name doesn't indicate platform for `site.standard.*` records. detection order: 71 + 72 + 1. **basePath** - infer from publication basePath: 71 73 72 74 | basePath contains | platform | 73 75 |-------------------|----------| ··· 75 77 | `pckt.blog` | pckt | 76 78 | `offprint.app` | offprint | 77 79 | `greengale.app` | greengale | 78 - | (none) | other | 80 + 81 + 2. **content.$type** - fallback for custom domains (e.g., `cailean.journal.ewancroft.uk`): 82 + 83 + | content.$type starts with | platform | 84 + |---------------------------|----------| 85 + | `pub.leaflet.` | leaflet | 86 + 87 + 3. if neither matches → `other` 79 88 80 89 ## summary 81 90 82 91 - **pckt/offprint/greengale**: use `textContent` directly 83 92 - **leaflet**: extract from `content.pages[].blocks[].block.plaintext` 84 93 - **deduplication**: `ON CONFLICT` on `(did, rkey)` or `uri` 85 - - **platform**: infer from publication basePath, not collection name 94 + - **platform**: infer from basePath, fallback to content.$type for custom domains 86 95 87 96 ## code references 88 97 89 - - `backend/src/extractor.zig` - content extraction logic 90 - - `backend/src/indexer.zig:99-112` - platform detection from basePath 98 + - `backend/src/extractor.zig` - content extraction logic, content_type field 99 + - `backend/src/indexer.zig:99-118` - platform detection from basePath + content_type

Configure Feed

Configure Feed