search for standard sites pub-search.waow.tech
search zig blog atproto
11
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: add path field for proper document URL building

site.standard.document records have a `path` field (e.g., "/001") that
should be used with the publication's base URL to build the full document
URL. Previously, the URL was incorrectly built from the rkey, which caused 404s.

- extract path field from records in extractor.zig
- store path in documents table via schema migration
- return path in search results
- frontend uses basePath + path when path is available

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

zzstoatzz 087e595a 7f49b5ea

+56 -17
+4
backend/src/db/schema.zig
··· 158 158 \\WHERE platform IN ('standardsite', 'unknown') 159 159 \\AND publication_uri IN (SELECT uri FROM publications WHERE base_path LIKE '%leaflet.pub%') 160 160 , &.{}) catch {}; 161 + 162 + // URL path field for documents (e.g., "/001" for zat.dev) 163 + // used to build full URL: publication.url + document.path 164 + client.exec("ALTER TABLE documents ADD COLUMN path TEXT", &.{}) catch {}; 161 165 }
+5
backend/src/extractor.zig
··· 45 45 tags: [][]const u8, 46 46 platform: Platform, 47 47 source_collection: []const u8, 48 + path: ?[]const u8, // URL path from record (e.g., "/001" for zat.dev) 48 49 49 50 pub fn deinit(self: *ExtractedDocument) void { 50 51 self.allocator.free(self.content); ··· 94 95 zat.json.getString(record_val, "site") orelse 95 96 zat.json.getString(record_val, "site.uri"); 96 97 98 + // extract URL path (site.standard.document uses "path" field like "/001") 99 + const path = zat.json.getString(record_val, "path"); 100 + 97 101 // extract tags - allocate owned slice 98 102 const tags = try extractTags(allocator, record_val); 99 103 errdefer allocator.free(tags); ··· 110 114 .tags = tags, 111 115 .platform = platform, 112 116 .source_collection = collection, 117 + .path = path, 113 118 }; 114 119 } 115 120
+3 -2
backend/src/indexer.zig
··· 12 12 tags: []const []const u8, 13 13 platform: []const u8, 14 14 source_collection: []const u8, 15 + path: ?[]const u8, 15 16 ) !void { 16 17 const c = db.getClient() orelse return error.NotInitialized; 17 18 ··· 31 32 } else |_| {} 32 33 33 34 try c.exec( 34 - "INSERT OR REPLACE INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", 35 - &.{ uri, did, rkey, title, content, created_at orelse "", publication_uri orelse "", platform, source_collection }, 35 + "INSERT OR REPLACE INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection, path) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", 36 + &.{ uri, did, rkey, title, content, created_at orelse "", publication_uri orelse "", platform, source_collection, path orelse "" }, 36 37 ); 37 38 38 39 // update FTS index
+8 -4
backend/src/search.zig
··· 16 16 rkey: []const u8, 17 17 basePath: []const u8, 18 18 platform: []const u8, 19 + path: []const u8 = "", // URL path from record (e.g., "/001") 19 20 }; 20 21 21 22 /// Document search result (internal) ··· 29 30 basePath: []const u8, 30 31 hasPublication: bool, 31 32 platform: []const u8, 33 + path: []const u8, 32 34 33 35 fn fromRow(row: db.Row) Doc { 34 36 return .{ ··· 41 43 .basePath = row.text(6), 42 44 .hasPublication = row.int(7) != 0, 43 45 .platform = row.text(8), 46 + .path = row.text(9), 44 47 }; 45 48 } 46 49 ··· 55 58 .rkey = self.rkey, 56 59 .basePath = self.basePath, 57 60 .platform = self.platform, 61 + .path = self.path, 58 62 }; 59 63 } 60 64 }; ··· 63 67 \\SELECT d.uri, d.did, d.title, '' as snippet, 64 68 \\ d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path, 65 69 \\ CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication, 66 - \\ d.platform 70 + \\ d.platform, COALESCE(d.path, '') as path 67 71 \\FROM documents d 68 72 \\LEFT JOIN publications p ON d.publication_uri = p.uri 69 73 \\JOIN document_tags dt ON d.uri = dt.document_uri ··· 76 80 \\ snippet(documents_fts, 2, '', '', '...', 32) as snippet, 77 81 \\ d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path, 78 82 \\ CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication, 79 - \\ d.platform 83 + \\ d.platform, COALESCE(d.path, '') as path 80 84 \\FROM documents_fts f 81 85 \\JOIN documents d ON f.uri = d.uri 82 86 \\LEFT JOIN publications p ON d.publication_uri = p.uri ··· 90 94 \\ snippet(documents_fts, 2, '', '', '...', 32) as snippet, 91 95 \\ d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path, 92 96 \\ CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication, 93 - \\ d.platform 97 + \\ d.platform, COALESCE(d.path, '') as path 94 98 \\FROM documents_fts f 95 99 \\JOIN documents d ON f.uri = d.uri 96 100 \\LEFT JOIN publications p ON d.publication_uri = p.uri ··· 226 230 \\SELECT d2.uri, d2.did, d2.title, '' 
as snippet, 227 231 \\ d2.created_at, d2.rkey, COALESCE(p.base_path, '') as base_path, 228 232 \\ CASE WHEN d2.publication_uri != '' THEN 1 ELSE 0 END as has_publication, 229 - \\ d2.platform 233 + \\ d2.platform, COALESCE(d2.path, '') as path 230 234 \\FROM documents d1, documents d2 231 235 \\LEFT JOIN publications p ON d2.publication_uri = p.uri 232 236 \\WHERE d1.uri = ?
+1
backend/src/tap.zig
··· 264 264 doc.tags, 265 265 doc.platformName(), 266 266 doc.source_collection, 267 + doc.path, 267 268 ); 268 269 std.debug.print("indexed document: {s} [{s}] ({} chars, {} tags)\n", .{ uri, doc.platformName(), doc.content.len, doc.tags.len }); 269 270 }
+29 -10
scripts/backfill-pds
··· 113 113 if not title: 114 114 return None 115 115 116 - # Get content - try multiple field names 117 - content = value.get("content") or value.get("text") or "" 116 + # Get content - try textContent (site.standard), then content/text 117 + content = value.get("textContent") or value.get("content") or value.get("text") or "" 118 118 if isinstance(content, dict): 119 119 # Handle richtext format 120 120 content = content.get("text", "") ··· 122 122 # Get created_at 123 123 created_at = value.get("createdAt", "") 124 124 125 - # Get publication reference 126 - publication = value.get("publication") 125 + # Get publication reference - try "publication" (leaflet) then "site" (site.standard) 126 + publication = value.get("publication") or value.get("site") 127 127 publication_uri = None 128 - if publication and isinstance(publication, dict): 129 - publication_uri = publication.get("uri") 128 + if publication: 129 + if isinstance(publication, dict): 130 + publication_uri = publication.get("uri") 131 + elif isinstance(publication, str): 132 + publication_uri = publication 133 + 134 + # Get URL path (site.standard.document uses "path" field like "/001") 135 + path = value.get("path") 130 136 131 137 # Get tags 132 138 tags = value.get("tags", []) ··· 150 156 "tags": tags, 151 157 "platform": platform, 152 158 "collection": collection, 159 + "path": path, 153 160 } 154 161 155 162 ··· 226 233 turso_exec( 227 234 settings, 228 235 """ 229 - INSERT INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection) 230 - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 236 + INSERT INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection, path) 237 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
231 238 ON CONFLICT(did, rkey) DO UPDATE SET 232 239 uri = excluded.uri, 233 240 title = excluded.title, ··· 235 242 created_at = excluded.created_at, 236 243 publication_uri = excluded.publication_uri, 237 244 platform = excluded.platform, 238 - source_collection = excluded.source_collection 245 + source_collection = excluded.source_collection, 246 + path = excluded.path 239 247 """, 240 - [uri, did, rkey, doc["title"], doc["content"], doc["created_at"], doc["publication_uri"], doc["platform"], doc["collection"]], 248 + [uri, did, rkey, doc["title"], doc["content"], doc["created_at"], doc["publication_uri"], doc["platform"], doc["collection"], doc["path"]], 241 249 ) 242 250 # Insert tags 243 251 for tag in doc["tags"]: ··· 260 268 value = record["value"] 261 269 name = value.get("name", "") 262 270 description = value.get("description") 271 + # base_path: try leaflet's "base_path", then strip scheme from site.standard's "url" 263 272 base_path = value.get("base_path") 273 + if not base_path: 274 + url = value.get("url") 275 + if url: 276 + # Strip https:// or http:// prefix 277 + if url.startswith("https://"): 278 + base_path = url[len("https://"):] 279 + elif url.startswith("http://"): 280 + base_path = url[len("http://"):] 281 + else: 282 + base_path = url 264 283 265 284 if args.dry_run: 266 285 print(f" would insert pub: {name}")
+6 -1
site/index.html
··· 512 512 if (entityType === 'publication') { 513 513 return doc.basePath ? `https://${doc.basePath}` : null; 514 514 } 515 - // documents: prefer basePath from publication 515 + // documents: prefer basePath + path (from record) over rkey 516 + if (doc.basePath && doc.path) { 517 + // path already includes leading slash (e.g., "/001") 518 + return `https://${doc.basePath}${doc.path}`; 519 + } 520 + // fallback to basePath + rkey (legacy pattern) 516 521 if (doc.basePath && doc.rkey) { 517 522 return `https://${doc.basePath}/${doc.rkey}`; 518 523 }