add extractor module for platform-agnostic content extraction

+23

backend/build.zig

··· 43 43 44 44 const run_step = b.step("run", "Run the server"); 45 45 run_step.dependOn(&run_cmd.step); 46 + 47 + // test step 48 + const test_step = b.step("test", "Run unit tests"); 49 + 50 + const test_files = [_][]const u8{ 51 + "src/search.zig", 52 + "src/extractor.zig", 53 + }; 54 + 55 + for (test_files) |file| { 56 + const unit_tests = b.addTest(.{ 57 + .root_module = b.createModule(.{ 58 + .root_source_file = b.path(file), 59 + .target = target, 60 + .optimize = optimize, 61 + .imports = &.{ 62 + .{ .name = "zat", .module = zat.module("zat") }, 63 + }, 64 + }), 65 + }); 66 + const run_tests = b.addRunArtifact(unit_tests); 67 + test_step.dependOn(&run_tests.step); 68 + } 46 69 }

+249

backend/src/extractor.zig

//! Platform-agnostic extraction of indexable document data from decoded JSON
//! records. The entry point is `extractDocument`.

const std = @import("std");
const mem = std.mem;
const json = std.json;
const Allocator = mem.Allocator;
const zat = @import("zat");

/// Detected platform from content.$type
pub const Platform = enum {
    leaflet,
    pckt,
    offprint,
    unknown,

    /// Map a `$type` string to a platform by its namespace prefix.
    /// Anything that matches no known prefix (including "") is `.unknown`.
    pub fn fromContentType(content_type: []const u8) Platform {
        if (mem.startsWith(u8, content_type, "pub.leaflet.")) return .leaflet;
        if (mem.startsWith(u8, content_type, "blog.pckt.")) return .pckt;
        if (mem.startsWith(u8, content_type, "app.offprint.")) return .offprint;
        return .unknown;
    }

    /// The enum tag name as a string ("leaflet", "pckt", "offprint", "unknown").
    pub fn name(self: Platform) []const u8 {
        return @tagName(self);
    }
};

/// Errors `extractDocument` can return.
pub const ExtractError = error{
    /// The record has no string `title` field.
    MissingTitle,
    /// No text could be extracted from the record.
    NoContent,
} || Allocator.Error;

/// Extracted document data ready for indexing.
///
/// Ownership: `deinit()` frees exactly two things, `content` and the `tags`
/// slice itself. Nothing else is freed here:
///   - the individual tag strings point into the source record,
///   - `source_collection` is the caller's `collection` slice, stored as-is,
///   - `title`, `created_at` and `publication_uri` are whatever
///     `zat.json.getString` returned (presumably slices into the source
///     record -- confirm against zat).
/// The source record and `collection` must therefore outlive this struct.
pub const ExtractedDocument = struct {
    allocator: Allocator,
    title: []const u8,
    content: []u8,
    created_at: ?[]const u8,
    publication_uri: ?[]const u8,
    tags: [][]const u8,
    platform: Platform,
    source_collection: []const u8,

    /// Release `content` and the `tags` slice. The struct must not be used
    /// afterwards.
    pub fn deinit(self: *ExtractedDocument) void {
        self.allocator.free(self.content);
        // `tags` may be the static empty slice returned by extractTags;
        // freeing a zero-length slice is a no-op, so this is safe.
        self.allocator.free(self.tags);
        self.* = undefined;
    }

    /// Platform name as string (for DB storage)
    pub fn platformName(self: ExtractedDocument) []const u8 {
        return self.platform.name();
    }
};

/// Block types that have a plaintext field
const plaintext_blocks = std.StaticStringMap(void).initComptime(.{
    .{ "pub.leaflet.blocks.text", {} },
    .{ "pub.leaflet.blocks.header", {} },
    .{ "pub.leaflet.blocks.blockquote", {} },
    .{ "pub.leaflet.blocks.code", {} },
});

/// Detect platform from the record's `content.$type` field.
/// Returns `.unknown` when `content` is missing or not an object, or when
/// `$type` is missing or not a string.
pub fn detectPlatform(record: json.ObjectMap) Platform {
    const content = record.get("content") orelse return .unknown;
    if (content != .object) return .unknown;

    const type_val = content.object.get("$type") orelse return .unknown;
    if (type_val != .string) return .unknown;

    return Platform.fromContentType(type_val.string);
}

/// Extract document content from a record.
///
/// `collection` is stored in the result without copying. Caller owns the
/// returned ExtractedDocument and must call deinit(); see ExtractedDocument
/// for which fields are borrowed.
///
/// Errors: `error.MissingTitle` when there is no `title`, `error.NoContent`
/// when no text could be extracted, `error.OutOfMemory` on allocation failure.
pub fn extractDocument(
    allocator: Allocator,
    record: json.ObjectMap,
    collection: []const u8,
) ExtractError!ExtractedDocument {
    const record_val: json.Value = .{ .object = record };
    const platform = detectPlatform(record);

    // required field -- checked before anything is allocated
    const title = zat.json.getString(record_val, "title") orelse return error.MissingTitle;

    // optional fields: publishedAt wins over createdAt
    const created_at = zat.json.getString(record_val, "publishedAt") orelse
        zat.json.getString(record_val, "createdAt");
    // site.standard.document uses "site" where leaflet uses "publication"
    const publication_uri = zat.json.getString(record_val, "publication") orelse
        zat.json.getString(record_val, "site");

    // tags: owned slice of borrowed strings
    const tags = try extractTags(allocator, record_val);
    errdefer allocator.free(tags);

    // content: textContent first (site.standard.document), then block parsing
    const content = try extractContent(allocator, record_val);

    return .{
        .allocator = allocator,
        .title = title,
        .content = content,
        .created_at = created_at,
        .publication_uri = publication_uri,
        .tags = tags,
        .platform = platform,
        .source_collection = collection,
    };
}

/// Collect the string entries of the record's `tags` array.
/// Non-string entries are skipped. Returns a static empty slice (no
/// allocation) when there are no string tags; otherwise the caller frees the
/// returned slice, whose elements point into `record`.
fn extractTags(allocator: Allocator, record: json.Value) Allocator.Error![][]const u8 {
    const tags_array = zat.json.getArray(record, "tags") orelse return &.{};

    // first pass: size the allocation exactly
    var count: usize = 0;
    for (tags_array) |item| {
        if (item == .string) count += 1;
    }
    if (count == 0) return &.{};

    // second pass: fill
    const tags = try allocator.alloc([]const u8, count);
    var i: usize = 0;
    for (tags_array) |item| {
        if (item == .string) {
            tags[i] = item.string;
            i += 1;
        }
    }
    return tags;
}

/// Build the plain-text body for a record. Caller owns the returned slice.
///
/// A non-empty `textContent` is used verbatim. Otherwise `description`
/// followed by the text of every page block is concatenated. An empty
/// `textContent` is treated the same as a missing one, so such a record
/// still goes through the fallback (and can yield `error.NoContent`) instead
/// of producing a zero-length document.
fn extractContent(allocator: Allocator, record: json.Value) (error{NoContent} || Allocator.Error)![]u8 {
    // site.standard.document has this pre-flattened
    if (zat.json.getString(record, "textContent")) |text| {
        if (text.len > 0) return try allocator.dupe(u8, text);
    }

    var buf: std.ArrayList(u8) = .empty;
    errdefer buf.deinit(allocator);

    // fall back to leaflet-style block parsing
    if (zat.json.getString(record, "description")) |desc| {
        try buf.appendSlice(allocator, desc);
    }

    if (zat.json.getArray(record, "pages")) |pages| {
        for (pages) |page| {
            if (page == .object) {
                try extractPageContent(allocator, &buf, page.object);
            }
        }
    }

    if (buf.items.len == 0) return error.NoContent;
    return try buf.toOwnedSlice(allocator);
}

/// Append the text of every `{ "block": {...} }` wrapper in the page's
/// `blocks` array. Pages without a `blocks` array contribute nothing.
fn extractPageContent(allocator: Allocator, buf: *std.ArrayList(u8), page: json.ObjectMap) Allocator.Error!void {
    const blocks_val = page.get("blocks") orelse return;
    if (blocks_val != .array) return;

    for (blocks_val.array.items) |wrapper| {
        if (wrapper != .object) continue;
        const block_val = wrapper.object.get("block") orelse continue;
        if (block_val != .object) continue;

        try extractBlockText(allocator, buf, block_val.object);
    }
}

/// Append the text carried by a single block, dispatching on its `$type`.
/// Block types not handled below are ignored.
fn extractBlockText(allocator: Allocator, buf: *std.ArrayList(u8), block: json.ObjectMap) Allocator.Error!void {
    const type_val = block.get("$type") orelse return;
    if (type_val != .string) return;
    const block_type = type_val.string;

    // blocks with plaintext field
    if (plaintext_blocks.has(block_type)) {
        try appendTextField(allocator, buf, block, "plaintext");
    }
    // button has text field
    else if (mem.eql(u8, block_type, "pub.leaflet.blocks.button")) {
        try appendTextField(allocator, buf, block, "text");
    }
    // list with nested children
    else if (mem.eql(u8, block_type, "pub.leaflet.blocks.unorderedList")) {
        try extractListContent(allocator, buf, block);
    }
}

/// Append `obj[field]` to `buf` when it is a non-empty string, inserting a
/// single space first if `buf` already holds text.
fn appendTextField(allocator: Allocator, buf: *std.ArrayList(u8), obj: json.ObjectMap, field: []const u8) Allocator.Error!void {
    const val = obj.get(field) orelse return;
    if (val != .string) return;
    if (val.string.len == 0) return;

    if (buf.items.len > 0) try buf.append(allocator, ' ');
    try buf.appendSlice(allocator, val.string);
}

/// Append the text of every item in a list block's `children` array.
fn extractListContent(allocator: Allocator, buf: *std.ArrayList(u8), block: json.ObjectMap) Allocator.Error!void {
    const children = block.get("children") orelse return;
    if (children != .array) return;

    for (children.array.items) |child| {
        try extractListItem(allocator, buf, child);
    }
}

/// Append a list item's own `content.plaintext`, then recurse into its
/// nested `children`. Non-object items are ignored.
fn extractListItem(allocator: Allocator, buf: *std.ArrayList(u8), item: json.Value) Allocator.Error!void {
    if (item != .object) return;

    // list item content
    if (item.object.get("content")) |content| {
        if (content == .object) {
            try appendTextField(allocator, buf, content.object, "plaintext");
        }
    }

    // nested children (recursive)
    if (item.object.get("children")) |children| {
        if (children == .array) {
            for (children.array.items) |child| {
                try extractListItem(allocator, buf, child);
            }
        }
    }
}

// --- tests ---

test "Platform.fromContentType: leaflet" {
    try std.testing.expectEqual(Platform.leaflet, Platform.fromContentType("pub.leaflet.content"));
    try std.testing.expectEqual(Platform.leaflet, Platform.fromContentType("pub.leaflet.blocks.text"));
}

test "Platform.fromContentType: pckt" {
    try std.testing.expectEqual(Platform.pckt, Platform.fromContentType("blog.pckt.content"));
    try std.testing.expectEqual(Platform.pckt, Platform.fromContentType("blog.pckt.blocks.whatever"));
}

test "Platform.fromContentType: offprint" {
    try std.testing.expectEqual(Platform.offprint, Platform.fromContentType("app.offprint.content"));
}

test "Platform.fromContentType: unknown" {
    try std.testing.expectEqual(Platform.unknown, Platform.fromContentType("something.else"));
    try std.testing.expectEqual(Platform.unknown, Platform.fromContentType(""));
}

test "Platform.name" {
    try std.testing.expectEqualStrings("leaflet", Platform.leaflet.name());
    try std.testing.expectEqualStrings("pckt", Platform.pckt.name());
    try std.testing.expectEqualStrings("offprint", Platform.offprint.name());
    try std.testing.expectEqualStrings("unknown", Platform.unknown.name());
}

test "detectPlatform: reads content.$type" {
    const parsed = try json.parseFromSlice(
        json.Value,
        std.testing.allocator,
        "{\"content\":{\"$type\":\"blog.pckt.content\"}}",
        .{},
    );
    defer parsed.deinit();

    try std.testing.expectEqual(Platform.pckt, detectPlatform(parsed.value.object));
}

test "detectPlatform: missing or malformed content is unknown" {
    const parsed = try json.parseFromSlice(
        json.Value,
        std.testing.allocator,
        "{\"content\":\"not an object\"}",
        .{},
    );
    defer parsed.deinit();

    try std.testing.expectEqual(Platform.unknown, detectPlatform(parsed.value.object));
}

test "extractDocument: empty textContent falls back to description" {
    const parsed = try json.parseFromSlice(
        json.Value,
        std.testing.allocator,
        "{\"title\":\"t\",\"textContent\":\"\",\"description\":\"hello\"}",
        .{},
    );
    defer parsed.deinit();

    var doc = try extractDocument(std.testing.allocator, parsed.value.object, "site.standard.document");
    defer doc.deinit();

    try std.testing.expectEqualStrings("hello", doc.content);
    try std.testing.expectEqual(@as(usize, 0), doc.tags.len);
}

test "extractDocument: empty textContent and nothing else is NoContent" {
    const parsed = try json.parseFromSlice(
        json.Value,
        std.testing.allocator,
        "{\"title\":\"t\",\"textContent\":\"\"}",
        .{},
    );
    defer parsed.deinit();

    try std.testing.expectError(
        error.NoContent,
        extractDocument(std.testing.allocator, parsed.value.object, "site.standard.document"),
    );
}

+4 -2

backend/src/indexer.zig

··· 10 10 created_at: ?[]const u8, 11 11 publication_uri: ?[]const u8, 12 12 tags: []const []const u8, 13 + platform: []const u8, 14 + source_collection: []const u8, 13 15 ) !void { 14 16 const c = db.getClient() orelse return error.NotInitialized; 15 17 16 18 try c.exec( 17 - "INSERT OR REPLACE INTO documents (uri, did, rkey, title, content, created_at, publication_uri) VALUES (?, ?, ?, ?, ?, ?, ?)", 18 - &.{ uri, did, rkey, title, content, created_at orelse "", publication_uri orelse "" }, 19 + "INSERT OR REPLACE INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", 20 + &.{ uri, did, rkey, title, content, created_at orelse "", publication_uri orelse "", platform, source_collection }, 19 21 ); 20 22 21 23 // update FTS index

+48

backend/src/search.zig

··· 344 344 buf[pos] = '*'; 345 345 return buf; 346 346 } 347 + 348 + // --- tests --- 349 + 350 + test "buildFtsQuery: empty string" { 351 + const result = try buildFtsQuery(std.testing.allocator, ""); 352 + try std.testing.expectEqualStrings("", result); 353 + } 354 + 355 + test "buildFtsQuery: whitespace only" { 356 + const result = try buildFtsQuery(std.testing.allocator, " "); 357 + try std.testing.expectEqualStrings("", result); 358 + } 359 + 360 + test "buildFtsQuery: single word" { 361 + const result = try buildFtsQuery(std.testing.allocator, "hello"); 362 + defer std.testing.allocator.free(result); 363 + try std.testing.expectEqualStrings("hello*", result); 364 + } 365 + 366 + test "buildFtsQuery: single word with whitespace" { 367 + const result = try buildFtsQuery(std.testing.allocator, " hello "); 368 + defer std.testing.allocator.free(result); 369 + try std.testing.expectEqualStrings("hello*", result); 370 + } 371 + 372 + test "buildFtsQuery: multiple words" { 373 + const result = try buildFtsQuery(std.testing.allocator, "cat dog"); 374 + defer std.testing.allocator.free(result); 375 + try std.testing.expectEqualStrings("cat OR dog*", result); 376 + } 377 + 378 + test "buildFtsQuery: three words" { 379 + const result = try buildFtsQuery(std.testing.allocator, "one two three"); 380 + defer std.testing.allocator.free(result); 381 + try std.testing.expectEqualStrings("one OR two OR three*", result); 382 + } 383 + 384 + test "buildFtsQuery: quoted phrase passthrough" { 385 + const result = try buildFtsQuery(std.testing.allocator, "\"exact phrase\""); 386 + defer std.testing.allocator.free(result); 387 + try std.testing.expectEqualStrings("\"exact phrase\"", result); 388 + } 389 + 390 + test "buildFtsQuery: dots as separators" { 391 + const result = try buildFtsQuery(std.testing.allocator, "foo.bar"); 392 + defer std.testing.allocator.free(result); 393 + try std.testing.expectEqualStrings("foo OR bar*", result); 394 + }

+22 -129

backend/src/tap.zig

··· 6 6 const websocket = @import("websocket"); 7 7 const zat = @import("zat"); 8 8 const indexer = @import("indexer.zig"); 9 + const extractor = @import("extractor.zig"); 9 10 10 11 const DOCUMENT_COLLECTION = "pub.leaflet.document"; 11 12 const PUBLICATION_COLLECTION = "pub.leaflet.publication"; ··· 105 106 rkey: []const u8, 106 107 }; 107 108 108 - /// Leaflet document fields 109 - const LeafletDocument = struct { 110 - title: []const u8, 111 - publication: ?[]const u8 = null, 112 - publishedAt: ?[]const u8 = null, 113 - createdAt: ?[]const u8 = null, 114 - description: ?[]const u8 = null, 115 - }; 116 - 117 109 /// Leaflet publication fields 118 110 const LeafletPublication = struct { 119 111 name: []const u8, ··· 144 136 const record_obj = zat.json.getObject(parsed.value, "record.record") orelse return; 145 137 146 138 if (mem.eql(u8, rec.collection, DOCUMENT_COLLECTION)) { 147 - processDocument(allocator, uri, did.raw, rec.rkey, record_obj) catch |err| { 139 + processDocument(allocator, uri, did.raw, rec.rkey, record_obj, rec.collection) catch |err| { 148 140 std.debug.print("document processing error: {}\n", .{err}); 149 141 }; 150 142 } else if (mem.eql(u8, rec.collection, PUBLICATION_COLLECTION)) { ··· 165 157 } 166 158 } 167 159 168 - fn processDocument(allocator: Allocator, uri: []const u8, did: []const u8, rkey: []const u8, record: json.ObjectMap) !void { 169 - const record_val: json.Value = .{ .object = record }; 170 - 171 - // extract known fields via struct 172 - const doc = zat.json.extractAt(LeafletDocument, allocator, record_val, .{}) catch return; 173 - const created_at = doc.publishedAt orelse doc.createdAt; 174 - 175 - // extract tags array 176 - var tags_list: std.ArrayList([]const u8) = .{}; 177 - defer tags_list.deinit(allocator); 178 - if (zat.json.getArray(record_val, "tags")) |tags| { 179 - for (tags) |tag_item| { 180 - if (tag_item == .string) { 181 - try tags_list.append(allocator, tag_item.string); 182 - } 183 - } 184 - } 185 - 186 - 
// extract plaintext from pages 187 - var content_buf: std.ArrayList(u8) = .{}; 188 - defer content_buf.deinit(allocator); 189 - 190 - if (doc.description) |desc| { 191 - if (desc.len > 0) { 192 - try content_buf.appendSlice(allocator, desc); 193 - } 194 - } 195 - 196 - if (zat.json.getArray(record_val, "pages")) |pages| { 197 - for (pages) |page| { 198 - if (page == .object) { 199 - try extractPlaintextFromPage(allocator, &content_buf, page.object); 200 - } 160 + fn processDocument(allocator: Allocator, uri: []const u8, did: []const u8, rkey: []const u8, record: json.ObjectMap, collection: []const u8) !void { 161 + var doc = extractor.extractDocument(allocator, record, collection) catch |err| { 162 + if (err != error.NoContent and err != error.MissingTitle) { 163 + std.debug.print("extraction error for {s}: {}\n", .{ uri, err }); 201 164 } 202 - } 203 - 204 - if (content_buf.items.len == 0) return; 205 - 206 - try indexer.insertDocument(uri, did, rkey, doc.title, content_buf.items, created_at, doc.publication, tags_list.items); 207 - std.debug.print("indexed document: {s} ({} chars, {} tags)\n", .{ uri, content_buf.items.len, tags_list.items.len }); 208 - } 165 + return; 166 + }; 167 + defer doc.deinit(); 209 168 210 - fn extractPlaintextFromPage(allocator: Allocator, buf: *std.ArrayList(u8), page: json.ObjectMap) !void { 211 - // pages can be linearDocument or canvas 212 - // linearDocument has blocks array 213 - const blocks_val = page.get("blocks") orelse return; 214 - if (blocks_val != .array) return; 215 - 216 - for (blocks_val.array.items) |block_wrapper| { 217 - if (block_wrapper != .object) continue; 218 - 219 - // block wrapper has "block" field with actual content 220 - const block_val = block_wrapper.object.get("block") orelse continue; 221 - if (block_val != .object) continue; 222 - 223 - try extractTextFromBlock(allocator, buf, block_val.object); 224 - } 225 - } 226 - 227 - fn extractTextFromBlock(allocator: Allocator, buf: *std.ArrayList(u8), block: 
json.ObjectMap) Allocator.Error!void { 228 - const type_val = block.get("$type") orelse return; 229 - if (type_val != .string) return; 230 - 231 - const block_type = type_val.string; 232 - 233 - // blocks with plaintext field: text, header, blockquote, code 234 - if (mem.eql(u8, block_type, "pub.leaflet.blocks.text") or 235 - mem.eql(u8, block_type, "pub.leaflet.blocks.header") or 236 - mem.eql(u8, block_type, "pub.leaflet.blocks.blockquote") or 237 - mem.eql(u8, block_type, "pub.leaflet.blocks.code")) 238 - { 239 - if (block.get("plaintext")) |plaintext_val| { 240 - if (plaintext_val == .string) { 241 - if (buf.items.len > 0) { 242 - try buf.appendSlice(allocator, " "); 243 - } 244 - try buf.appendSlice(allocator, plaintext_val.string); 245 - } 246 - } 247 - } 248 - // button has text field 249 - else if (mem.eql(u8, block_type, "pub.leaflet.blocks.button")) { 250 - if (block.get("text")) |text_val| { 251 - if (text_val == .string) { 252 - if (buf.items.len > 0) { 253 - try buf.appendSlice(allocator, " "); 254 - } 255 - try buf.appendSlice(allocator, text_val.string); 256 - } 257 - } 258 - } 259 - // unorderedList has children array with nested content 260 - else if (mem.eql(u8, block_type, "pub.leaflet.blocks.unorderedList")) { 261 - if (block.get("children")) |children_val| { 262 - if (children_val == .array) { 263 - for (children_val.array.items) |child| { 264 - try extractListItemText(allocator, buf, child); 265 - } 266 - } 267 - } 268 - } 269 - } 270 - 271 - fn extractListItemText(allocator: Allocator, buf: *std.ArrayList(u8), item: json.Value) Allocator.Error!void { 272 - if (item != .object) return; 273 - 274 - // list item has content field which is a block 275 - if (item.object.get("content")) |content_val| { 276 - if (content_val == .object) { 277 - try extractTextFromBlock(allocator, buf, content_val.object); 278 - } 279 - } 280 - 281 - // nested children 282 - if (item.object.get("children")) |children_val| { 283 - if (children_val == .array) { 284 - 
for (children_val.array.items) |child| { 285 - try extractListItemText(allocator, buf, child); 286 - } 287 - } 288 - } 169 + try indexer.insertDocument( 170 + uri, 171 + did, 172 + rkey, 173 + doc.title, 174 + doc.content, 175 + doc.created_at, 176 + doc.publication_uri, 177 + doc.tags, 178 + doc.platformName(), 179 + doc.source_collection, 180 + ); 181 + std.debug.print("indexed document: {s} [{s}] ({} chars, {} tags)\n", .{ uri, doc.platformName(), doc.content.len, doc.tags.len }); 289 182 } 290 183 291 184 fn processPublication(allocator: Allocator, uri: []const u8, did: []const u8, rkey: []const u8, record: json.ObjectMap) !void {

Configure Feed

Configure Feed