fix: hash AT-URIs to 32-char hex IDs for turbopuffer · zzstoatzz.io/leaflet-search@8803886

+7 -1

backend/src/ingest/embedder.zig

··· 145 145 var tpuf_docs = try allocator.alloc(tpuf.VectorDoc, docs.items.len); 146 146 defer allocator.free(tpuf_docs); 147 147 148 + // pre-compute hashed IDs (tpuf has 64-byte ID limit, AT-URIs are longer) 149 + var hashed_ids = try allocator.alloc([32]u8, docs.items.len); 150 + defer allocator.free(hashed_ids); 151 + 148 152 for (docs.items, embeddings, 0..) |doc, embedding, i| { 153 + hashed_ids[i] = tpuf.hashId(doc.uri); 149 154 tpuf_docs[i] = .{ 150 - .id = doc.uri, 155 + .id = &hashed_ids[i], 151 156 .vector = embedding, 157 + .uri = doc.uri, 152 158 .title = doc.title, 153 159 .did = doc.did, 154 160 .created_at = doc.created_at,

+7 -3

backend/src/search.zig

··· 591 591 /// 2. ANN nearest-neighbor query (~50ms) 592 592 /// 3. Filter out source URI, serialize results 593 593 pub fn findSimilar(alloc: Allocator, uri: []const u8, limit: usize) ![]const u8 { 594 + // hash URI to tpuf ID format (AT-URIs exceed tpuf's 64-byte limit) 595 + const hashed = tpuf.hashId(uri); 596 + 594 597 // get source document's vector 595 - const vector = tpuf.getVectorById(alloc, uri) catch |err| { 598 + const vector = tpuf.getVectorById(alloc, &hashed) catch |err| { 596 599 logfire.warn("similar: getVectorById failed for {s}: {}", .{ uri, err }); 597 600 return error.VectorNotFound; 598 601 }; ··· 606 609 defer { 607 610 for (results) |r| { 608 611 alloc.free(r.id); 612 + alloc.free(r.uri); 609 613 alloc.free(r.title); 610 614 alloc.free(r.did); 611 615 alloc.free(r.created_at); ··· 625 629 try jw.beginArray(); 626 630 var count: usize = 0; 627 631 for (results) |r| { 628 - if (std.mem.eql(u8, r.id, uri)) continue; 632 + if (std.mem.eql(u8, r.uri, uri)) continue; 629 633 if (count >= limit) break; 630 634 try jw.write(SearchResultJson{ 631 635 .type = if (r.has_publication) "article" else "looseleaf", 632 - .uri = r.id, 636 + .uri = r.uri, 633 637 .did = r.did, 634 638 .title = r.title, 635 639 .snippet = "",

+23 -1

backend/src/tpuf.zig

··· 16 16 const Allocator = mem.Allocator; 17 17 const logfire = @import("logfire"); 18 18 19 + const Sha256 = std.crypto.hash.sha2.Sha256; 20 + 19 21 const API_BASE = "https://api.turbopuffer.com/v2/namespaces/"; 20 22 21 23 var api_key: ?[]const u8 = null; ··· 31 33 /// Fields mirror the SearchResultJson output so query results 32 34 /// can be returned directly without a DB roundtrip. 33 35 pub const VectorDoc = struct { 34 - id: []const u8, // AT-URI (used as turbopuffer document ID) 36 + id: []const u8, // hashed ID for tpuf (via hashId) 35 37 vector: []const f32, // embedding (voyage-3-lite, 512 dims) 38 + uri: []const u8, // full AT-URI (stored as metadata) 36 39 title: []const u8, 37 40 did: []const u8, 38 41 created_at: []const u8, ··· 47 50 pub const QueryResult = struct { 48 51 id: []const u8, 49 52 dist: f64, 53 + uri: []const u8, 50 54 title: []const u8, 51 55 did: []const u8, 52 56 created_at: []const u8, ··· 86 90 return api_key != null; 87 91 } 88 92 93 + /// Hash a URI to a tpuf-safe ID (max 64 bytes). 94 + /// Uses first 32 hex chars of SHA256 (128 bits — no collisions at our scale). 95 + pub fn hashId(uri: []const u8) [32]u8 { 96 + const hex_chars = "0123456789abcdef"; 97 + var digest: [32]u8 = undefined; 98 + Sha256.hash(uri, &digest, .{}); 99 + var hex: [32]u8 = undefined; 100 + for (digest[0..16], 0..) |byte, i| { 101 + hex[i * 2] = hex_chars[byte >> 4]; 102 + hex[i * 2 + 1] = hex_chars[byte & 0xf]; 103 + } 104 + return hex; 105 + } 106 + 89 107 /// Upsert document vectors with metadata. Creates the namespace on first write. 90 108 /// Errors are logged but should not be fatal — the system works without vector search. 91 109 pub fn upsert(allocator: Allocator, docs: []const VectorDoc) !void { ··· 183 201 for (doc.vector) |v| try jw.write(v); 184 202 try jw.endArray(); 185 203 204 + try jw.objectField("uri"); 205 + try jw.write(doc.uri); 186 206 try jw.objectField("title"); 187 207 try jw.write(doc.title); 188 208 try jw.objectField("did"); ··· 232 252 try jw.objectField("include_attributes"); 233 253 try jw.beginArray(); 234 254 for ([_][]const u8{ 255 + "uri", 235 256 "title", 236 257 "did", 237 258 "created_at", ··· 333 354 results[count] = .{ 334 355 .id = try allocator.dupe(u8, jsonStr(obj, "id")), 335 356 .dist = jsonFloat(obj, "$dist"), 357 + .uri = try allocator.dupe(u8, jsonStr(obj, "uri")), 336 358 .title = try allocator.dupe(u8, jsonStr(obj, "title")), 337 359 .did = try allocator.dupe(u8, jsonStr(obj, "did")), 338 360 .created_at = try allocator.dupe(u8, jsonStr(obj, "created_at")),

Configure Feed

Configure Feed