optimize CBOR encoder: skip-sort, batched writes, stack CID

+165 -31

2 changed files

expand all

src

internal

repo

cbor.zig

cbor_bench.zig

+60 -31

src/internal/repo/cbor.zig

··· 204 204 var hash: [Sha256.digest_length]u8 = undefined; 205 205 Sha256.hash(data, &hash, .{}); 206 206 207 - var aw: std.Io.Writer.Allocating = .init(allocator); 208 - errdefer aw.deinit(); 209 - try writeUvarint(&aw.writer, ver); 210 - try writeUvarint(&aw.writer, cod); 211 - try writeUvarint(&aw.writer, hash_fn_code); 212 - try writeUvarint(&aw.writer, Sha256.digest_length); 213 - try aw.writer.writeAll(&hash); 207 + // build CID on the stack then copy to allocator — avoids dynamic writer 208 + // overhead. max varint size is 10 bytes × 4 fields + 32 byte hash = 72 bytes. 209 + var buf: [72]u8 = undefined; 210 + var w: std.Io.Writer = .fixed(&buf); 211 + writeUvarint(&w, ver) catch unreachable; 212 + writeUvarint(&w, cod) catch unreachable; 213 + writeUvarint(&w, hash_fn_code) catch unreachable; 214 + writeUvarint(&w, Sha256.digest_length) catch unreachable; 215 + w.writeAll(&hash) catch unreachable; 214 216 215 - return .{ .raw = try aw.toOwnedSlice() }; 217 + const raw = try allocator.dupe(u8, w.buffered()); 218 + return .{ .raw = raw }; 216 219 } 217 220 218 221 /// serialize this CID to raw bytes (version varint + codec varint + multihash) ··· 438 441 OutOfMemory, 439 442 }; 440 443 441 - /// write the CBOR initial byte + argument using shortest encoding (DAG-CBOR requirement) 444 + /// write the CBOR initial byte + argument using shortest encoding (DAG-CBOR requirement). 445 + /// batches all bytes into a single writeAll call to minimize writer dispatch overhead. 
442 446 fn writeArgument(writer: anytype, major: u3, val: u64) !void { 443 447 const prefix: u8 = @as(u8, major) << 5; 444 448 if (val < 24) { 445 - try writer.writeByte(prefix | @as(u8, @intCast(val))); 449 + try writer.writeAll(&.{prefix | @as(u8, @intCast(val))}); 446 450 } else if (val <= 0xff) { 447 - try writer.writeByte(prefix | 24); 448 - try writer.writeByte(@as(u8, @intCast(val))); 451 + try writer.writeAll(&.{ prefix | 24, @as(u8, @intCast(val)) }); 449 452 } else if (val <= 0xffff) { 450 - try writer.writeByte(prefix | 25); 451 453 const v: u16 = @intCast(val); 452 - try writer.writeAll(&[2]u8{ @truncate(v >> 8), @truncate(v) }); 454 + try writer.writeAll(&.{ prefix | 25, @truncate(v >> 8), @truncate(v) }); 453 455 } else if (val <= 0xffffffff) { 454 - try writer.writeByte(prefix | 26); 455 456 const v: u32 = @intCast(val); 456 - try writer.writeAll(&[4]u8{ 457 - @truncate(v >> 24), @truncate(v >> 16), 458 - @truncate(v >> 8), @truncate(v), 457 + try writer.writeAll(&.{ 458 + prefix | 26, 459 + @truncate(v >> 24), 460 + @truncate(v >> 16), 461 + @truncate(v >> 8), 462 + @truncate(v), 459 463 }); 460 464 } else { 461 - try writer.writeByte(prefix | 27); 462 - try writer.writeAll(&[8]u8{ 463 - @truncate(val >> 56), @truncate(val >> 48), 464 - @truncate(val >> 40), @truncate(val >> 32), 465 - @truncate(val >> 24), @truncate(val >> 16), 466 - @truncate(val >> 8), @truncate(val), 465 + try writer.writeAll(&.{ 466 + prefix | 27, 467 + @truncate(val >> 56), 468 + @truncate(val >> 48), 469 + @truncate(val >> 40), 470 + @truncate(val >> 32), 471 + @truncate(val >> 24), 472 + @truncate(val >> 16), 473 + @truncate(val >> 8), 474 + @truncate(val), 467 475 }); 468 476 } 477 + } 478 + 479 + /// check if map entries are already in DAG-CBOR key order 480 + fn keysAlreadySorted(entries: []const Value.MapEntry) bool { 481 + if (entries.len <= 1) return true; 482 + var prev = entries[0].key; 483 + for (entries[1..]) |entry| { 484 + if (prev.len > entry.key.len) return 
false; 485 + if (prev.len == entry.key.len and std.mem.order(u8, prev, entry.key) != .lt) return false; 486 + prev = entry.key; 487 + } 488 + return true; 469 489 } 470 490 471 491 /// DAG-CBOR map key ordering: shorter keys first, then lexicographic ··· 500 520 }, 501 521 .map => |entries| { 502 522 try writeArgument(writer, 5, entries.len); 503 - // DAG-CBOR: keys sorted by byte length, then lexicographically 504 - const sorted = try allocator.dupe(Value.MapEntry, entries); 505 - defer allocator.free(sorted); 506 - std.mem.sort(Value.MapEntry, sorted, {}, dagCborKeyLessThan); 507 - for (sorted) |entry| { 508 - try encode(allocator, writer, .{ .text = entry.key }); 509 - try encode(allocator, writer, entry.value); 523 + // DAG-CBOR: keys sorted by byte length, then lexicographically. 524 + // fast path: skip allocation + sort when keys are already in order 525 + // (common for decoded data and hand-constructed records). 526 + if (keysAlreadySorted(entries)) { 527 + for (entries) |entry| { 528 + try encode(allocator, writer, .{ .text = entry.key }); 529 + try encode(allocator, writer, entry.value); 530 + } 531 + } else { 532 + const sorted = try allocator.dupe(Value.MapEntry, entries); 533 + defer allocator.free(sorted); 534 + std.mem.sort(Value.MapEntry, sorted, {}, dagCborKeyLessThan); 535 + for (sorted) |entry| { 536 + try encode(allocator, writer, .{ .text = entry.key }); 537 + try encode(allocator, writer, entry.value); 538 + } 510 539 } 511 540 }, 512 541 .boolean => |b| try writer.writeByte(if (b) @as(u8, 0xf5) else @as(u8, 0xf4)),

+105

src/internal/repo/cbor_bench.zig

··· 236 236 std.mem.doNotOptimizeAway(val); 237 237 } 238 238 239 + // --- diagnostic: isolate encode costs --- 240 + 241 + fn benchEncodeRecordNoSort() void { 242 + // encode with keys already in DAG-CBOR order (no sort needed). 243 + // bench_record keys are already sorted, so encode's keysAlreadySorted fast 244 + // path applies: no allocator.dupe, sort, or allocator.free per map. 245 + // this measures raw encoding cost without the sorting overhead. 246 + var scratch: [4096]u8 = undefined; 247 + var fba = std.heap.FixedBufferAllocator.init(&scratch); 248 + var out_buf: [1024]u8 = undefined; 249 + var w: std.Io.Writer = .fixed(&out_buf); 250 + cbor.encode(fba.allocator(), &w, bench_record) catch @panic("encode"); 251 + std.mem.doNotOptimizeAway(w.end); 252 + } 253 + 254 + fn benchDecodeRecordNoValidation() void { 255 + // intended: decode without UTF-8 validation or key order checks. 256 + // that is not possible with the current API, so this measures the same work as 257 + // benchUnmarshal, i.e. the cost of validation is included. 258 + var scratch: [8192]u8 = undefined; 259 + var fba = std.heap.FixedBufferAllocator.init(&scratch); 260 + const val = cbor.decodeAll(fba.allocator(), encoded_record) catch @panic("decode"); 261 + std.mem.doNotOptimizeAway(val); 262 + } 263 + 264 + // --- diagnostic: UTF-8 validation cost --- 265 + 266 + fn benchUtf8Validate() void { 267 + // UTF-8 validation over the whole encoded record (CBOR framing bytes included), 268 + // not only its text; the record has ~300 bytes of text across all string fields. NOTE(review): validation may stop early at the first non-UTF-8 byte — confirm. 269 + std.mem.doNotOptimizeAway(std.unicode.utf8ValidateSlice(encoded_record)); 270 + } 271 + 272 + // --- diagnostic: SHA-256 only --- 273 + 274 + fn benchSha256() void { 275 + const Sha256 = std.crypto.hash.sha2.Sha256; 276 + var hash: [Sha256.digest_length]u8 = undefined; 277 + Sha256.hash(encoded_record, &hash, .{}); 278 + std.mem.doNotOptimizeAway(hash); 279 + } 280 + 281 + // --- larger payloads --- 282 + 283 + var encoded_record_10x: []const u8 = undefined; 284 + 285 + fn initLargePayload() void { 286 + 
const alloc = bench_arena.allocator(); 287 + // build a 10-element array of the bench record 288 + var items: [10]Value = undefined; 289 + for (&items) |*item| { 290 + item.* = bench_record; 291 + } 292 + const large: Value = .{ .array = &items }; 293 + encoded_record_10x = cbor.encodeAlloc(alloc, large) catch @panic("encode 10x"); 294 + } 295 + 296 + fn benchEncodeLarge() void { 297 + var scratch: [65536]u8 = undefined; 298 + var fba = std.heap.FixedBufferAllocator.init(&scratch); 299 + var items: [10]Value = undefined; 300 + for (&items) |*item| { 301 + item.* = bench_record; 302 + } 303 + const large: Value = .{ .array = &items }; 304 + var out_buf: [8192]u8 = undefined; 305 + var w: std.Io.Writer = .fixed(&out_buf); 306 + cbor.encode(fba.allocator(), &w, large) catch @panic("encode"); 307 + std.mem.doNotOptimizeAway(w.end); 308 + } 309 + 310 + fn benchDecodeLarge() void { 311 + var scratch: [65536]u8 = undefined; 312 + var fba = std.heap.FixedBufferAllocator.init(&scratch); 313 + const val = cbor.decodeAll(fba.allocator(), encoded_record_10x) catch @panic("decode"); 314 + std.mem.doNotOptimizeAway(val); 315 + } 316 + 317 + // --- CID: stack vs heap allocation --- 318 + 319 + fn benchComputeCIDStack() void { 320 + // compute CID writing to a stack buffer (no allocator) 321 + const Sha256 = std.crypto.hash.sha2.Sha256; 322 + var hash: [Sha256.digest_length]u8 = undefined; 323 + Sha256.hash(encoded_record, &hash, .{}); 324 + // manually build CID bytes on stack: version(1) + codec(0x71) + hash_fn(0x12) + len(0x20) + hash 325 + var cid_buf: [36]u8 = undefined; 326 + cid_buf[0] = 0x01; 327 + cid_buf[1] = 0x71; 328 + cid_buf[2] = 0x12; 329 + cid_buf[3] = 0x20; 330 + @memcpy(cid_buf[4..36], &hash); 331 + std.mem.doNotOptimizeAway(cid_buf); 332 + } 333 + 239 334 // --------------------------------------------------------------------------- 240 335 // main 241 336 // --------------------------------------------------------------------------- 242 337 243 338 pub fn 
main() void { 244 339 initBenchData(); 340 + initLargePayload(); 245 341 defer bench_arena.deinit(); 246 342 247 343 std.debug.print("\nDAG-CBOR benchmarks (record: {d} bytes encoded)\n", .{encoded_record.len}); ··· 255 351 256 352 std.debug.print("\nCID operations:\n", .{}); 257 353 bench("compute CID (SHA-256)", benchComputeCID); 354 + bench("compute CID (stack, no alloc)", benchComputeCIDStack); 355 + bench("SHA-256 only (434 bytes)", benchSha256); 258 356 bench("encode + compute CID", benchEncodeAndCID); 259 357 260 358 std.debug.print("\ntext string:\n", .{}); ··· 275 373 276 374 std.debug.print("\ncomposite:\n", .{}); 277 375 bench("decode + key lookup (3 keys)", benchMapKeyLookup); 376 + 377 + std.debug.print("\ndiagnostic (cost breakdown):\n", .{}); 378 + bench("UTF-8 validate (434 bytes)", benchUtf8Validate); 379 + 380 + std.debug.print("\nscaling (10x array = {d} bytes):\n", .{encoded_record_10x.len}); 381 + bench("encode 10x records", benchEncodeLarge); 382 + bench("decode 10x records", benchDecodeLarge); 278 383 279 384 std.debug.print("\n", .{}); 280 385 }

Configure Feed

Configure Feed