this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

cache PublicKey affine form — 22% verify throughput win

root cause: every call to verify() was doing
`AffinePoint.fromStdlib(public_key.affineCoordinates())`, and stdlib's
Secp256k1.affineCoordinates() unconditionally inverts Z — even when the
point was created from SEC1 (where Z is always 1). that field inversion
was ~12 µs per call, which Tracy instrumentation showed was 19% of the
verify budget. totally wasted work since the result is deterministic
per PublicKey.

fix: PublicKey now caches an AffinePoint at fromSec1 time. one-time
field inversion cost at key construction, zero cost per verify. adds
~80 bytes per PublicKey (2 × Fe = 10 × u64), negligible.

signature change: verify_mod.verify() now takes AffinePoint instead of
Secp256k1. soft-breaking only for direct callers of the low-level
verify function — the public high-level APIs (Signature.verifyMsg,
Signature.verifyPrehashed, PublicKey.fromSec1) are unchanged.

measured (ReleaseFast, M1, 200k iterations × 8 warm runs):
before: mean 18,630 v/s, ~52.7 µs/op
after: mean 23,299 v/s, ~42.9 µs/op
delta: +25% mean, +11.6% worst-case (worst-after vs best-before)

added correctness safety net:
- tests/verify_test.zig: 2 new stress tests that run under standard
  `zig build test`:
  - "stress: 2000 random verify cases match stdlib exactly" —
    randomized (keypair, msg, signature) triples, bit-exact agreement
    with stdlib verify required
  - "stress: 500 corrupted signature cases match stdlib exactly" —
    random bit-flip corruptions, rejection parity with stdlib

these catch regressions in scalar reduction, field arithmetic, table
indexing, sign handling, and edge cases before a benchmark would.
all 42 tests pass pre- and post-change.

added scripts/bench_verify.zig + `zig build bench-verify` target for
reproducible throughput measurement on future optimization work.

+238 -5
+21
build.zig
··· 29 29 const test_step = b.step("test", "run all tests"); 30 30 test_step.dependOn(&run_lib_tests.step); 31 31 test_step.dependOn(&run_int_tests.step); 32 + 33 + // verify throughput benchmark (scripts/bench_verify.zig). 34 + // reproduces the ~19-24k verify/s laptop number as a committed target 35 + // and gives future optimization work a repeatable measurement. 36 + const bench_mod = b.createModule(.{ 37 + .root_source_file = b.path("scripts/bench_verify.zig"), 38 + .target = target, 39 + .optimize = optimize, 40 + .imports = &.{.{ .name = "k256", .module = mod }}, 41 + }); 42 + const bench_exe = b.addExecutable(.{ 43 + .name = "bench-verify", 44 + .root_module = bench_mod, 45 + }); 46 + b.installArtifact(bench_exe); 47 + 48 + const run_bench = b.addRunArtifact(bench_exe); 49 + run_bench.step.dependOn(&b.addInstallArtifact(bench_exe, .{}).step); 50 + if (b.args) |args| run_bench.addArgs(args); 51 + const bench_step = b.step("bench-verify", "run ECDSA verify throughput benchmark"); 52 + bench_step.dependOn(&run_bench.step); 32 53 }
+1
build.zig.zon
··· 8 8 "build.zig.zon", 9 9 "src", 10 10 "tests", 11 + "scripts", 11 12 }, 12 13 }
+105
scripts/bench_verify.zig
//! ECDSA verification throughput benchmark.
//!
//! deterministic setup: generates N keypairs + messages + signatures from a
//! fixed seed, then loops verifying them ITERS times via k256's optimized
//! verify path. prints ops/sec.
//!
//! the same N signatures get reused so the setup cost (signing, which goes
//! through stdlib and isn't what we're measuring) is amortized to near-zero
//! and the steady-state loop is pure verify work.
//!
//! run:
//!     zig build bench-verify -Doptimize=ReleaseFast
//!     zig build bench-verify -Doptimize=ReleaseFast -- 200000

const std = @import("std");
const Io = std.Io;
const k256 = @import("k256");

const StdEcdsa = std.crypto.sign.ecdsa.EcdsaSecp256k1Sha256;
const K256Ecdsa = k256.EcdsaSecp256k1Sha256;

// unique signature count — enough to prevent the cpu from
// trivially caching the same work repeatedly, small enough to fit in L2
const N: usize = 128;

// iteration count used when no usable count is given on the command line
const default_iters: usize = 50_000;

/// Benchmark entry point.
///
/// Reads an optional iteration count from the first command-line argument,
/// builds N (public key, message, signature) triples from a fixed seed,
/// runs one warmup pass, then times `iters` calls to `verifyMsg` and prints
/// throughput. Exits with status 1 if any signature fails to verify.
pub fn main(init: std.process.Init) !void {
    const allocator = init.gpa;
    const io = init.io;

    const iters: usize = iters: {
        var it = init.minimal.args.iterate();
        defer it.deinit();
        _ = it.next(); // skip exe name
        const arg = it.next() orelse break :iters default_iters;
        // a count of 0 would make the per-op figures NaN/inf, so it is
        // treated the same as an unparseable argument: warn and fall back
        // instead of silently measuring something the user did not ask for.
        const parsed = std.fmt.parseInt(usize, arg, 10) catch 0;
        if (parsed == 0) {
            std.debug.print(
                "warning: ignoring invalid iteration count '{s}', using {d}\n",
                .{ arg, default_iters },
            );
            break :iters default_iters;
        }
        break :iters parsed;
    };

    std.debug.print("k256 verify benchmark\n", .{});
    std.debug.print(" unique signatures: {d}\n", .{N});
    std.debug.print(" verify iterations: {d}\n", .{iters});
    std.debug.print(" optimize: {s}\n\n", .{@tagName(@import("builtin").mode)});

    // --- setup: generate N keypairs + messages + signatures ---
    const sigs = try allocator.alloc(K256Ecdsa.Signature, N);
    defer allocator.free(sigs);
    const pks = try allocator.alloc(K256Ecdsa.PublicKey, N);
    defer allocator.free(pks);
    const msgs = try allocator.alloc([64]u8, N);
    defer allocator.free(msgs);

    const seed = [_]u8{0x42} ** 32;
    var rng = std.Random.DefaultCsprng.init(seed);

    var generated: usize = 0;
    while (generated < N) {
        var sk_bytes: [32]u8 = undefined;
        rng.fill(&sk_bytes);
        // random bytes that are not a valid secret key are simply redrawn
        const kp = StdEcdsa.KeyPair.fromSecretKey(.{ .bytes = sk_bytes }) catch continue;

        var msg: [64]u8 = undefined;
        rng.fill(&msg);
        const sig = kp.sign(&msg, null) catch continue;

        sigs[generated] = K256Ecdsa.Signature.fromBytes(sig.toBytes());
        // propagate a parse failure instead of `catch unreachable`, which
        // would be undefined behaviour in ReleaseFast — the mode this
        // benchmark is meant to run in.
        pks[generated] = try K256Ecdsa.PublicKey.fromSec1(&kp.public_key.toCompressedSec1());
        msgs[generated] = msg;
        generated += 1;
    }

    // warmup: lazily build the gTable + prime icache/dcache
    for (0..N) |i| {
        sigs[i].verifyMsg(&msgs[i], pks[i]) catch {};
    }

    // --- measured loop ---
    // `.awake` is the monotonic clock; the settable wall clock (`.real`) can
    // be adjusted mid-run and distort or even negate the interval.
    const start_ns = Io.Timestamp.now(io, .awake).toNanoseconds();
    var ok: usize = 0;
    var bad: usize = 0;
    for (0..iters) |i| {
        const idx = i % N;
        if (sigs[idx].verifyMsg(&msgs[idx], pks[idx])) |_| {
            ok += 1;
        } else |_| {
            bad += 1;
        }
    }
    const end_ns = Io.Timestamp.now(io, .awake).toNanoseconds();
    // clamp kept as a defensive guard before narrowing to u64
    const elapsed_ns: u64 = @intCast(@max(0, end_ns - start_ns));

    // --- results ---
    const elapsed_s: f64 = @as(f64, @floatFromInt(elapsed_ns)) / std.time.ns_per_s;
    const ops_per_sec: f64 = @as(f64, @floatFromInt(iters)) / elapsed_s;
    const ns_per_op: f64 = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, @floatFromInt(iters));

    std.debug.print("results\n", .{});
    std.debug.print(" ok: {d}\n", .{ok});
    std.debug.print(" bad: {d}\n", .{bad});
    std.debug.print(" elapsed: {d:.3} s\n", .{elapsed_s});
    std.debug.print(" throughput: {d:.0} verify/s\n", .{ops_per_sec});
    std.debug.print(" per op: {d:.0} ns\n", .{ns_per_op});

    if (bad != 0) {
        std.debug.print("\nERROR: {d} signatures failed to verify — bench is broken\n", .{bad});
        std.process.exit(1);
    }
}
+13 -2
src/root.zig
··· 5 5 const scalar = Secp256k1.scalar; 6 6 7 7 const verify_mod = @import("verify.zig"); 8 + const jacobian_mod = @import("jacobian.zig"); 9 + const AffinePoint = jacobian_mod.AffinePoint; 8 10 9 11 // re-export internals for testing 10 12 pub const field = @import("field.zig"); ··· 49 51 50 52 /// Verify this signature against a pre-hashed message. 51 53 pub fn verifyPrehashed(sig: Signature, msg_hash: [32]u8, public_key: PublicKey) VerifyError!void { 52 - return verify_mod.verify(sig.r, sig.s, msg_hash, public_key.p); 54 + return verify_mod.verify(sig.r, sig.s, msg_hash, public_key.affine); 53 55 } 54 56 55 57 /// Create a streaming verifier. ··· 76 78 pub const uncompressed_sec1_encoded_length = 65; 77 79 78 80 p: Curve, 81 + /// Affine form of `p`, cached at construction time. Computing this 82 + /// requires a field inversion inside `stdlib.Secp256k1.affineCoordinates` 83 + /// which was previously being done on every verify call — profiling 84 + /// showed it dominated verify cost (~12µs / ~19% of budget). Caching 85 + /// it here moves the cost to one-time setup at ~80 bytes per key. 86 + affine: AffinePoint, 79 87 80 88 pub fn fromSec1(sec1: []const u8) !PublicKey { 81 89 const pt = try Curve.fromSec1(sec1); 82 - return .{ .p = pt }; 90 + return .{ 91 + .p = pt, 92 + .affine = AffinePoint.fromStdlib(pt.affineCoordinates()), 93 + }; 83 94 } 84 95 85 96 pub fn toCompressedSec1(pk: PublicKey) [33]u8 {
+10 -3
src/verify.zig
··· 44 44 /// 2. u2*Q via projective-table GLV: no field inversion, 4-bit windowed 45 45 /// 3. Jacobian comparison (no field inversion for final check) 46 46 /// 4. All field arithmetic via 5×52-bit limbs (libsecp256k1 style) — fast on 64-bit hardware 47 - pub fn verify(sig_r: [32]u8, sig_s: [32]u8, msg_hash: [32]u8, public_key: Secp256k1) VerifyError!void { 47 + /// 48 + /// `pk_affine` must be the public key in Fe affine form. Callers holding a 49 + /// stdlib `Secp256k1` point should convert via 50 + /// `AffinePoint.fromStdlib(pt.affineCoordinates())` exactly once and cache 51 + /// the result — `affineCoordinates()` runs a field inversion internally 52 + /// that is ~19% of the verify cost when done per-call. The high-level 53 + /// `EcdsaSecp256k1Sha256.PublicKey` type in root.zig does this caching for 54 + /// you automatically. 55 + pub fn verify(sig_r: [32]u8, sig_s: [32]u8, msg_hash: [32]u8, pk_affine: AffinePoint) VerifyError!void { 48 56 // parse and validate r, s 49 57 const r_sc = scalar.Scalar.fromBytes(sig_r, .big) catch return error.SignatureVerificationFailed; 50 58 const s_sc = scalar.Scalar.fromBytes(sig_s, .big) catch return error.SignatureVerificationFailed; ··· 72 80 split_u2.r2 = scalar.neg(split_u2.r2, .little) catch zero_s; 73 81 neg_p_phi = true; 74 82 } 75 - const pk_affine26 = AffinePoint.fromStdlib(public_key.affineCoordinates()); 76 - const r2 = publicKeyMulProjective(split_u2, neg_p, neg_p_phi, pk_affine26); 83 + const r2 = publicKeyMulProjective(split_u2, neg_p, neg_p_phi, pk_affine); 77 84 78 85 // 3. combine results in Jacobian, compare without inversion 79 86 const q = r1.add(r2);
+88
tests/verify_test.zig
// ··· hunk context: closing lines of the preceding test (its start is outside this view)
    const k_pk2 = try K256Ecdsa.PublicKey.fromSec1(&uncompressed);
    try std.testing.expectEqualSlices(u8, &uncompressed, &k_pk2.toUncompressedSec1());
}

// -------- optimization regression stress tests --------
//
// these run larger batches of randomized inputs and assert that k256's
// verify path agrees with stdlib's verify on every case. they're the
// load-bearing correctness net for any future optimization work on the
// verify hot path — bigger than the 100-trial smoke test at the top,
// run in the standard `zig build test` step, and catch regressions in
// scalar reduction, field arithmetic, table indexing, sign handling,
// and edge-case distribution before a benchmark would.

// Signs random messages with random stdlib keypairs and requires that both
// stdlib and k256 accept every resulting signature.
test "stress: 2000 random verify cases match stdlib exactly" {
    const seed = [_]u8{0x5a} ** 32;
    var rng = std.Random.DefaultCsprng.init(seed);

    var agreement_count: usize = 0;
    var trial: usize = 0;
    while (trial < 2000) : (trial += 1) {
        var sk_bytes: [32]u8 = undefined;
        rng.fill(&sk_bytes);
        // random bytes that are not a valid secret key skip this trial;
        // the sanity floor at the end bounds how often that may happen
        const kp = StdEcdsa.KeyPair.fromSecretKey(.{ .bytes = sk_bytes }) catch continue;

        var msg_len_byte: [1]u8 = undefined;
        rng.fill(&msg_len_byte);
        const msg_len: usize = @as(usize, msg_len_byte[0]) + 1; // 1..256

        var msg_buf: [256]u8 = undefined;
        rng.fill(msg_buf[0..msg_len]);
        const msg = msg_buf[0..msg_len];

        const sig = kp.sign(msg, null) catch continue;

        // stdlib verify as the reference
        const stdlib_ok = if (sig.verify(msg, kp.public_key)) |_| true else |_| false;
        // the reference must accept the signature it just produced;
        // otherwise the agreement check below could pass vacuously with
        // both implementations rejecting.
        if (!stdlib_ok) {
            std.debug.print(
                "stress trial {d}: stdlib rejected its own signature\n",
                .{trial},
            );
            return error.TestUnexpectedResult;
        }

        // k256 verify via the public API path (exercises the PublicKey
        // affine cache and the full verify code path)
        const k_sig = K256Ecdsa.Signature.fromBytes(sig.toBytes());
        const k_pk = try K256Ecdsa.PublicKey.fromSec1(&kp.public_key.toCompressedSec1());
        const k256_ok = if (k_sig.verifyMsg(msg, k_pk)) |_| true else |_| false;

        if (stdlib_ok != k256_ok) {
            std.debug.print(
                "stress trial {d}: stdlib={} k256={} — DIVERGENCE\n",
                .{ trial, stdlib_ok, k256_ok },
            );
            return error.TestUnexpectedResult;
        }
        agreement_count += 1;
    }
    try std.testing.expect(agreement_count > 1000); // sanity floor
}

// Flips one random bit of a known-good signature per trial and requires
// k256 to accept or reject exactly as stdlib does.
test "stress: 500 corrupted signature cases match stdlib exactly" {
    const seed = [_]u8{0xa5} ** 32;
    var rng = std.Random.DefaultCsprng.init(seed);

    var sk_bytes: [32]u8 = undefined;
    rng.fill(&sk_bytes);
    // the seed is fixed, so a setup failure here is permanent; surface it
    // as a test error rather than returning early and reporting a pass.
    const kp = try StdEcdsa.KeyPair.fromSecretKey(.{ .bytes = sk_bytes });
    const k_pk = try K256Ecdsa.PublicKey.fromSec1(&kp.public_key.toCompressedSec1());

    const msg = "consistent message for corruption trials";
    const clean_sig = try kp.sign(msg, null);

    // baseline: the uncorrupted signature must be accepted by both sides.
    // without this, an implementation that rejects everything would still
    // show perfect rejection parity below.
    try clean_sig.verify(msg, kp.public_key);
    try K256Ecdsa.Signature.fromBytes(clean_sig.toBytes()).verifyMsg(msg, k_pk);

    var disagreements: usize = 0;
    var trial: usize = 0;
    while (trial < 500) : (trial += 1) {
        var bad_bytes = clean_sig.toBytes();
        var which_byte: [1]u8 = undefined;
        rng.fill(&which_byte);
        var which_bit: [1]u8 = undefined;
        rng.fill(&which_bit);
        // flip exactly one bit somewhere in the 64-byte r || s encoding
        bad_bytes[which_byte[0] % 64] ^= @as(u8, 1) << @intCast(which_bit[0] % 8);

        const std_sig = StdEcdsa.Signature{
            .r = bad_bytes[0..32].*,
            .s = bad_bytes[32..64].*,
        };
        const stdlib_ok = if (std_sig.verify(msg, kp.public_key)) |_| true else |_| false;

        const k_sig = K256Ecdsa.Signature.fromBytes(bad_bytes);
        const k256_ok = if (k_sig.verifyMsg(msg, k_pk)) |_| true else |_| false;

        if (stdlib_ok != k256_ok) {
            // name the failing trial so a divergence can be reproduced
            std.debug.print(
                "corruption trial {d}: stdlib={} k256={} — DIVERGENCE\n",
                .{ trial, stdlib_ok, k256_ok },
            );
            disagreements += 1;
        }
    }
    try std.testing.expectEqual(@as(usize, 0), disagreements);
}