host_authority: slot recovery + pool metrics + preload account count
bundles four changes from the 2026-04-09 external review (relay
docs/zlay-external-review-2026-04-09.md) plus a zat dependency bump.
all four review items target the two unresolved problems from the
2026-04-08 incident: ~99% host_authority pool rejection and HTTP
probe starvation during cold-start spawn.
1. preload effective_account_count in listActiveHostsImpl (item 1)
spawnWorker issued a blocking DbRequest for
getEffectiveAccountCount(host_id) once per host during cold-start:
~2,770 round-trips through the DbRequestQueue, each yielding the
spawn fiber. fold the COUNT/JOIN/GROUP BY into the existing batch
query so the value is preloaded into Host.effective_account_count.
addHost (one-off requestCrawl path) keeps the inline fetch since
it's not in the cold-start hot path.
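a sketch of the fold; the table and column names are assumptions for
illustration, not the real schema:

    // hypothetical shape of the listActiveHostsImpl batch query with
    // the per-host count folded in; hosts/accounts names are assumed.
    const list_active_hosts_sql =
        \\SELECT h.*, COUNT(a.id) AS effective_account_count
        \\FROM hosts h
        \\LEFT JOIN accounts a ON a.host_id = h.id
        \\WHERE h.active
        \\GROUP BY h.id
    ;
    // row mapping fills Host.effective_account_count up front;
    // spawnWorker reads the field instead of issuing its own DbRequest.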
2. resolver slot recovery (item 2)
resolveHostAuthority used to retry resolve() on the SAME pool slot
after a failure. if a slot's std.http.Client gets into any kind of
bad state, the retry is wasted and the slot stays bad forever. on
first-attempt failure, deinit + re-init the slot via the new
recycleHostResolver helper before retrying. this directly tests the
leading "poisoned slot" hypothesis without making any claims about
the zig stdlib. dormant in production while keep_alive=false (no persistent
connections to corrupt) but ready for the next canary.
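a minimal sketch of the recovery path; the pool layout, Resolver
type, and Broadcaster receiver are assumptions about the surrounding
code, and the counter comes from item 3 below:

    // deinit + re-init one pool slot so a wedged std.http.Client is
    // discarded rather than retried on.
    fn recycleHostResolver(self: *Broadcaster, slot_index: usize) void {
        self.host_resolver_pool[slot_index].deinit();
        self.host_resolver_pool[slot_index] = Resolver.init(self.allocator);
        _ = self.stats.host_resolver_resets_total.fetchAdd(1, .monotonic);
    }

    // first-attempt failure path in resolveHostAuthority, roughly:
    //     resolver.resolve(...) catch {
    //         self.recycleHostResolver(slot_index); // drop poisoned state
    //         // retry once on the freshly re-initialized slot
    //     };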
3. pool/loop metrics (item 4)
six new counters/gauges in broadcaster.zig Stats:
host_resolver_acquire_wait_us_total — pool contention timing
host_resolver_in_use — current slot count held
host_resolver_resets_total — slot recovery firings
host_resolver_resolve_fail_total — first-attempt failures
resolve_loop_resolve_ok_total — background loop ok
resolve_loop_resolve_fail_total — background loop fail
the resolve_loop counters reveal something we've been operating
blind on: the background signing-key resolveLoop has only ever done
log.debug+continue on errors, so its failure rate was never measured.
when these counters first ship, whatever number
resolve_loop_resolve_fail_total shows is the baseline, NOT a
regression; that's the whole point.
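how the six fields might sit in Stats, assuming the existing
counters are std.atomic.Value(u64); the Counter alias and the
acquire() call are illustrative, not the real code:

    const std = @import("std");

    const Counter = std.atomic.Value(u64);

    pub const Stats = struct {
        // ...existing fields elided...
        host_resolver_acquire_wait_us_total: Counter = Counter.init(0),
        host_resolver_in_use: Counter = Counter.init(0), // gauge, rises and falls
        host_resolver_resets_total: Counter = Counter.init(0),
        host_resolver_resolve_fail_total: Counter = Counter.init(0),
        resolve_loop_resolve_ok_total: Counter = Counter.init(0),
        resolve_loop_resolve_fail_total: Counter = Counter.init(0),
    };

    // contention timing around pool acquire (acquire() is assumed):
    //     const t0 = std.time.microTimestamp();
    //     const slot = pool.acquire();
    //     stats.host_resolver_acquire_wait_us_total.fetchAdd(
    //         @intCast(std.time.microTimestamp() - t0), .monotonic);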
4. configurable host_resolver_pool_size (item 5)
was a hardcoded const = 4. now heap-allocated from start() based on
the HOST_RESOLVER_POOL_SIZE env var (default 4, max 64). with
keep_alive=false, pool width is a real startup throughput knob:
widening it lets more is_new checks run concurrently during
reconnect storms. ops can tune it against the new
host_resolver_acquire_wait_us_total metric.
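roughly what the env read could look like; the helper name is
invented for illustration and the real option plumbing may differ:

    // clamp to [1, 64]; unset or unparseable falls back to the old default 4.
    fn hostResolverPoolSize(allocator: std.mem.Allocator) usize {
        const raw = std.process.getEnvVarOwned(allocator, "HOST_RESOLVER_POOL_SIZE") catch
            return 4;
        defer allocator.free(raw);
        const n = std.fmt.parseUnsigned(usize, raw, 10) catch return 4;
        return @min(@max(n, 1), 64);
    }

    // start() then heap-allocates the pool at that width instead of
    // the old fixed-size array:
    //     self.host_resolver_pool = try allocator.alloc(Resolver, size);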
5. zat dep bumped to v0.3.0-alpha.23, which surfaces the underlying
std.http.Client.fetch error kind through resolver.resolve. the
existing sampleLogReject("resolve", did, @errorName(err), ...) call
in resolveHostAuthority will now print the actual transport error
(UnknownHostName, ConnectionRefused, TlsAlert, etc.) instead of
always DidResolutionFailed.
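a self-contained illustration of the shape change (not zat's actual
code): the old resolve collapsed every failure into
DidResolutionFailed, the new one lets the fetch error set through:

    const std = @import("std");

    // stand-in for the std.http.Client.fetch failures we care about
    const FetchError = error{ UnknownHostName, ConnectionRefused, TlsAlert };

    fn fetchDid() FetchError!void {
        return error.ConnectionRefused;
    }

    // old shape: every transport failure becomes one opaque error
    fn resolveOld() error{DidResolutionFailed}!void {
        fetchDid() catch return error.DidResolutionFailed;
    }

    // new shape: the underlying error propagates to the caller
    fn resolveNew() FetchError!void {
        return fetchDid();
    }

    pub fn main() void {
        resolveOld() catch |e| std.debug.print("{s}\n", .{@errorName(e)}); // DidResolutionFailed
        resolveNew() catch |e| std.debug.print("{s}\n", .{@errorName(e)}); // ConnectionRefused
    }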
not in this batch (per reviewer's "do not yet" list):
- spawn batch loop tuning — slim per-host work first, re-measure
- re-enabling keep_alive=true globally — canary first, once this
metrics shipment shows what the broken path actually returns
- splitting liveness onto a dedicated thread — first see whether the
probe flap persists after the startup fiber is slimmed