perlsky is a Perl 5 implementation of an AT Protocol Personal Data Server.
13
fork

Configure Feed

Select the types of activity you want to include in your feed.

Test crawler notifications against the reference PDS

alice 146ffe6b a1711dcd

+367
+8
README.md
··· 14 14 Reference differential validation: 15 15 16 16 - Run `script/differential-validate` to compare `perlds` against the official published `@atproto/pds` on a focused set of account, repo, moderation, sync, firehose, and `importRepo` snapshot-restore behaviors. 17 + - The differential harness also configures a local relay/crawler mock for both servers and verifies that both emit `com.atproto.sync.requestCrawl` notices with the expected hostname after repo activity, based on the upstream crawler wiring in `packages/pds/src/crawlers.ts`, `context.ts`, and `sequencer.ts`. 17 18 - Run `PERLDS_DIFF_ACCOUNT_DID_METHOD=did:plc script/differential-validate` to exercise the same harness in PLC-account mode, including recommended DID credentials, PLC signature requests, PLC handle updates, token-gated PLC signing behavior, and moderation checks after PLC handle changes. 18 19 - The helper installs the reference runtime into `.tools/reference-runtime` with Node 20 via `fnm`. 19 20 - Run `PERLDS_RUN_REFERENCE_DIFF=1 prove -lv t/reference-differential.t` to exercise the same harness from the test suite. 20 21 - Run `PERLDS_RUN_REFERENCE_DIFF=1 prove -lv t/reference-differential-plc.t` to run the PLC-specific reference comparison from the test suite. 22 + 23 + Relay / crawler discovery: 24 + 25 + - Configure `hostname` to the public host name you want relays to crawl, for example `pds.example.com`. This should be the host, not the full URL. 26 + - Configure `crawlers` as a list of relay or crawler service origins, for example `["https://bsky.network"]`. 27 + - `perlds` will POST `com.atproto.sync.requestCrawl` to each configured crawler after local repo/account/identity activity, while throttling repeat notices with `crawler_notify_interval` (default `1200` seconds). 28 + - Local regression coverage for this path lives in `t/crawlers.t`. 21 29 22 30 Moderation and labels: 23 31
+3
etc/perlds.example.json
··· 2 2 "host": "127.0.0.1", 3 3 "port": 7755, 4 4 "base_url": "http://127.0.0.1:7755", 5 + "hostname": "localhost", 5 6 "service_did_method": "did:web", 6 7 "service_handle_domain": "localhost", 7 8 "jwt_secret": "change-me", 8 9 "admin_password": "change-me-too", 10 + "crawlers": [], 11 + "crawler_notify_interval": 1200, 9 12 "data_dir": "data/runtime", 10 13 "db_path": "data/runtime/perlds.sqlite" 11 14 }
+85
script/differential-validate
··· 194 194 return $res; 195 195 } 196 196 197 + sub get_json_url ($url) { 198 + return Mojo::UserAgent->new(max_redirects => 0)->get($url)->result; 199 + } 200 + 197 201 sub auth_header ($token) { 198 202 return { Authorization => "Bearer $token" }; 199 203 } ··· 419 423 }; 420 424 } 421 425 426 + sub crawler_requests ($origin) { 427 + my $res = get_json_url("$origin/requests"); 428 + die "crawler request log fetch failed for $origin\n" unless $res->is_success; 429 + return $res->json || {}; 430 + } 431 + 432 + sub wait_for_crawler_requests ($name, $origin, $minimum = 1, $timeout = 10) { 433 + my $deadline = time + $timeout; 434 + while (time < $deadline) { 435 + my $state = eval { crawler_requests($origin) }; 436 + if ($state && (($state->{count} // 0) >= $minimum)) { 437 + return $state; 438 + } 439 + sleep 0.1; 440 + } 441 + die "timed out waiting for $name crawler requests at $origin\n"; 442 + } 443 + 422 444 note('Preparing official reference runtime'); 423 445 setup_reference_runtime(); 424 446 ··· 431 453 432 454 my $plc_ready = File::Spec->catfile($tmp, 'plc.ready.json'); 433 455 my $ref_ready = File::Spec->catfile($tmp, 'reference.ready.json'); 456 + my $reference_crawler_ready = File::Spec->catfile($tmp, 'reference-crawler.ready.json'); 457 + my $perlds_crawler_ready = File::Spec->catfile($tmp, 'perlds-crawler.ready.json'); 434 458 435 459 my $plc_log = File::Spec->catfile($tmp, 'plc.log'); 436 460 my $ref_log = File::Spec->catfile($tmp, 'reference.log'); 437 461 my $perl_log = File::Spec->catfile($tmp, 'perlds.log'); 462 + my $reference_crawler_log = File::Spec->catfile($tmp, 'reference-crawler.log'); 463 + my $perlds_crawler_log = File::Spec->catfile($tmp, 'perlds-crawler.log'); 438 464 439 465 my $plc = spawn_logged( 440 466 'plc-mock', ··· 450 476 my $plc_info = wait_for_ready_file('plc mock', $plc_ready); 451 477 pass("started local PLC mock at $plc_info->{origin}"); 452 478 479 + my $reference_crawler_port = free_port(); 480 + my 
$perlds_crawler_port = free_port(); 481 + 482 + my $reference_crawler = spawn_logged( 483 + 'reference-crawler', 484 + ['fnm', 'exec', '--using=20', '--', 'node', File::Spec->catfile($root, 'tools', 'differential', 'crawler-mock.cjs')], 485 + { 486 + PERLDS_READY_FILE => $reference_crawler_ready, 487 + PERLDS_CRAWLER_PORT => $reference_crawler_port, 488 + PERLDS_CRAWLER_HOST => '127.0.0.1', 489 + }, 490 + $reference_crawler_log, 491 + ); 492 + 493 + my $perlds_crawler = spawn_logged( 494 + 'perlds-crawler', 495 + ['fnm', 'exec', '--using=20', '--', 'node', File::Spec->catfile($root, 'tools', 'differential', 'crawler-mock.cjs')], 496 + { 497 + PERLDS_READY_FILE => $perlds_crawler_ready, 498 + PERLDS_CRAWLER_PORT => $perlds_crawler_port, 499 + PERLDS_CRAWLER_HOST => '127.0.0.1', 500 + }, 501 + $perlds_crawler_log, 502 + ); 503 + 504 + my $reference_crawler_info = wait_for_ready_file('reference crawler', $reference_crawler_ready); 505 + wait_for_http_ok('reference crawler', "$reference_crawler_info->{origin}/_health"); 506 + pass("started reference crawler mock at $reference_crawler_info->{origin}"); 507 + 508 + my $perlds_crawler_info = wait_for_ready_file('perlds crawler', $perlds_crawler_ready); 509 + wait_for_http_ok('perlds crawler', "$perlds_crawler_info->{origin}/_health"); 510 + pass("started perlds crawler mock at $perlds_crawler_info->{origin}"); 511 + 453 512 my $reference_data = File::Spec->catdir($tmp, 'reference'); 454 513 make_path($reference_data); 455 514 ··· 470 529 PDS_INVITE_REQUIRED => 0, 471 530 PDS_DID_PLC_URL => $plc_info->{origin}, 472 531 PDS_CONTACT_EMAIL_ADDRESS => 'abuse@example.test', 532 + PDS_CRAWLERS => $reference_crawler_info->{origin}, 473 533 }, 474 534 $ref_log, 475 535 ); ··· 482 542 open my $cfg_fh, '>', $perlds_config or die "unable to write $perlds_config: $!"; 483 543 print {$cfg_fh} encode_json({ 484 544 base_url => "http://127.0.0.1:$perl_port", 545 + hostname => 'localhost', 485 546 service_handle_domain => 'test', 486 547 
service_did_method => 'did:web', 487 548 account_did_method => $diff_account_did_method, ··· 493 554 : ()), 494 555 jwt_secret => 'perlds-jwt-secret', 495 556 admin_password => 'perlds-admin-secret', 557 + crawlers => [$perlds_crawler_info->{origin}], 496 558 data_dir => File::Spec->catdir($tmp, 'perlds-data'), 497 559 db_path => File::Spec->catfile($tmp, 'perlds.sqlite'), 498 560 }); ··· 516 578 handle => 'alice.test', 517 579 email => 'alice-ref@test.com', 518 580 admin_password => 'reference-admin-secret', 581 + crawler_origin => $reference_crawler_info->{origin}, 519 582 }, 520 583 perlds => { 521 584 origin => "http://127.0.0.1:$perl_port", 522 585 handle => 'alice.test', 523 586 email => 'alice-perl@test.com', 524 587 admin_password => 'perlds-admin-secret', 588 + crawler_origin => $perlds_crawler_info->{origin}, 525 589 }, 526 590 ); 527 591 ··· 577 641 next unless $res->is_success; 578 642 check(($res->json->{did} // q()) eq $server{$name}{did}, "$name resolveHandle returns the created DID"); 579 643 } 644 + 645 + note('Comparing crawler notifications'); 646 + for my $name (sort keys %server) { 647 + my $state = wait_for_crawler_requests("$name crawler", $server{$name}{crawler_origin}); 648 + my $first = $state->{requests}[0]{body} || {}; 649 + $server{$name}{crawler_notice} = { 650 + saw_request => (($state->{count} // 0) >= 1) ? 1 : 0, 651 + hostname_matches => (($first->{hostname} // q()) eq 'localhost') ? 
1 : 0, 652 + }; 653 + check($server{$name}{crawler_notice}{saw_request}, "$name requested a relay crawl"); 654 + check($server{$name}{crawler_notice}{hostname_matches}, "$name requestCrawl uses the configured hostname"); 655 + } 656 + 657 + check( 658 + same_hash($server{reference}{crawler_notice}, $server{perlds}{crawler_notice}), 659 + 'outbound crawler notifications match the official reference PDS semantics', 660 + ); 580 661 581 662 note('Comparing getSession'); 582 663 for my $name (sort keys %server) { ··· 949 1030 print slurp_file($perl_log); 950 1031 print "\nPLC mock log:\n"; 951 1032 print slurp_file($plc_log); 1033 + print "\nReference crawler log:\n"; 1034 + print slurp_file($reference_crawler_log); 1035 + print "\nperlds crawler log:\n"; 1036 + print slurp_file($perlds_crawler_log); 952 1037 die "\ndifferential validation failed with $failed mismatches\n"; 953 1038 } 954 1039
+187
t/crawlers.t
# Regression test for outbound relay/crawler notifications.
#
# Starts the Node crawler mock (tools/differential/crawler-mock.cjs), boots an
# in-process ATProto::PDS app configured to notify that mock, and checks that:
#   * account creation triggers a com.atproto.sync.requestCrawl POST whose
#     hostname is the host part of base_url,
#   * a second repo write inside crawler_notify_interval does not notify again,
#   * com.atproto.sync.getHostStatus reports the crawler host as active.
use v5.34;
use warnings;
use Config ();
use File::Spec;
use File::Temp qw(tempdir);
use FindBin qw($Bin);
use IO::Socket::INET;
use JSON::PP qw(decode_json);
use POSIX qw(WNOHANG);
use Test::More;
use Time::HiRes qw(sleep time);

BEGIN {
    require lib;
    my $root = File::Spec->rel2abs(File::Spec->catdir($Bin, '..'));
    lib->import(
        File::Spec->catdir($root, 'lib'),
        File::Spec->catdir($root, 'local', 'lib', 'perl5'),
        File::Spec->catdir($root, 'local', 'lib', 'perl5', $Config::Config{archname}),
    );
}

use Mojo::URL;
use Mojo::UserAgent;
use Test::Mojo;
use ATProto::PDS;

my $root       = File::Spec->rel2abs(File::Spec->catdir($Bin, '..'));
my $tmp        = tempdir(CLEANUP => 1);
my $parent_pid = $$;
my @children;

# The mock is launched through `fnm`; without it the test could only time out.
plan skip_all => 'fnm is required to run the Node crawler mock'
    unless command_in_path('fnm');

# Reap every spawned helper when the test process exits.
END {
    # waitpid() overwrites $?; localizing it preserves the real exit status.
    local $?;

    # Only the process that spawned the children may signal them. A forked
    # child that reaches this block must not touch its siblings.
    if (defined $parent_pid && $$ == $parent_pid) {
        for my $child (reverse @children) {
            my $pid = $child->{pid} or next;
            next unless kill 0, $pid;

            kill 'TERM', $pid;
            my $gone = 0;
            for (1 .. 40) {
                my $reaped = waitpid($pid, WNOHANG);
                # -1 means there is nothing left to wait for.
                if ($reaped == $pid || $reaped == -1) {
                    $gone = 1;
                    last;
                }
                sleep 0.1;
            }
            next if $gone;

            # Graceful shutdown took longer than ~4 seconds: force it.
            kill 'KILL', $pid;
            waitpid($pid, 0);
        }
    }
}

# Returns true when an executable file called $name exists in a PATH directory.
sub command_in_path {
    my ($name) = @_;
    for my $dir (File::Spec->path) {
        my $candidate = File::Spec->catfile($dir, $name);
        return 1 if -f $candidate && -x _;
    }
    return 0;
}

# Asks the kernel for an unused TCP port on 127.0.0.1 and returns its number.
# The socket is closed before returning, so the port is only probably free.
sub free_port {
    my $sock = IO::Socket::INET->new(
        LocalAddr => '127.0.0.1',
        LocalPort => 0,
        Proto     => 'tcp',
        Listen    => 5,
        ReuseAddr => 1,
    ) or die "unable to allocate a port: $!";
    my $port = $sock->sockport;
    close $sock;
    return $port;
}

# Returns the whole content of $path as one scalar (undef for an empty file),
# regardless of the caller's context. Dies when the file cannot be opened.
sub slurp {
    my ($path) = @_;
    open my $fh, '<', $path or die "open($path): $!";
    local $/;
    my $content = <$fh>;
    return $content;
}

# Forks and execs the Node crawler mock.
#   $ready_file - path the mock writes its JSON readiness record to
#   $log_file   - receives the mock's stdout and stderr
#   $port       - TCP port the mock should listen on
# Returns the child pid and registers it for cleanup in the END block.
sub spawn_crawler_mock {
    my ($ready_file, $log_file, $port) = @_;
    my $pid = fork;
    die "fork failed: $!" unless defined $pid;

    if ($pid == 0) {
        eval {
            open STDOUT, '>', $log_file or die "open($log_file): $!\n";
            open STDERR, '>&', \*STDOUT or die "dup stdout failed: $!\n";
            chdir $root or die "chdir($root): $!\n";
            $ENV{PERLDS_READY_FILE}   = $ready_file;
            $ENV{PERLDS_CRAWLER_PORT} = $port;
            $ENV{PERLDS_CRAWLER_HOST} = '127.0.0.1';
            exec 'fnm', 'exec', '--using=20', '--', 'node',
                File::Spec->catfile($root, 'tools', 'differential', 'crawler-mock.cjs');
            die "exec failed: $!\n";
        };
        # Reached only on failure. _exit() skips END blocks and destructors,
        # so the child cannot run the parent's cleanup or test-harness code.
        syswrite STDERR, ($@ || "crawler mock launch failed\n");
        POSIX::_exit(127);
    }

    # NOTE(review): this pid is the `fnm exec` wrapper; whether TERM reaches
    # the node process it starts depends on fnm - confirm.
    push @children, { pid => $pid };
    return $pid;
}

# Polls until $path holds a complete JSON document and returns it decoded.
# An empty or partially written file is treated as "not ready yet", because
# the writer may still be in the middle of producing it.
# Dies after $timeout seconds (default 20).
sub wait_for_ready {
    my ($path, $timeout) = @_;
    $timeout //= 20;
    my $deadline = time + $timeout;
    while (time < $deadline) {
        if (-s $path) {
            my $info = eval { decode_json(slurp($path)) };
            return $info if ref $info;
        }
        sleep 0.1;
    }
    die "timed out waiting for $path";
}

# Fetches the mock's request log (GET $origin/requests) and returns the decoded
# JSON body, or an empty hash when the body has none. Dies on a non-2xx reply.
sub crawler_state {
    my ($origin) = @_;
    my $res = Mojo::UserAgent->new(max_redirects => 0)->get("$origin/requests")->result;
    die "crawler state fetch failed for $origin" unless $res->is_success;
    return $res->json || {};
}

# Polls the mock until it has logged at least $minimum requests (default 1)
# and returns that state. Fetch errors are retried. Dies after $timeout
# seconds (default 10).
sub wait_for_requests {
    my ($origin, $minimum, $timeout) = @_;
    $minimum //= 1;
    $timeout //= 10;
    my $deadline = time + $timeout;
    while (time < $deadline) {
        my $state = eval { crawler_state($origin) };
        if ($state && (($state->{count} // 0) >= $minimum)) {
            return $state;
        }
        sleep 0.1;
    }
    die "timed out waiting for crawler requests at $origin";
}

my $crawler_port  = free_port();
my $crawler_ready = File::Spec->catfile($tmp, 'crawler.ready.json');
my $crawler_log   = File::Spec->catfile($tmp, 'crawler.log');
spawn_crawler_mock($crawler_ready, $crawler_log, $crawler_port);

my $crawler = eval { wait_for_ready($crawler_ready) };
unless ($crawler) {
    my $error = $@ || "crawler mock did not become ready\n";
    # Surface the mock's own output; the temp dir is removed on exit.
    my $log = eval { slurp($crawler_log) } // '(no log written)';
    diag("crawler mock log:\n$log");
    die $error;
}

my $app = ATProto::PDS->new(
    project_root => $root,
    settings     => {
        base_url                => 'http://127.0.0.1:7755',
        service_handle_domain   => 'test',
        service_did_method      => 'did:web',
        jwt_secret              => 'crawl-secret',
        admin_password          => 'admin-secret',
        crawlers                => [$crawler->{origin}],
        crawler_notify_interval => 3600,
        db_path                 => File::Spec->catfile($tmp, 'crawlers.sqlite'),
        data_dir                => File::Spec->catdir($tmp, 'data'),
    },
);

my $t = Test::Mojo->new($app);

$t->post_ok('/xrpc/com.atproto.server.createAccount' => json => {
    handle   => 'alice.test',
    email    => 'alice@test.com',
    password => 'hunter22',
})->status_is(200);

my $created = $t->tx->res->json;
my $access  = $created->{accessJwt};
my $did     = $created->{did};

my $state = wait_for_requests($crawler->{origin});
is($state->{requests}[0]{body}{hostname}, '127.0.0.1', 'crawl requests use the public hostname without the port');

$t->post_ok('/xrpc/com.atproto.repo.createRecord' => {
    Authorization => "Bearer $access",
} => json => {
    repo       => $did,
    collection => 'app.bsky.feed.post',
    rkey       => 'crawler-test',
    record     => {
        '$type'   => 'app.bsky.feed.post',
        text      => 'crawler notification test',
        createdAt => '2026-03-10T00:00:00Z',
    },
})->status_is(200);

# Give an (unexpected) second notification time to arrive before counting.
sleep 0.5;
$state = crawler_state($crawler->{origin});
is($state->{count}, 1, 'crawler notifications are throttled inside the configured interval');

# Build the host[:port] form passed to getHostStatus for the crawler origin.
my $crawler_url  = Mojo::URL->new($crawler->{origin});
my $crawler_host = lc($crawler_url->host // '127.0.0.1');
$crawler_host .= ':' . $crawler_url->port if defined($crawler_url->port) && $crawler_url->port != 80;

$t->get_ok(Mojo::URL->new('/xrpc/com.atproto.sync.getHostStatus')->query(
    hostname => $crawler_host,
))->status_is(200)
  ->json_is('/hostname', $crawler_host)
  ->json_is('/status', 'active');

done_testing;
+84
tools/differential/crawler-mock.cjs
#!/usr/bin/env node

// Minimal relay/crawler mock used by the perlds test harnesses.
//
// Environment:
//   PERLDS_READY_FILE    (required) path that receives a JSON readiness record
//                        { origin, host, port } once the server is listening
//   PERLDS_CRAWLER_HOST  bind address (default 127.0.0.1)
//   PERLDS_CRAWLER_PORT  bind port (default 0 = let the OS pick)
//
// Endpoints:
//   GET  /_health                              -> { ok: true }
//   GET  /requests                             -> { count, requests }
//   POST /xrpc/com.atproto.sync.requestCrawl   -> records the JSON body, replies {}

const fs = require('node:fs');
const http = require('node:http');

const readyFile = process.env.PERLDS_READY_FILE;
const host = process.env.PERLDS_CRAWLER_HOST || '127.0.0.1';
const port = Number(process.env.PERLDS_CRAWLER_PORT || 0);

if (!readyFile) {
  console.error('PERLDS_READY_FILE is required');
  process.exit(1);
}

// Every accepted requestCrawl call, in arrival order: { at, body }.
const requests = [];

// Serializes `payload` as JSON and sends it with the given status code.
const sendJson = (res, statusCode, payload) => {
  const body = JSON.stringify(payload);
  res.writeHead(statusCode, {
    'Content-Type': 'application/json',
    'Content-Length': Buffer.byteLength(body),
  });
  res.end(body);
};

// Resolves with the full request body decoded as UTF-8 text.
const readBody = (req) =>
  new Promise((resolve, reject) => {
    let body = '';
    req.setEncoding('utf8');
    req.on('data', (chunk) => {
      body += chunk;
    });
    req.on('end', () => resolve(body));
    req.on('error', reject);
  });

const server = http.createServer(async (req, res) => {
  // Route on the path only, so a query string does not turn a known
  // endpoint into a 404.
  const path = (req.url || '').split('?')[0];

  if (req.method === 'GET' && path === '/_health') {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === 'GET' && path === '/requests') {
    sendJson(res, 200, { count: requests.length, requests });
    return;
  }

  if (req.method === 'POST' && path === '/xrpc/com.atproto.sync.requestCrawl') {
    try {
      const raw = await readBody(req);
      // An empty body is recorded as {} rather than rejected.
      const body = raw.length ? JSON.parse(raw) : {};
      requests.push({
        at: new Date().toISOString(),
        body,
      });
      sendJson(res, 200, {});
    } catch (error) {
      sendJson(res, 400, {
        error: 'InvalidRequest',
        message: error && error.message ? error.message : String(error),
      });
    }
    return;
  }

  sendJson(res, 404, { error: 'NotFound' });
});

// Fail loudly (and into the captured log) when the port cannot be bound.
server.on('error', (error) => {
  console.error(`crawler mock failed to listen on ${host}:${port}: ${error && error.message ? error.message : error}`);
  process.exit(1);
});

server.listen(port, host, () => {
  const address = server.address();
  // IPv6 literals must be bracketed inside a URL authority.
  const urlHost = address.address.includes(':') ? `[${address.address}]` : address.address;
  const origin = `http://${urlHost}:${address.port}`;
  const payload = JSON.stringify({ origin, host: address.address, port: address.port }) + '\n';

  // Write to a sibling temp file and rename it into place, so a harness
  // polling for the ready file never reads an empty or partial document.
  const tempFile = `${readyFile}.${process.pid}.tmp`;
  fs.writeFileSync(tempFile, payload, 'utf8');
  fs.renameSync(tempFile, readyFile);
});

const shutdown = () => {
  server.close(() => process.exit(0));
  // Do not let a lingering connection hold the process open; the timer is
  // unref'd so it never delays a clean close.
  setTimeout(() => process.exit(0), 1000).unref();
};

process.on('SIGINT', shutdown);
process.on('SIGTERM', shutdown);