declarative relay deployment on hetzner relay-eval.waow.tech
atproto relay
14
fork

Configure Feed

Select the types of activity you want to include in your feed.

indigo: replace collectiondir with lightrail

lightrail (fig's Rust collection directory) replaces the Go collectiondir
sidecar, which had unbounded memory growth (~1.4 GiB and climbing toward
its OOM limit). lightrail validates sync 1.1 commit proofs, removes repos
on collection deletion, and has a configurable fjall cache.

- add Dockerfile.lightrail, helm values, and ServiceMonitor
- route listReposByCollection to lightrail in ingress and drop the
  /v1/listCollections route that pointed at collectiondir
- replace collectiondir grafana panels with lightrail metrics
- remove collectiondir backfill recipe from justfile; add a
  lightrail-publish recipe (collectiondir-publish is kept for rollback)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+291 -95
+13
indigo/deploy/Dockerfile.lightrail
··· 1 + # multi-stage build for lightrail (fig's Rust collection directory) 2 + # source: https://tangled.org/microcosm.blue/lightrail 3 + 4 + FROM rust:1.91-bookworm AS builder 5 + 6 + WORKDIR /build 7 + COPY . . 8 + RUN cargo build --release 9 + 10 + FROM debian:bookworm-slim 11 + RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* 12 + COPY --from=builder /build/target/release/lightrail /usr/local/bin/lightrail 13 + ENTRYPOINT ["lightrail"]
+1 -8
indigo/deploy/ingress.yaml
··· 19 19 pathType: Exact 20 20 backend: 21 21 service: 22 - name: collectiondir 23 - port: 24 - number: 2510 25 - - path: /v1/listCollections 26 - pathType: Prefix 27 - backend: 28 - service: 29 - name: collectiondir 22 + name: lightrail 30 23 port: 31 24 number: 2510 32 25 - path: /
+15
indigo/deploy/lightrail-servicemonitor.yaml
··· 1 + apiVersion: monitoring.coreos.com/v1 2 + kind: ServiceMonitor 3 + metadata: 4 + name: lightrail 5 + namespace: monitoring 6 + spec: 7 + selector: 8 + matchLabels: 9 + app.kubernetes.io/name: lightrail 10 + namespaceSelector: 11 + matchNames: 12 + - relay 13 + endpoints: 14 + - port: metrics 15 + interval: 30s
+73
indigo/deploy/lightrail-values.yaml
··· 1 + # bjw-s/app-template helm values for lightrail 2 + # fig's Rust collection directory — replaces collectiondir 3 + # source: https://tangled.org/microcosm.blue/lightrail 4 + # schema: https://github.com/bjw-s-labs/helm-charts/tree/main/charts/other/app-template 5 + 6 + controllers: 7 + lightrail: 8 + containers: 9 + main: 10 + image: 11 + repository: atcr.io/zzstoatzz.io/lightrail 12 + tag: latest 13 + args: 14 + - --subscribe 15 + - https://relay.waow.tech 16 + - --db-path 17 + - /data/lightrail.db 18 + - --listen 19 + - "0.0.0.0:2510" 20 + - --metrics-listen 21 + - "0.0.0.0:6789" 22 + - --fjall-cache-mb 23 + - "256" 24 + - --deep-crawl 25 + - --crawl-qps 26 + - "8" 27 + env: 28 + - name: LIGHTRAIL_ADMIN_PASSWORD 29 + valueFrom: 30 + secretKeyRef: 31 + name: collectiondir-secret 32 + key: COLLECTIONS_ADMIN_TOKEN 33 + - name: RUST_LOG 34 + value: "info" 35 + probes: 36 + liveness: &probes 37 + enabled: true 38 + custom: true 39 + spec: 40 + httpGet: 41 + path: / 42 + port: &apiport 2510 43 + initialDelaySeconds: 30 44 + periodSeconds: 10 45 + timeoutSeconds: 3 46 + failureThreshold: 5 47 + readiness: *probes 48 + resources: 49 + requests: 50 + memory: 256Mi 51 + cpu: 100m 52 + limits: 53 + memory: 4096Mi 54 + 55 + defaultPodOptions: 56 + imagePullSecrets: 57 + - name: atcr-creds 58 + 59 + service: 60 + lightrail: 61 + controller: lightrail 62 + ports: 63 + http: 64 + port: *apiport 65 + metrics: 66 + port: 6789 67 + 68 + persistence: 69 + data: 70 + enabled: true 71 + type: persistentVolumeClaim 72 + accessMode: ReadWriteOnce 73 + size: 10Gi
+172 -44
indigo/deploy/relay-dashboard.json
··· 428 428 ] 429 429 }, 430 430 { 431 - "title": "collectiondir", 431 + "title": "lightrail", 432 432 "type": "row", 433 433 "gridPos": { 434 434 "h": 1, ··· 440 440 "panels": [] 441 441 }, 442 442 { 443 - "title": "collectiondir firehose events/sec", 444 - "type": "timeseries", 443 + "title": "repos resynced", 444 + "type": "stat", 445 445 "gridPos": { 446 446 "h": 8, 447 - "w": 8, 447 + "w": 6, 448 448 "x": 0, 449 449 "y": 34 450 450 }, ··· 454 454 }, 455 455 "fieldConfig": { 456 456 "defaults": { 457 - "unit": "ops", 458 457 "color": { 459 - "mode": "palette-classic" 458 + "mode": "thresholds" 460 459 }, 461 - "custom": { 462 - "fillOpacity": 15, 463 - "lineWidth": 2, 464 - "spanNulls": false 460 + "thresholds": { 461 + "steps": [ 462 + { "color": "yellow", "value": null }, 463 + { "color": "green", "value": 500000 } 464 + ] 465 465 } 466 466 }, 467 467 "overrides": [] 468 + }, 469 + "options": { 470 + "colorMode": "value", 471 + "graphMode": "area", 472 + "reduceOptions": { 473 + "calcs": ["lastNotNull"] 474 + } 468 475 }, 469 476 "targets": [ 470 477 { 471 - "expr": "sum(rate(collectiondir_firehose_received_total[5m]))", 472 - "legendFormat": "received", 478 + "expr": "lightrail_resync_completed_total{outcome=\"success\"}", 479 + "legendFormat": "success", 473 480 "refId": "A" 474 481 } 475 482 ] 476 483 }, 477 484 { 478 - "title": "collectiondir commits/sec", 485 + "title": "firehose events/sec", 479 486 "type": "timeseries", 480 487 "gridPos": { 481 488 "h": 8, 482 - "w": 8, 483 - "x": 8, 489 + "w": 9, 490 + "x": 6, 484 491 "y": 34 485 492 }, 486 493 "datasource": { ··· 490 497 "fieldConfig": { 491 498 "defaults": { 492 499 "unit": "ops", 493 - "color": { 494 - "mode": "palette-classic" 495 - }, 500 + "color": { "mode": "palette-classic" }, 496 501 "custom": { 497 502 "fillOpacity": 15, 498 503 "lineWidth": 2, ··· 503 508 }, 504 509 "targets": [ 505 510 { 506 - "expr": "sum(rate(collectiondir_firehose_commits[5m]))", 507 - "legendFormat": "commits", 511 + 
"expr": "sum(rate(lightrail_firehose_events_total[5m])) by (kind)", 512 + "legendFormat": "{{kind}}", 508 513 "refId": "A" 509 514 } 510 515 ] 511 516 }, 512 517 { 513 - "title": "collectiondir new pairs indexed/sec", 518 + "title": "collection births/sec", 514 519 "type": "timeseries", 515 520 "gridPos": { 516 521 "h": 8, 517 - "w": 8, 518 - "x": 16, 522 + "w": 9, 523 + "x": 15, 519 524 "y": 34 520 525 }, 521 526 "datasource": { ··· 525 530 "fieldConfig": { 526 531 "defaults": { 527 532 "unit": "ops", 528 - "color": { 529 - "mode": "palette-classic" 530 - }, 533 + "color": { "mode": "palette-classic" }, 531 534 "custom": { 532 535 "fillOpacity": 15, 533 536 "lineWidth": 2, ··· 538 541 }, 539 542 "targets": [ 540 543 { 541 - "expr": "sum(rate(collectiondir_pebble_new_total[5m]))", 542 - "legendFormat": "new pairs", 544 + "expr": "sum(rate(lightrail_collection_births_total[5m])) by (source)", 545 + "legendFormat": "{{source}}", 543 546 "refId": "A" 547 + }, 548 + { 549 + "expr": "sum(rate(lightrail_collection_deaths_total[5m])) by (source)", 550 + "legendFormat": "deaths ({{source}})", 551 + "refId": "B" 544 552 } 545 553 ] 546 554 }, 547 555 { 548 - "title": "collectiondir disk i/o", 556 + "title": "commits indexed/sec", 549 557 "type": "timeseries", 550 558 "gridPos": { 551 559 "h": 8, ··· 559 567 }, 560 568 "fieldConfig": { 561 569 "defaults": { 562 - "unit": "Bps", 563 - "color": { 564 - "mode": "palette-classic" 565 - }, 570 + "unit": "ops", 571 + "color": { "mode": "palette-classic" }, 566 572 "custom": { 567 573 "fillOpacity": 15, 568 574 "lineWidth": 2, ··· 573 579 }, 574 580 "targets": [ 575 581 { 576 - "expr": "rate(container_fs_writes_bytes_total{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}[5m])", 577 - "legendFormat": "writes", 582 + "expr": "sum(rate(lightrail_commits_indexed_total[5m])) by (mode)", 583 + "legendFormat": "{{mode}}", 578 584 "refId": "A" 579 585 }, 580 586 { 581 - "expr": 
"rate(container_fs_reads_bytes_total{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}[5m])", 582 - "legendFormat": "reads", 587 + "expr": "sum(rate(lightrail_commit_dropped_total[5m])) by (reason)", 588 + "legendFormat": "dropped ({{reason}})", 583 589 "refId": "B" 584 590 } 585 591 ] 586 592 }, 587 593 { 588 - "title": "collectiondir memory", 594 + "title": "memory", 589 595 "type": "timeseries", 590 596 "gridPos": { 591 597 "h": 8, ··· 600 606 "fieldConfig": { 601 607 "defaults": { 602 608 "unit": "bytes", 603 - "color": { 604 - "mode": "palette-classic" 605 - }, 609 + "color": { "mode": "palette-classic" }, 606 610 "custom": { 607 611 "fillOpacity": 15, 608 612 "lineWidth": 2, 609 613 "spanNulls": false 610 614 } 611 615 }, 612 - "overrides": [] 616 + "overrides": [ 617 + { 618 + "matcher": { "id": "byName", "options": "limit" }, 619 + "properties": [ 620 + { "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [10, 10] } }, 621 + { "id": "custom.fillOpacity", "value": 0 }, 622 + { "id": "custom.lineWidth", "value": 3 }, 623 + { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } 624 + ] 625 + } 626 + ] 613 627 }, 614 628 "targets": [ 615 629 { 616 - "expr": "container_memory_working_set_bytes{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}", 630 + "expr": "container_memory_working_set_bytes{namespace=\"relay\",pod=~\"lightrail.*\",container=\"main\"}", 617 631 "legendFormat": "working set", 618 632 "refId": "A" 619 633 }, 620 634 { 621 - "expr": "kube_pod_container_resource_limits{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\",resource=\"memory\"}", 635 + "expr": "kube_pod_container_resource_limits{namespace=\"relay\",pod=~\"lightrail.*\",container=\"main\",resource=\"memory\"}", 622 636 "legendFormat": "limit", 637 + "refId": "B" 638 + } 639 + ] 640 + }, 641 + { 642 + "title": "disk usage", 643 + "type": "timeseries", 644 + "gridPos": { 645 + "h": 8, 646 + "w": 8, 647 + "x": 16, 648 + 
"y": 42 649 + }, 650 + "datasource": { 651 + "type": "prometheus", 652 + "uid": "prometheus" 653 + }, 654 + "fieldConfig": { 655 + "defaults": { 656 + "unit": "bytes", 657 + "color": { "mode": "palette-classic" }, 658 + "custom": { 659 + "fillOpacity": 15, 660 + "lineWidth": 2, 661 + "spanNulls": false 662 + } 663 + }, 664 + "overrides": [] 665 + }, 666 + "targets": [ 667 + { 668 + "expr": "lightrail_db_disk_bytes{keyspace=\"total\"}", 669 + "legendFormat": "total", 670 + "refId": "A" 671 + }, 672 + { 673 + "expr": "lightrail_db_disk_bytes{keyspace=\"index\"}", 674 + "legendFormat": "index", 675 + "refId": "B" 676 + }, 677 + { 678 + "expr": "lightrail_db_disk_bytes{keyspace=\"default\"}", 679 + "legendFormat": "default", 680 + "refId": "C" 681 + } 682 + ] 683 + }, 684 + { 685 + "title": "identity cache hit rate", 686 + "type": "timeseries", 687 + "gridPos": { 688 + "h": 8, 689 + "w": 12, 690 + "x": 0, 691 + "y": 50 692 + }, 693 + "datasource": { 694 + "type": "prometheus", 695 + "uid": "prometheus" 696 + }, 697 + "fieldConfig": { 698 + "defaults": { 699 + "unit": "ops", 700 + "color": { "mode": "palette-classic" }, 701 + "custom": { 702 + "fillOpacity": 15, 703 + "lineWidth": 2, 704 + "spanNulls": false 705 + } 706 + }, 707 + "overrides": [] 708 + }, 709 + "targets": [ 710 + { 711 + "expr": "sum(rate(lightrail_identity_resolution_total[5m])) by (outcome)", 712 + "legendFormat": "{{outcome}}", 713 + "refId": "A" 714 + } 715 + ] 716 + }, 717 + { 718 + "title": "dropped events/sec", 719 + "type": "timeseries", 720 + "gridPos": { 721 + "h": 8, 722 + "w": 12, 723 + "x": 12, 724 + "y": 50 725 + }, 726 + "datasource": { 727 + "type": "prometheus", 728 + "uid": "prometheus" 729 + }, 730 + "fieldConfig": { 731 + "defaults": { 732 + "unit": "ops", 733 + "color": { "mode": "palette-classic" }, 734 + "custom": { 735 + "fillOpacity": 15, 736 + "lineWidth": 2, 737 + "spanNulls": false 738 + } 739 + }, 740 + "overrides": [] 741 + }, 742 + "targets": [ 743 + { 744 + "expr": 
"sum(rate(lightrail_event_dropped_total[5m])) by (reason)", 745 + "legendFormat": "{{reason}}", 746 + "refId": "A" 747 + }, 748 + { 749 + "expr": "rate(lightrail_firehose_decode_errors_total[5m])", 750 + "legendFormat": "decode errors", 623 751 "refId": "B" 624 752 } 625 753 ]
+17 -43
indigo/justfile
··· 132 132 --from-literal=COLLECTIONS_ADMIN_TOKEN="${COLLECTIONDIR_ADMIN_TOKEN:-}" \ 133 133 --dry-run=client -o yaml | kubectl apply -f - 134 134 135 - echo "==> installing collectiondir" 136 - helm upgrade --install collectiondir bjw-s/app-template \ 135 + echo "==> installing lightrail" 136 + helm upgrade --install lightrail bjw-s/app-template \ 137 137 --namespace relay \ 138 - --values deploy/collectiondir-values.yaml \ 138 + --values deploy/lightrail-values.yaml \ 139 139 --wait --timeout 5m 140 - kubectl apply -f deploy/collectiondir-servicemonitor.yaml 140 + kubectl apply -f deploy/lightrail-servicemonitor.yaml 141 141 142 142 echo "==> installing reconnect cronjob" 143 143 kubectl apply -f deploy/reconnect-cronjob.yaml ··· 232 232 233 233 # --- images --- 234 234 235 - # build and push collectiondir image from indigo source 235 + # build and push collectiondir image from indigo source (kept for rollback) 236 236 collectiondir-publish: 237 237 #!/usr/bin/env bash 238 238 set -euo pipefail ··· 243 243 -f "$TMPDIR/cmd/collectiondir/Dockerfile" \ 244 244 -t atcr.io/zzstoatzz.io/collectiondir:latest "$TMPDIR" 245 245 ATCR_AUTO_AUTH=1 docker push atcr.io/zzstoatzz.io/collectiondir:latest 246 + 247 + # build and push lightrail image (fig's Rust collection directory) 248 + lightrail-publish: 249 + #!/usr/bin/env bash 250 + set -euo pipefail 251 + TMPDIR=$(mktemp -d) 252 + trap "rm -rf $TMPDIR" EXIT 253 + git clone --depth 1 https://tangled.org/microcosm.blue/lightrail "$TMPDIR" 254 + docker build --platform linux/amd64 \ 255 + -f deploy/Dockerfile.lightrail \ 256 + -t atcr.io/zzstoatzz.io/lightrail:latest "$TMPDIR" 257 + ATCR_AUTO_AUTH=1 docker push atcr.io/zzstoatzz.io/lightrail:latest 246 258 247 259 # build and push relay image from our fork (outdated cursor fix) 248 260 relay-publish: ··· 273 285 jetstream *args: 274 286 ../scripts/jetstream {{ args }} 275 287 276 - # backfill collectiondir with full network PDS hosts 277 - # pass --hosts <file> to use a 
specific host list, otherwise extracts from relay 278 - backfill *args: 279 - #!/usr/bin/env bash 280 - set -euo pipefail 281 - : "${COLLECTIONDIR_ADMIN_TOKEN:?set COLLECTIONDIR_ADMIN_TOKEN}" 282 - 283 - PIDS=() 284 - cleanup() { kill "${PIDS[@]}" 2>/dev/null; } 285 - trap cleanup EXIT 286 - 287 - # port-forward to collectiondir 288 - kubectl port-forward -n relay svc/collectiondir 2510:2510 >/dev/null 2>&1 & 289 - PIDS+=($!) 290 - 291 - EXTRA_ARGS=({{ args }}) 292 - 293 - # if --hosts not provided, extract from relay 294 - if ! printf '%s\n' "${EXTRA_ARGS[@]}" | grep -q '^--hosts$'; then 295 - : "${RELAY_ADMIN_PASSWORD:?set RELAY_ADMIN_PASSWORD}" 296 - kubectl port-forward -n relay svc/relay 12470:2470 >/dev/null 2>&1 & 297 - PIDS+=($!) 298 - sleep 3 299 - 300 - echo "==> extracting connected PDS host list from relay" 301 - curl -sf -u "admin:$RELAY_ADMIN_PASSWORD" http://localhost:12470/admin/pds/list \ 302 - | jq -r '.[] | select(.HasActiveConnection) | .Host' > /tmp/relay-hosts.txt 303 - TOTAL=$(curl -sf -u "admin:$RELAY_ADMIN_PASSWORD" http://localhost:12470/admin/pds/list | jq 'length') 304 - echo " $(wc -l < /tmp/relay-hosts.txt | tr -d ' ') connected hosts (of $TOTAL total)" 305 - echo 306 - EXTRA_ARGS+=(--hosts /tmp/relay-hosts.txt) 307 - else 308 - sleep 3 309 - fi 310 - 311 - ../scripts/backfill \ 312 - --token "$COLLECTIONDIR_ADMIN_TOKEN" \ 313 - "${EXTRA_ARGS[@]}"