Cooperative email for PDS operators

Warmup deliverability, staggered scheduler, and ops alerting

+8518 -810
+3
.gitignore
··· 1 1 /labeler 2 + /label-api 2 3 /relay 4 + /rotate-dkim 5 + /sendtest 3 6 state/ 4 7 *.sqlite 5 8 *.sqlite-wal
+1 -1
README.md
··· 21 21 - Warming tier caps protect the shared IP during the first 14 days 22 22 of a new member's lifetime. 23 23 - Pool-level FBL registrations: Gmail Postmaster verified, Microsoft 24 - SNDS + JMRP registered, Yahoo CFL pending. Operator-classified 24 + SNDS + JMRP registered, Yahoo CFL verified. Operator-classified 25 25 inbound (`postmaster@`, `abuse@`, `fbl@`, …) forwards to an 26 26 external inbox for provider authorization flows. See 27 27 [docs/operator-runbook.md](docs/operator-runbook.md) for the live
+5 -5
cmd/labeler/bootstrap.go
··· 157 157 func validatePDSURL(raw string) error { 158 158 u, err := neturl.Parse(raw) 159 159 if err != nil { 160 - return fmt.Errorf("invalid URL: %v", err) 160 + return fmt.Errorf("invalid URL: %w", err) 161 161 } 162 162 if u.Scheme != "https" { 163 163 return fmt.Errorf("scheme %q not allowed, must be https", u.Scheme) ··· 177 177 } 178 178 resp, err := client.Do(req) 179 179 if err != nil { 180 - return "", fmt.Errorf("plc lookup %s: %v", did, err) 180 + return "", fmt.Errorf("plc lookup %s: %w", did, err) 181 181 } 182 182 defer resp.Body.Close() 183 183 ··· 193 193 } `json:"service"` 194 194 } 195 195 if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&doc); err != nil { 196 - return "", fmt.Errorf("plc decode %s: %v", did, err) 196 + return "", fmt.Errorf("plc decode %s: %w", did, err) 197 197 } 198 198 199 199 for _, svc := range doc.Service { ··· 229 229 } 230 230 resp, err := client.Do(req) 231 231 if err != nil { 232 - return nil, fmt.Errorf("listRecords %s: %v", did, err) 232 + return nil, fmt.Errorf("listRecords %s: %w", did, err) 233 233 } 234 234 defer resp.Body.Close() 235 235 ··· 239 239 240 240 var result listRecordsResponse 241 241 if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&result); err != nil { 242 - return nil, fmt.Errorf("decode listRecords: %v", err) 242 + return nil, fmt.Errorf("decode listRecords: %w", err) 243 243 } 244 244 245 245 var atts []jetstream.ReceivedAttestation
+2 -2
cmd/labeler/main.go
··· 266 266 } 267 267 resp, err := r.client.Do(req) 268 268 if err != nil { 269 - return "", fmt.Errorf("plc lookup %s: %v", did, err) 269 + return "", fmt.Errorf("plc lookup %s: %w", did, err) 270 270 } 271 271 defer resp.Body.Close() 272 272 ··· 278 278 AlsoKnownAs []string `json:"alsoKnownAs"` 279 279 } 280 280 if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&doc); err != nil { 281 - return "", fmt.Errorf("plc decode %s: %v", did, err) 281 + return "", fmt.Errorf("plc decode %s: %w", did, err) 282 282 } 283 283 284 284 for _, aka := range doc.AlsoKnownAs {
+451 -101
cmd/relay/main.go
··· 20 20 "net/url" 21 21 "os" 22 22 "os/signal" 23 + "path/filepath" 23 24 "strings" 24 - "sync" 25 25 "syscall" 26 26 "time" 27 27 ··· 60 60 61 61 // Inbound SMTP (bounce processing) 62 62 InboundAddr string `json:"inboundAddr"` // default ":25" (port 25 for receiving bounces) 63 + 64 + // InboundRateLimitMsgsPerMinute caps per-source-IP message rate at 65 + // MAIL FROM on the inbound listener. Zero or negative disables. 66 + // Default: 30. Provider bounce traffic and FBL reports come from 67 + // many IPs, so per-IP caps don't affect legitimate volume. 68 + InboundRateLimitMsgsPerMinute float64 `json:"inboundRateLimitMsgsPerMinute"` 69 + // InboundRateLimitBurst is the per-IP token-bucket capacity. Zero 70 + // defaults to 10. Higher values tolerate larger short bursts at the 71 + // cost of weaker abuse protection. 72 + InboundRateLimitBurst int `json:"inboundRateLimitBurst"` 63 73 64 74 // Admin API 65 75 AdminAddr string `json:"adminAddr"` // default ":8080" (Tailscale-only) ··· 209 219 metricsRegistry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{})) 210 220 metricsRegistry.MustRegister(prometheus.NewGoCollector()) 211 221 metrics := relay.NewMetrics(metricsRegistry) 222 + // Wire panic recovery for background goroutines (#209). Every 223 + // relay.GoSafe call below counts recovered panics into 224 + // metrics.GoroutineCrashes; without this wire the panics are 225 + // still logged but not counted. 226 + relay.SetPanicRecorder(metrics) 227 + // Wire SQLITE_BUSY classification at hot-path writers (#210). 228 + // The store reports busy errors via metrics.SQLiteBusyErrors; 229 + // the periodic pool-stats sampler is started below once the 230 + // cancellable ctx is in scope. 231 + store.SetBusyRecorder(metrics) 212 232 213 233 // Label checker 214 234 labelChecker := relay.NewLabelChecker(cfg.LabelerURL, &http.Client{Timeout: 10 * time.Second}) ··· 284 304 var ospreyEnforcer *relay.OspreyEnforcer 285 305 if cfg.OspreyURL != "" { 286 306 ospreyEnforcer = relay.NewOspreyEnforcer(cfg.OspreyURL, &http.Client{Timeout: 5 * time.Second}) 287 - log.Printf("osprey.enforcer.enabled: url=%s", cfg.OspreyURL) 307 + // Persist labelcheck cache so a relay restart doesn't reset 308 + // to fully cold (#215). The fail-closed branch in 309 + // activeLabelsFor is the safety net for the rare case where 310 + // snapshot read fails AND Osprey is unreachable. 311 + snapPath := filepath.Join(cfg.StateDir, "osprey-cache.json") 312 + ospreyEnforcer.SetSnapshotPath(snapPath) 313 + ospreyEnforcer.SetColdCacheRecorder(metrics) 314 + if n, err := ospreyEnforcer.LoadSnapshot(); err != nil { 315 + log.Printf("osprey.cache.load_error: %v", err) 316 + } else if n > 0 { 317 + log.Printf("osprey.cache.loaded: entries=%d path=%s", n, snapPath) 318 + } 319 + log.Printf("osprey.enforcer.enabled: url=%s fail_closed_cold=%v", cfg.OspreyURL, true) 288 320 } 289 321 290 322 // Osprey event emitter (optional) ··· 292 324 if cfg.KafkaBroker != "" { 293 325 ospreyEmitter = osprey.NewEmitter(cfg.KafkaBroker) 294 326 ospreyEmitter.SetMetrics(&relay.EmitterMetricsAdapter{ 295 - Emitted: metrics.OspreyEventsEmitted, 296 - Failed: metrics.OspreyEventsFailed, 327 + Emitted: metrics.OspreyEventsEmitted, 328 + Failed: metrics.OspreyEventsFailed, 329 + Spooled: metrics.OspreyEventsSpooled, 330 + Replayed: metrics.OspreyEventsReplayed, 331 + Dropped: metrics.OspreyEventsDropped, 332 + SpoolDepth: metrics.OspreySpoolDepth, 297 333 }) 334 + // On-disk DLQ for failed Kafka writes (#214). 
Without this an 335 + // atmos-ops outage silently drops every event the relay emits 336 + // during the window — labels stop propagating and trust scoring 337 + // freezes on stale data with no operator-visible signal. The 338 + // replayer below drains it back when Kafka recovers. 339 + spoolDir := filepath.Join(cfg.StateDir, "osprey-spool") 340 + if eventSpool, err := osprey.NewEventSpool(spoolDir, 0); err != nil { 341 + log.Printf("osprey.spool.disabled: error=%v — events will be lost on Kafka outages", err) 342 + } else { 343 + ospreyEmitter.SetSpool(eventSpool) 344 + log.Printf("osprey.spool.enabled: dir=%s", spoolDir) 345 + } 346 + metrics.OspreyDisabled.Set(0) 298 347 // Closed explicitly in the shutdown sequence (after queue drains), not via defer. 299 348 log.Printf("osprey.enabled: broker=%s topic=osprey.actions_input", cfg.KafkaBroker) 300 349 } else { 301 350 ospreyEmitter = osprey.Noop() 351 + metrics.OspreyDisabled.Set(1) 352 + log.Printf("osprey.disabled: kafkaBroker not configured — relay events will not be propagated") 302 353 } 354 + 303 355 304 356 // Delivery queue 305 357 queue := relay.NewQueue(func(result relay.DeliveryResult) { 306 358 status := result.Status 307 359 if status == "sent" { 308 - store.UpdateMessageStatus(context.Background(), result.EntryID, relaystore.MsgSent, result.SMTPCode) 360 + if err := store.UpdateMessageStatus(context.Background(), result.EntryID, relaystore.MsgSent, result.SMTPCode); err != nil { 361 + if errors.Is(err, relaystore.ErrMessageNotFound) { 362 + // Spool entry without a backing DB row — the 363 + // orphan signature from #208. Log + count so the 364 + // reconciliation janitor's effectiveness is 365 + // observable; do NOT surface to delivery state. 366 + log.Printf("delivery.orphan: entry_id=%d status=sent — DB row missing", result.EntryID) 367 + metrics.OrphanDeliveries.WithLabelValues("sent").Inc() 368 + } else { 369 + log.Printf("delivery.update_error: entry_id=%d status=sent error=%v", result.EntryID, err) 370 + } 371 + } 309 372 ospreyEmitter.Emit(context.Background(), osprey.EventData{ 310 373 EventType: osprey.EventDeliveryResult, 311 374 SenderDID: result.MemberDID, ··· 314 377 SMTPCode: result.SMTPCode, 315 378 }) 316 379 } else { 317 - store.UpdateMessageStatus(context.Background(), result.EntryID, relaystore.MsgBounced, result.SMTPCode) 380 + if err := store.UpdateMessageStatus(context.Background(), result.EntryID, relaystore.MsgBounced, result.SMTPCode); err != nil { 381 + if errors.Is(err, relaystore.ErrMessageNotFound) { 382 + log.Printf("delivery.orphan: entry_id=%d status=bounced — DB row missing", result.EntryID) 383 + metrics.OrphanDeliveries.WithLabelValues("bounced").Inc() 384 + } else { 385 + log.Printf("delivery.update_error: entry_id=%d status=bounced error=%v", result.EntryID, err) 386 + } 387 + } 318 388 if result.SMTPCode >= 500 { 319 389 metrics.BouncesTotal.WithLabelValues("hard").Inc() 320 390 } else { ··· 351 421 qc.RelayDomain = cfg.Domain 352 422 return qc 353 423 }()) 354 - queue.SetSpool(relay.NewSpool(spoolDir)) 424 + spool := relay.NewSpool(spoolDir) 425 + queue.SetSpool(spool) 355 426 queue.SetMetrics(metrics) 356 427 357 428 // Reload any messages that were queued but not delivered before last shutdown ··· 392 463 for i, d := range domains { 393 464 rsaKey, edKey, err := deserializeDKIMKeys(d.DKIMRSAPriv, d.DKIMEdPriv) 394 465 if err != nil { 395 - return nil, fmt.Errorf("deserialize DKIM keys for %s/%s: %v", did, d.Domain, err) 466 + return nil, fmt.Errorf("deserialize DKIM keys for 
%s/%s: %w", did, d.Domain, err) 396 467 } 397 468 domainInfos[i] = relay.DomainInfo{ 398 469 Domain: d.Domain, ··· 423 494 // cached value if Osprey is unreachable — a previously suspended 424 495 // DID stays blocked even during a network partition. 425 496 if ospreyEnforcer != nil && mwd.Status == relaystore.StatusActive { 426 - policy, _ := ospreyEnforcer.GetPolicy(ctx, member.DID) 497 + policy, err := ospreyEnforcer.GetPolicy(ctx, member.DID) 498 + if errors.Is(err, relay.ErrOspreyColdCache) { 499 + // Cold cache + Osprey unreachable. #215: block AUTH 500 + // rather than fail-open. The rejection is transient 501 + // from the client's POV; once Osprey returns, the 502 + // policy resolves normally. 503 + log.Printf("osprey.enforce: did=%s action=block_auth reason=cold_cache_unreachable", member.DID) 504 + mwd.Status = relaystore.StatusSuspended 505 + } 427 506 if policy != nil && policy.Suspended { 428 507 log.Printf("osprey.enforce: did=%s action=block_auth reason=%s", member.DID, policy.SuspendReason) 429 508 mwd.Status = relaystore.StatusSuspended ··· 444 523 // limits and suspension checks use the same snapshot. 445 524 var policy *relay.LabelPolicy 446 525 if ospreyEnforcer != nil { 447 - policy, _ = ospreyEnforcer.GetPolicy(ctx, member.DID) 526 + p, err := ospreyEnforcer.GetPolicy(ctx, member.DID) 527 + if errors.Is(err, relay.ErrOspreyColdCache) { 528 + // #215: cold cache + Osprey unreachable → 451 SMTP 529 + // deferral. Client retries; by then either Osprey 530 + // is back or the cache has been warmed. 531 + return fmt.Errorf("451 osprey unreachable, please retry") 532 + } 533 + policy = p 448 534 } 449 535 450 536 // Apply warming limits + label policy (highly_trusted skips warming, ··· 586 672 // sendCheck above (highly_trusted skips warming, burst_warming throttles). 587 673 var batchPolicy *relay.LabelPolicy 588 674 if ospreyEnforcer != nil { 589 - batchPolicy, _ = ospreyEnforcer.GetPolicy(context.Background(), member.DID) 675 + p, err := ospreyEnforcer.GetPolicy(context.Background(), member.DID) 676 + if errors.Is(err, relay.ErrOspreyColdCache) { 677 + // #215: same cold-cache fail-closed as the per-msg 678 + // path; reject the batch with 451 so the sender 679 + // retries when Osprey is healthy again. 680 + return fmt.Errorf("451 osprey unreachable, please retry") 681 + } 682 + batchPolicy = p 590 683 } 591 684 hourly, daily := relay.WarmingLimitsForPolicy(warmingCfg, member.CreatedAt, member.HourlyLimit, member.DailyLimit, batchPolicy) 592 685 if err := rateLimiter.CheckBatchAndRecord(context.Background(), member.DID, len(deliverable), hourly, daily); err != nil { ··· 606 699 subject, body := extractSubjectAndBody(data) 607 700 contentFP := relay.ContentFingerprint(subject, body) 608 701 702 + // Multi-RCPT DATA fans out to one queue entry per recipient. If the 703 + // loop returns early on a per-recipient error, recipients 1..N-1 are 704 + // already enqueued and the SMTP client will retry the entire DATA 705 + // (because we returned a transient error), duplicating those 706 + // recipients. Instead, we collect per-recipient outcomes and only 707 + // reject the whole DATA when ZERO recipients succeeded. See #226. 708 + outcomes := make([]relay.RecipientOutcome, 0, len(deliverable)) 609 709 for _, recipient := range deliverable { 710 + outcome := relay.RecipientOutcome{Recipient: recipient} 711 + 610 712 verpFrom := relay.VERPReturnPath(member.DID, recipient, cfg.Domain) 611 713 612 714 // Build per-recipient message with its own List-Unsubscribe header. 
··· 640 742 // for DMARC alignment) → operator signature on top (d=atmos.email, 641 743 // carries FBL routing). 642 744 signer := relay.NewDualDomainSigner(member.DKIMKeys, operatorKeys, member.Domain, cfg.OperatorDKIMDomain) 643 - signed, err := signer.Sign(strings.NewReader(string(perMsgData))) 644 - if err != nil { 645 - return fmt.Errorf("DKIM sign: %v", err) 745 + signed, signErr := signer.Sign(strings.NewReader(string(perMsgData))) 746 + if signErr != nil { 747 + outcome.Err = fmt.Errorf("DKIM sign: %w", signErr) 748 + log.Printf("smtp.recipient_failed: did=%s recipient=%s stage=dkim error=%v", member.DID, recipient, signErr) 749 + outcomes = append(outcomes, outcome) 750 + continue 646 751 } 647 752 648 753 // Log message to store 649 - msgID, err := store.InsertMessage(context.Background(), &relaystore.Message{ 754 + msgID, insErr := store.InsertMessage(context.Background(), &relaystore.Message{ 650 755 MemberDID: member.DID, 651 756 FromAddr: from, 652 757 ToAddr: recipient, ··· 655 760 CreatedAt: time.Now().UTC(), 656 761 ContentFingerprint: contentFP, 657 762 }) 658 - if err != nil { 659 - return fmt.Errorf("log message: %v", err) 763 + if insErr != nil { 764 + outcome.Err = fmt.Errorf("log message: %w", insErr) 765 + log.Printf("smtp.recipient_failed: did=%s recipient=%s stage=insert error=%v", member.DID, recipient, insErr) 766 + outcomes = append(outcomes, outcome) 767 + continue 660 768 } 661 - 662 - // Increment send count (rate counters already recorded by CheckBatchAndRecord) 663 - store.IncrementSendCount(context.Background(), member.DID) 769 + outcome.MsgID = msgID 664 770 665 771 // Enqueue for delivery — capacity was pre-checked above so this 666 772 // should only fail on spool I/O errors, not capacity. 667 - if err := queue.Enqueue(&relay.QueueEntry{ 773 + if enqErr := queue.Enqueue(&relay.QueueEntry{ 668 774 ID: msgID, 669 775 From: verpFrom, 670 776 To: recipient, 671 777 Data: signed, 672 778 MemberDID: member.DID, 673 - }); err != nil { 674 - log.Printf("queue.enqueue_error: did=%s entry_id=%d error=%v", member.DID, msgID, err) 675 - return fmt.Errorf("451 delivery queue error — try again later") 779 + }); enqErr != nil { 780 + // Mark the row as failed so it doesn't masquerade as queued 781 + // (the orphan-reconciliation janitor would catch it eventually, 782 + // but immediate update keeps the messages table consistent). 783 + if updErr := store.UpdateMessageStatus(context.Background(), msgID, relaystore.MsgFailed, 0); updErr != nil { 784 + log.Printf("smtp.mark_failed_error: did=%s msg_id=%d error=%v", member.DID, msgID, updErr) 785 + } 786 + outcome.Err = fmt.Errorf("queue.enqueue: %w", enqErr) 787 + log.Printf("smtp.recipient_failed: did=%s recipient=%s stage=enqueue msg_id=%d error=%v", member.DID, recipient, msgID, enqErr) 788 + outcomes = append(outcomes, outcome) 789 + continue 790 + } 791 + 792 + // Only count the send AFTER successful enqueue — failed recipients 793 + // shouldn't burn lifetime send-count budget. Rate counters were 794 + // pre-recorded for the full batch by CheckBatchAndRecord above; that 795 + // over-counts on partial failure but the warming/limit window is 796 + // short enough that the impact is negligible vs. the complexity of 797 + // rolling back per-recipient rate-counter rows. 
798 + store.IncrementSendCount(context.Background(), member.DID) 799 + 800 + outcomes = append(outcomes, outcome) 801 + } 802 + 803 + succeeded, failed, retryAll, lastErr := relay.AggregateRecipientOutcomes(outcomes) 804 + if metrics.PartialDeliveryRecipients != nil { 805 + if succeeded > 0 { 806 + metrics.PartialDeliveryRecipients.WithLabelValues("succeeded").Add(float64(succeeded)) 676 807 } 808 + if failed > 0 { 809 + metrics.PartialDeliveryRecipients.WithLabelValues("failed").Add(float64(failed)) 810 + } 811 + } 812 + if retryAll { 813 + metrics.MessagesRejected.WithLabelValues("delivery_failed").Inc() 814 + log.Printf("smtp.delivery_all_failed: did=%s recipients=%d last_error=%v", member.DID, len(deliverable), lastErr) 815 + return fmt.Errorf("451 delivery queue error — try again later: %w", lastErr) 816 + } 817 + if failed > 0 { 818 + if metrics.PartialDeliveries != nil { 819 + metrics.PartialDeliveries.Inc() 820 + } 821 + log.Printf("smtp.partial_delivery: did=%s succeeded=%d failed=%d last_error=%v", member.DID, succeeded, failed, lastErr) 677 822 } 678 823 679 824 // Emit relay_attempt event after successful queuing. Enrich with ··· 707 852 return nil 708 853 } 709 854 710 - // Inbound SMTP server for bounce processing (port 25) 711 - // 712 - // Build a cached hash→DID map for O(1) bounce lookups. 713 - // Rebuilt from the store on each lookup miss to pick up new enrollments. 714 - memberHashCache := &memberHashMap{} 715 - memberHashCache.rebuild(store) 855 + // Inbound SMTP server for bounce processing (port 25). The cache below 856 + // answers VERP "is this hash a member?" lookups without hitting the DB 857 + // on every inbound. Both a positive cache (rebuilt at most every 30s) 858 + // and a negative cache (5min TTL, 10k entries) defend against random- 859 + // VERP DoS — see #218. 860 + memberHashCache := relay.NewMemberHashCache(relay.MemberHashCacheConfig{ 861 + Rebuild: func() (map[string]string, error) { 862 + members, err := store.ListMembers(context.Background()) 863 + if err != nil { 864 + return nil, err 865 + } 866 + out := make(map[string]string, len(members)) 867 + for _, mb := range members { 868 + out[relay.MemberHashFromDID(mb.DID)] = mb.DID 869 + } 870 + return out, nil 871 + }, 872 + Metrics: metrics, 873 + }) 716 874 717 - inboundMemberLookup := func(ctx context.Context, memberHash string) (string, bool) { 718 - if did, ok := memberHashCache.lookup(memberHash); ok { 719 - return did, true 720 - } 721 - // Cache miss — a new member may have been enrolled since last rebuild 722 - memberHashCache.rebuild(store) 723 - return memberHashCache.lookup(memberHash) 875 + inboundMemberLookup := func(_ context.Context, memberHash string) (string, bool) { 876 + return memberHashCache.Lookup(memberHash) 724 877 } 725 878 726 879 inboundBounceHandler := func(ctx context.Context, memberDID, recipient, bounceType, details string) { ··· 754 907 } 755 908 756 909 inboundServer := relay.NewInboundServer(relay.InboundConfig{ 757 - ListenAddr: cfg.InboundAddr, 758 - Domain: cfg.Domain, 910 + ListenAddr: cfg.InboundAddr, 911 + Domain: cfg.Domain, 912 + RateLimitMsgsPerMinute: cfg.InboundRateLimitMsgsPerMinute, 913 + RateLimitBurst: cfg.InboundRateLimitBurst, 759 914 }, inboundBounceHandler, inboundMemberLookup) 760 915 761 916 // Inbound reply forwarding: classify inbound mail and deliver replies ··· 823 978 ctx, cancel := context.WithCancel(context.Background()) 824 979 defer cancel() 825 980 981 + // Osprey labelcheck cache snapshotter (#215). 
Persists the 982 + // in-memory enforcer cache every 60s so a relay restart doesn't 983 + // reset to fully cold. Combined with fail-closed-on-cold-cache 984 + // in the enforcer, this turns the previously-load-bearing 985 + // fail-open path into a rare edge case (snapshot read failed 986 + // AND Osprey unreachable AND DID has never been seen). 987 + if ospreyEnforcer != nil { 988 + relay.GoSafe("osprey.cache_snapshot", func() { 989 + t := time.NewTicker(60 * time.Second) 990 + defer t.Stop() 991 + for { 992 + select { 993 + case <-ctx.Done(): 994 + if err := ospreyEnforcer.Snapshot(); err != nil { 995 + log.Printf("osprey.cache.snapshot_error_on_shutdown: %v", err) 996 + } 997 + return 998 + case <-t.C: 999 + if err := ospreyEnforcer.Snapshot(); err != nil { 1000 + log.Printf("osprey.cache.snapshot_error: %v", err) 1001 + } 1002 + } 1003 + } 1004 + }) 1005 + } 1006 + 1007 + // Osprey DLQ replayer (#214). Drains the on-disk spool back to 1008 + // Kafka every 30s. A sustained Kafka outage manifests as a 1009 + // growing osprey_spool_depth gauge without permanent loss until 1010 + // the cap is hit. Started here, after ctx is in scope, so the 1011 + // loop respects the same shutdown signal as the rest of the 1012 + // long-lived goroutines. 1013 + if ospreyEmitter.Enabled() { 1014 + relay.GoSafe("osprey.replayer", func() { 1015 + t := time.NewTicker(30 * time.Second) 1016 + defer t.Stop() 1017 + for { 1018 + select { 1019 + case <-ctx.Done(): 1020 + return 1021 + case <-t.C: 1022 + n, failed, err := ospreyEmitter.ReplaySpool(ctx) 1023 + if err != nil { 1024 + log.Printf("osprey.replay.error: %v", err) 1025 + continue 1026 + } 1027 + if n > 0 || failed > 0 { 1028 + log.Printf("osprey.replay: replayed=%d failed=%d", n, failed) 1029 + } 1030 + } 1031 + } 1032 + }) 1033 + } 1034 + 826 1035 sigCh := make(chan os.Signal, 1) 827 1036 signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) 828 - go func() { 1037 + relay.GoSafe("signal.shutdown", func() { 829 1038 sig := <-sigCh 830 1039 log.Printf("received %s, shutting down", sig) 831 1040 cancel() 832 - }() 1041 + }) 833 1042 834 - // Start SMTP server 1043 + // Start SMTP server. TLS uses CertReloader (#216) so ACME cert 1044 + // renewals are picked up automatically without a process restart. 1045 + // Previously the ACME reloadServices hook restarted the relay 1046 + // every 60-90 days, dropping in-flight SMTP/HTTP sessions and 1047 + // triggering the spool-reload race in #208. With GetCertificate, 1048 + // the next TLS handshake after renewal serves the new cert with 1049 + // zero session disruption. 835 1050 var tlsConfig *tls.Config 836 1051 if cfg.TLSCertFile != "" && cfg.TLSKeyFile != "" { 837 - cert, err := tls.LoadX509KeyPair(cfg.TLSCertFile, cfg.TLSKeyFile) 1052 + reloader, err := relay.NewCertReloader(cfg.TLSCertFile, cfg.TLSKeyFile) 838 1053 if err != nil { 839 - // On first deploy, ACME cert may not exist yet. Start without TLS; 840 - // the ACME reloadServices hook will restart us once the cert is ready. 
841 1054 log.Printf("system.tls_unavailable: error=%v (SMTP will run without STARTTLS until cert is provisioned)", err) 842 1055 } else { 843 1056 tlsConfig = &tls.Config{ 844 - Certificates: []tls.Certificate{cert}, 845 - MinVersion: tls.VersionTLS12, 1057 + GetCertificate: reloader.GetCertificate, 1058 + MinVersion: tls.VersionTLS12, 846 1059 } 847 - log.Printf("system.tls_loaded: cert=%s", cfg.TLSCertFile) 1060 + log.Printf("system.tls_loaded: cert=%s key=%s reloader=auto", cfg.TLSCertFile, cfg.TLSKeyFile) 848 1061 } 849 1062 } 850 1063 ··· 858 1071 Verifier: dns.NewVerifier(net.DefaultResolver), 859 1072 })) 860 1073 861 - go func() { 1074 + relay.GoSafe("smtp.serve", func() { 862 1075 log.Printf("SMTP server listening on %s", cfg.SMTPAddr) 863 1076 if err := smtpServer.ListenAndServe(); err != nil { 864 1077 log.Printf("smtp server: %v", err) 865 1078 } 866 - }() 1079 + }) 867 1080 868 1081 // SPF alignment checker 869 1082 spfChecker := relay.NewSPFChecker(cfg.Domain, "", nil) ··· 880 1093 // Periodically sweep expired pending enrollments so stale rows don't 881 1094 // accumulate. One-per-hour is plenty given 24h TTL and UNIQUE(domain) 882 1095 // already guarantees the table stays small in practice. 883 - go func() { 1096 + relay.GoSafe("pending_enrollment_cleanup", func() { 884 1097 t := time.NewTicker(1 * time.Hour) 885 1098 defer t.Stop() 886 1099 for range t.C { ··· 891 1104 log.Printf("pending_enrollment_cleanup: expired=%d", n) 892 1105 } 893 1106 } 894 - }() 1107 + }) 1108 + 1109 + // SQLite pool-stats sampler (#210). Polls sql.DB.Stats() every 1110 + // 10s and republishes the values as Prometheus gauges so 1111 + // operators can graph pool pressure (open/in-use/idle) and 1112 + // contention (WaitCount, WaitDuration) without a busy-error 1113 + // ever escaping the 5s busy_timeout PRAGMA. Combined with 1114 + // metrics.SQLiteBusyErrors at hot writers, this turns the 1115 + // previously-invisible contention surface into both a leading 1116 + // indicator (pool waits climbing) AND a firing one (busy 1117 + // errors actually returned). 1118 + relay.GoSafe("sqlite.stats", func() { 1119 + t := time.NewTicker(10 * time.Second) 1120 + defer t.Stop() 1121 + for { 1122 + select { 1123 + case <-ctx.Done(): 1124 + return 1125 + case <-t.C: 1126 + ps := store.SampleStats() 1127 + metrics.SetSQLiteStats(ps.OpenConnections, ps.InUse, ps.Idle, ps.WaitCount, ps.WaitDurationSecond) 1128 + } 1129 + } 1130 + }) 1131 + 1132 + // Orphan-reconciliation janitor (#208). Finds messages rows that 1133 + // are still status=queued long after creation but have no spool 1134 + // file backing them, and marks them failed so dashboards stop 1135 + // showing them as in-flight forever and operators can see the 1136 + // rate via metrics.OrphanReconciled. 1137 + // 1138 + // Why this is necessary: a multi-recipient batch where recipient 1139 + // N's queue.Enqueue fails after recipients 1..N-1 succeeded 1140 + // leaves an N-th row at status=queued with no spool entry. The 1141 + // SMTP session returns 4xx; the client retries; rows for 1142 + // recipients 1..N-1 get duplicated; the original N-th row is 1143 + // orphaned. Fixing the duplicate-delivery side requires changing 1144 + // the SMTP session to accept partial success (#226 follow-up); 1145 + // this janitor closes the orphan accounting in the meantime. 1146 + // 1147 + // orphanMinAge gives Enqueue plenty of time to land its spool 1148 + // file before we second-guess. 5 minutes is far longer than any 1149 + // reasonable Enqueue path. 
1150 + const orphanMinAge = 5 * time.Minute 1151 + relay.GoSafe("orphan_reconcile", func() { 1152 + t := time.NewTicker(5 * time.Minute) 1153 + defer t.Stop() 1154 + for range t.C { 1155 + ids, err := store.ListQueuedMessageIDsOlderThan(context.Background(), orphanMinAge, 500) 1156 + if err != nil { 1157 + log.Printf("orphan_reconcile: list_error=%v", err) 1158 + continue 1159 + } 1160 + closed := 0 1161 + for _, id := range ids { 1162 + if spool.Exists(id) { 1163 + continue 1164 + } 1165 + if err := store.UpdateMessageStatus(context.Background(), id, relaystore.MsgFailed, 0); err != nil { 1166 + log.Printf("orphan_reconcile: update_error id=%d error=%v", id, err) 1167 + continue 1168 + } 1169 + closed++ 1170 + metrics.OrphanReconciled.Inc() 1171 + } 1172 + if closed > 0 { 1173 + log.Printf("orphan_reconcile: scanned=%d closed=%d", len(ids), closed) 1174 + } 1175 + } 1176 + }) 1177 + 1178 + // Periodic refresh of the inbound member-hash cache (#218). The cache 1179 + // rebuilds on-miss too, but that path is rate-limited to one rebuild 1180 + // per 30s; this background ticker guarantees newly enrolled members 1181 + // become resolvable within ~60s without needing a miss to trigger it. 1182 + relay.GoSafe("member_hash_refresh", func() { 1183 + memberHashCache.PeriodicRebuild(ctx, 60*time.Second) 1184 + }) 1185 + 1186 + // Bypass-expiry janitor (#213). Runs every 5min; removes bypass 1187 + // entries whose expires_at has passed and writes 'expired' audit 1188 + // rows. Without this, an admin token compromise that issued a 1189 + // long bypass would persist past any reasonable detection 1190 + // window — even with the expiry recorded, removal needs an 1191 + // active sweep. Legacy bypass entries (expires_at='') are NOT 1192 + // touched; operators must explicitly re-add with expiry. 1193 + relay.GoSafe("bypass_expiry", func() { 1194 + t := time.NewTicker(5 * time.Minute) 1195 + defer t.Stop() 1196 + for { 1197 + select { 1198 + case <-ctx.Done(): 1199 + return 1200 + case <-t.C: 1201 + // Snapshot the live set before purge so we can mirror 1202 + // the eviction into the labelChecker's in-memory bypass 1203 + // list. The store path uses formatTime cutoffs; the 1204 + // in-memory set is just a string slice, so we recompute 1205 + // the diff: anything in labelChecker.BypassDIDs() that 1206 + // isn't in the post-purge store list has expired. 
1207 + n, err := store.PurgeExpiredBypassDIDs(context.Background()) 1208 + if err != nil { 1209 + log.Printf("bypass_expiry: error=%v", err) 1210 + continue 1211 + } 1212 + if n == 0 { 1213 + continue 1214 + } 1215 + active, err := store.ListBypassDIDs(context.Background()) 1216 + if err != nil { 1217 + log.Printf("bypass_expiry: list_error=%v", err) 1218 + continue 1219 + } 1220 + keep := make(map[string]struct{}, len(active)) 1221 + for _, d := range active { 1222 + keep[d] = struct{}{} 1223 + } 1224 + for _, d := range labelChecker.BypassDIDs() { 1225 + if _, ok := keep[d]; !ok { 1226 + labelChecker.RemoveBypassDID(d) 1227 + } 1228 + } 1229 + log.Printf("bypass_expiry: removed=%d", n) 1230 + } 1231 + } 1232 + }) 895 1233 896 1234 // Start admin API (includes /metrics endpoint) 897 1235 adminAPI := admin.NewComplete(store, cfg.AdminToken, cfg.Domain, labelChecker, spfChecker, domainVerifier) ··· 942 1280 for i := range seedList { 943 1281 seedList[i] = strings.TrimSpace(seedList[i]) 944 1282 } 1283 + var fromParts []string 1284 + if fp := os.Getenv("WARMUP_FROM_LOCAL_PARTS"); fp != "" { 1285 + for _, p := range strings.Split(fp, ",") { 1286 + fromParts = append(fromParts, strings.TrimSpace(p)) 1287 + } 1288 + } 945 1289 ws := relay.NewWarmupSender(relay.WarmupConfig{ 946 1290 SeedAddresses: seedList, 1291 + FromLocalParts: fromParts, 947 1292 MemberLookup: memberLookup, 948 1293 Queue: queue, 949 1294 OperatorKeys: operatorKeys, ··· 965 1310 }) 966 1311 adminAPI.SetWarmupSender(ws) 967 1312 log.Printf("warmup.enabled: seed_count=%d", len(seedList)) 1313 + 1314 + if warmupDIDsEnv := os.Getenv("WARMUP_DIDS"); warmupDIDsEnv != "" { 1315 + var warmupDIDs []string 1316 + for _, d := range strings.Split(warmupDIDsEnv, ",") { 1317 + warmupDIDs = append(warmupDIDs, strings.TrimSpace(d)) 1318 + } 1319 + warmupSched := relay.NewWarmupScheduler(relay.WarmupSchedulerConfig{ 1320 + Sender: ws, 1321 + ListDIDs: func(ctx context.Context) ([]string, error) { 1322 + return warmupDIDs, nil 1323 + }, 1324 + }) 1325 + warmupSched.Start(ctx) 1326 + defer warmupSched.Stop() 1327 + log.Printf("warmup.scheduler: dids=%v", warmupDIDs) 1328 + } 968 1329 } 969 1330 970 1331 // Durable notification queue worker (audit #158). Drains ··· 975 1336 // that rotation mail lands within a minute under normal conditions, 976 1337 // slow enough not to hammer an already-struggling downstream. 977 1338 notifyWorker := notify.NewQueueWorker(store, adminAPI.DeliverNotification, 15*time.Second) 978 - go func() { 1339 + relay.GoSafe("notify.queue", func() { 979 1340 if err := notifyWorker.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { 980 1341 log.Printf("notify.queue: %v", err) 981 1342 } 982 - }() 1343 + }) 983 1344 log.Printf("notify.queue.enabled: tick=15s max_attempts=%d", relaystore.MaxNotificationAttempts) 984 1345 985 1346 dashboardUI := adminui.NewWithQueue(store, labelChecker, func() int { return queue.Depth() }) ··· 1043 1404 WriteTimeout: 30 * time.Second, 1044 1405 IdleTimeout: 120 * time.Second, 1045 1406 } 1046 - go func() { 1407 + relay.GoSafe("admin.serve", func() { 1047 1408 log.Printf("admin API listening on %s", cfg.AdminAddr) 1048 1409 if err := adminServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { 1049 1410 log.Printf("admin server: %v", err) 1050 1411 } 1051 - }() 1412 + }) 1052 1413 1053 1414 // Public HTTPS listener — answers on multiple hostnames with different 1054 1415 // roles. 
See internal/relay/publicrouter.go for the routing rules: ··· 1074 1435 enrollHandler := adminui.NewEnrollHandler(adminAPI, didResolver) 1075 1436 enrollHandler.SetDomainLister(storeDomainLister{store: store}) 1076 1437 enrollHandler.SetFunnelRecorder(metrics) 1438 + // Bind enrollment to OAuth-verified DIDs (#207). Without this 1439 + // wire, /admin/enroll-start and /admin/enroll accept any DID 1440 + // from a request body — letting an attacker who only owns a 1441 + // domain enroll under any victim's atproto identity. 1442 + adminAPI.SetEnrollAuthVerifier(enrollHandler) 1077 1443 // Enable /enroll/label-status for the success-page polling UX. 1078 1444 // LabelChecker is tailnet-only; proxying through the relay keeps 1079 1445 // labeler connectivity private. ··· 1231 1597 WriteTimeout: 10 * time.Second, 1232 1598 IdleTimeout: 60 * time.Second, 1233 1599 } 1234 - go func() { 1600 + publicErrCh := make(chan error, 1) 1601 + relay.GoSafe("public.serve", func() { 1235 1602 log.Printf("public HTTPS listening on %s", cfg.PublicAddr) 1236 1603 if err := publicServer.ListenAndServeTLS("", ""); err != nil && err != http.ErrServerClosed { 1237 - log.Printf("public server: %v", err) 1604 + publicErrCh <- err 1238 1605 } 1239 - }() 1606 + }) 1607 + relay.GoSafe("public.errwatch", func() { 1608 + if err := <-publicErrCh; err != nil { 1609 + log.Fatalf("public server: %v", err) 1610 + } 1611 + }) 1240 1612 } 1241 1613 } 1242 1614 1243 1615 // Start inbound SMTP server (bounce processing) 1244 - go func() { 1616 + relay.GoSafe("inbound.serve", func() { 1245 1617 log.Printf("inbound SMTP server listening on %s", cfg.InboundAddr) 1246 1618 if err := inboundServer.ListenAndServe(); err != nil { 1247 1619 log.Printf("inbound smtp server: %v", err) 1248 1620 } 1249 - }() 1621 + }) 1250 1622 1251 1623 // Start delivery queue — queueDone closes when Run returns (all in-flight deliveries complete) 1252 1624 queueDone := make(chan struct{}) 1253 - go func() { 1625 + relay.GoSafe("queue.run", func() { 1254 1626 defer close(queueDone) 1255 1627 if err := queue.Run(ctx); err != nil && ctx.Err() == nil { 1256 1628 log.Printf("delivery queue: %v", err) 1257 1629 } 1258 - }() 1630 + }) 1259 1631 1260 1632 // Relay-local Osprey event consumer. Reads osprey.execution_results 1261 1633 // from Kafka and writes to the relay_events table, so the admin UI ··· 1265 1637 var eventsConsumer *relay.OspreyEventConsumer 1266 1638 if cfg.KafkaBroker != "" { 1267 1639 eventsConsumer = relay.NewOspreyEventConsumer(cfg.KafkaBroker, store, relay.WithConsumerMetrics(metrics)) 1268 - go func() { 1640 + relay.GoSafe("events.consumer", func() { 1269 1641 if err := eventsConsumer.Run(ctx); err != nil && ctx.Err() == nil { 1270 1642 log.Printf("relay_events.consumer: %v", err) 1271 1643 } 1272 - }() 1644 + }) 1273 1645 log.Printf("relay_events.enabled: broker=%s topic=%s", cfg.KafkaBroker, relay.OspreyOutputTopic) 1274 1646 } 1275 1647 ··· 1291 1663 // "was-queried-recently". Without this the gauges falsely report 1292 1664 // unreachable during quiet periods (between sends) — an outage at 1293 1665 // 3 AM would look identical to idle. 1294 - go func() { 1666 + relay.GoSafe("health.probe", func() { 1295 1667 // Short initial delay so the first probe runs ~10s after startup, 1296 1668 // giving dependent services time to become ready after a deploy. 
1297 1669 initialDelay := time.NewTimer(10 * time.Second) ··· 1349 1721 probe() 1350 1722 } 1351 1723 } 1352 - }() 1724 + }) 1353 1725 1354 1726 // Periodic rate counter cleanup (every hour) 1355 - go func() { 1727 + relay.GoSafe("rate_counter.cleanup", func() { 1356 1728 ticker := time.NewTicker(1 * time.Hour) 1357 1729 defer ticker.Stop() 1358 1730 for { ··· 1403 1775 } 1404 1776 } 1405 1777 } 1406 - }() 1778 + }) 1407 1779 1408 1780 <-ctx.Done() 1409 1781 ··· 1477 1849 func loadConfig(path string) (*RelayConfig, error) { 1478 1850 data, err := os.ReadFile(path) 1479 1851 if err != nil { 1480 - return nil, fmt.Errorf("read config %s: %v", path, err) 1852 + return nil, fmt.Errorf("read config %s: %w", path, err) 1481 1853 } 1482 1854 1483 1855 var cfg RelayConfig 1484 1856 if err := json.Unmarshal(data, &cfg); err != nil { 1485 - return nil, fmt.Errorf("parse config %s: %v", path, err) 1857 + return nil, fmt.Errorf("parse config %s: %w", path, err) 1486 1858 } 1487 1859 1488 1860 // Env var overrides ··· 1506 1878 if cfg.Domain == "" { 1507 1879 cfg.Domain = "atmos.email" 1508 1880 } 1881 + if cfg.InboundRateLimitMsgsPerMinute == 0 { 1882 + cfg.InboundRateLimitMsgsPerMinute = 30 1883 + } 1884 + if cfg.InboundRateLimitBurst == 0 { 1885 + cfg.InboundRateLimitBurst = 10 1886 + } 1509 1887 if cfg.InboundAddr == "" { 1510 1888 cfg.InboundAddr = ":25" 1511 1889 } ··· 1534 1912 func deserializeDKIMKeys(rsaBytes, edBytes []byte) (*rsa.PrivateKey, ed25519.PrivateKey, error) { 1535 1913 rsaRaw, err := x509.ParsePKCS8PrivateKey(rsaBytes) 1536 1914 if err != nil { 1537 - return nil, nil, fmt.Errorf("parse RSA key: %v", err) 1915 + return nil, nil, fmt.Errorf("parse RSA key: %w", err) 1538 1916 } 1539 1917 rsaKey, ok := rsaRaw.(*rsa.PrivateKey) 1540 1918 if !ok { ··· 1543 1921 1544 1922 edRaw, err := x509.ParsePKCS8PrivateKey(edBytes) 1545 1923 if err != nil { 1546 - return nil, nil, fmt.Errorf("parse Ed25519 key: %v", err) 1924 + return nil, nil, fmt.Errorf("parse Ed25519 key: %w", err) 1547 1925 } 1548 1926 edKey, ok := edRaw.(ed25519.PrivateKey) 1549 1927 if !ok { ··· 1551 1929 } 1552 1930 1553 1931 return rsaKey, edKey, nil 1554 - } 1555 - 1556 - // memberHashMap caches VERP member hash → DID mappings for O(1) bounce lookups. 1557 - type memberHashMap struct { 1558 - mu sync.RWMutex 1559 - m map[string]string // hash → DID 1560 - } 1561 - 1562 - func (h *memberHashMap) rebuild(store *relaystore.Store) { 1563 - members, err := store.ListMembers(context.Background()) 1564 - if err != nil { 1565 - log.Printf("inbound.member_cache_rebuild_error: error=%v", err) 1566 - return 1567 - } 1568 - newMap := make(map[string]string, len(members)) 1569 - for _, m := range members { 1570 - newMap[relay.MemberHashFromDID(m.DID)] = m.DID 1571 - } 1572 - h.mu.Lock() 1573 - h.m = newMap 1574 - h.mu.Unlock() 1575 - } 1576 - 1577 - func (h *memberHashMap) lookup(hash string) (string, bool) { 1578 - h.mu.RLock() 1579 - defer h.mu.RUnlock() 1580 - did, ok := h.m[hash] 1581 - return did, ok 1582 1932 } 1583 1933 1584 1934 // extractMessageID extracts the Message-ID header from raw message data.
+97
cmd/rotate-dkim/main.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package main 4 + 5 + import ( 6 + "crypto/ed25519" 7 + "crypto/x509" 8 + "encoding/base64" 9 + "encoding/json" 10 + "flag" 11 + "fmt" 12 + "os" 13 + "time" 14 + 15 + "atmosphere-mail/internal/relay" 16 + ) 17 + 18 + func main() { 19 + domain := flag.String("domain", "atmos.email", "domain the DKIM keys sign for") 20 + outDir := flag.String("out", ".", "directory to write the key file") 21 + flag.Parse() 22 + 23 + now := time.Now().UTC() 24 + selector := relay.OperatorDKIMSelector(now) 25 + 26 + keys, err := relay.GenerateDKIMKeys(selector) 27 + if err != nil { 28 + fmt.Fprintf(os.Stderr, "error: %v\n", err) 29 + os.Exit(1) 30 + } 31 + 32 + rsaBytes, err := x509.MarshalPKCS8PrivateKey(keys.RSAPriv) 33 + if err != nil { 34 + fmt.Fprintf(os.Stderr, "error marshalling RSA key: %v\n", err) 35 + os.Exit(1) 36 + } 37 + edBytes, err := x509.MarshalPKCS8PrivateKey(keys.EdPriv) 38 + if err != nil { 39 + fmt.Fprintf(os.Stderr, "error marshalling Ed25519 key: %v\n", err) 40 + os.Exit(1) 41 + } 42 + 43 + type keyFile struct { 44 + Selector string `json:"selector"` 45 + Domain string `json:"domain"` 46 + RSAPriv string `json:"rsa_priv"` 47 + EdPriv string `json:"ed_priv"` 48 + } 49 + 50 + f := keyFile{ 51 + Selector: keys.Selector, 52 + Domain: *domain, 53 + RSAPriv: base64.StdEncoding.EncodeToString(rsaBytes), 54 + EdPriv: base64.StdEncoding.EncodeToString(edBytes), 55 + } 56 + out, err := json.MarshalIndent(&f, "", " ") 57 + if err != nil { 58 + fmt.Fprintf(os.Stderr, "error: %v\n", err) 59 + os.Exit(1) 60 + } 61 + 62 + outPath := fmt.Sprintf("%s/operator-dkim-keys-%s.json", *outDir, now.Format("20060102")) 63 + if err := os.WriteFile(outPath, out, 0o600); err != nil { 64 + fmt.Fprintf(os.Stderr, "error writing key file: %v\n", err) 65 + os.Exit(1) 66 + } 67 + 68 + rsaPub, _ := x509.MarshalPKIXPublicKey(&keys.RSAPriv.PublicKey) 69 + edPub := keys.EdPriv.Public().(ed25519.PublicKey) 70 + 71 + fmt.Println("=== DKIM Key Rotation ===") 72 + fmt.Println() 73 + fmt.Printf("Key file: %s\n", outPath) 74 + fmt.Printf("Selector: %s\n", keys.Selector) 75 + fmt.Printf("Domain: %s\n", *domain) 76 + fmt.Println() 77 + fmt.Println("--- DNS TXT Records ---") 78 + fmt.Println() 79 + fmt.Printf(" %sr._domainkey.%s TXT %s\n", keys.Selector, *domain, keys.RSADNSRecord()) 80 + fmt.Printf(" %se._domainkey.%s TXT %s\n", keys.Selector, *domain, keys.EdDNSRecord()) 81 + fmt.Println() 82 + fmt.Println("--- Terraform Variables ---") 83 + fmt.Println() 84 + fmt.Printf(" relay_dkim_rsa_pubkey = %q\n", base64.StdEncoding.EncodeToString(rsaPub)) 85 + fmt.Printf(" relay_dkim_ed25519_pubkey = %q\n", base64.StdEncoding.EncodeToString(edPub)) 86 + fmt.Println() 87 + fmt.Println("--- Rotation Steps ---") 88 + fmt.Println() 89 + fmt.Println("1. Publish new DNS records via Terraform (keep old records active)") 90 + fmt.Println(" tofu apply -var 'relay_dkim_rsa_pubkey=...' -var 'relay_dkim_ed25519_pubkey=...'") 91 + fmt.Println("2. Wait 2+ hours for DNS propagation (TTL is 1h)") 92 + fmt.Printf("3. Copy %s to relay:\n", outPath) 93 + fmt.Println(" scp", outPath, "root@atmos-relay:/var/lib/atmos-relay/operator-dkim-keys.json") 94 + fmt.Println("4. Restart: systemctl restart atmos-relay") 95 + fmt.Println("5. Verify: send test email, check DKIM-Signature header uses new selector") 96 + fmt.Println("6. After 7 days: remove old DNS records from Terraform") 97 + }
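The rotation tool prints `keys.RSADNSRecord()` and `keys.EdDNSRecord()` without showing them. For reference, DKIM TXT values follow RFC 6376 for RSA (`p=` carries the base64 SubjectPublicKeyInfo) and RFC 8463 for Ed25519 (`p=` carries the base64 of the raw 32-byte public key). The helpers below are a hedged reconstruction under those standards, not the project's implementation.

```go
package main

import (
	"crypto/ed25519"
	"crypto/rand"
	"crypto/rsa"
	"crypto/x509"
	"encoding/base64"
	"fmt"
)

// rsaDNSRecord builds an RFC 6376-style DKIM TXT value from an RSA public key.
func rsaDNSRecord(pub *rsa.PublicKey) (string, error) {
	der, err := x509.MarshalPKIXPublicKey(pub)
	if err != nil {
		return "", err
	}
	return "v=DKIM1; k=rsa; p=" + base64.StdEncoding.EncodeToString(der), nil
}

// edDNSRecord builds an RFC 8463-style DKIM TXT value: p= carries the raw
// Ed25519 public key bytes, not a SubjectPublicKeyInfo wrapper.
func edDNSRecord(pub ed25519.PublicKey) string {
	return "v=DKIM1; k=ed25519; p=" + base64.StdEncoding.EncodeToString(pub)
}

func main() {
	rsaKey, _ := rsa.GenerateKey(rand.Reader, 2048)
	edPub, _, _ := ed25519.GenerateKey(rand.Reader)

	rec, _ := rsaDNSRecord(&rsaKey.PublicKey)
	fmt.Println(rec)
	fmt.Println(edDNSRecord(edPub))
}
```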
+3 -4
docs/blog-alpha-launch.md
··· 88 88 members, inbound log, shadow-verdicts, review queue for 89 89 auto-suspensions. 90 90 - **FBL integrations**: Gmail Postmaster Tools verified, Microsoft 91 - SNDS + JMRP registered, Yahoo CFL pending. Pool-level registration 91 + SNDS + JMRP registered, Yahoo CFL verified. All three major US 92 + mailbox-provider feedback loops are live. Pool-level registration 92 93 via `d=atmos.email` signing means one registration per provider 93 94 covers every member. 94 95 - **Atproto OAuth** (PAR + DPoP + PKCE + `private_key_jwt`) for ··· 132 133 dashboard. Rules will be frozen at their current behavior by a 133 134 harness that publishes fixtures to a test Kafka and asserts on 134 135 verdicts. 135 - 4. **Yahoo CFL registration.** The last externally-gated FBL 136 - program. Manual form, 1–5 day turnaround. 137 - 5. **Content policies that aren't just abuse.** Transactional-only is 136 + 4. **Content policies that aren't just abuse.** Transactional-only is 138 137 a deliberate v1 constraint; the path to "Postmark for atproto" 139 138 runs through richer template support and eventually a managed 140 139 API alongside SMTP.
+1 -1
docs/operator-runbook.md
··· 213 213 | Gmail Postmaster Tools | Verified | TXT token published for `atmos.email`; dashboard live at postmaster.google.com. Reputation score needs ~48 h of sending volume to populate. | 214 214 | Microsoft SNDS | IP registered, authorization email landed via operator-forwarder | The enrollment flow required receiving a verification mail at `postmaster@atmos.email` — handled by the operator-forwarder routing described in section 6. | 215 215 | Microsoft JMRP | Registered | FBL recipient `fbl@atmospheremail.com` accepted. First complaint probe will confirm the delivery path. | 216 - | Yahoo CFL | Pending | Manual form at `senders.yahooinc.com/complaint-feedback-loop/` — no API. Tracked as the last externally-gated item before the FBL triangle is complete. | 216 + | Yahoo CFL | Verified 2026-04-20 | Domain verified via TXT (`yahoo-verification-key=…`) at the atmos.email apex. Verification record is a no-op now; tracked for removal in chainlink #144. Complaints will arrive at `fbl@atmospheremail.com` once Yahoo begins sending. | 217 217 218 218 Adding a new provider later: publish the FBL recipient as 219 219 `fbl@atmospheremail.com` if they accept an external address, otherwise
+2 -2
go.mod
··· 6 6 7 7 require ( 8 8 github.com/a-h/templ v0.3.1001 9 - github.com/bluesky-social/indigo v0.0.0-20260417172304-7da09df6081d 9 + github.com/bluesky-social/indigo v0.0.0-20260422192121-9bad73ca4cad 10 10 github.com/emersion/go-msgauth v0.7.0 11 11 github.com/emersion/go-sasl v0.0.0-20241020182733-b788ff22d5a6 12 12 github.com/emersion/go-smtp v0.24.0 13 13 github.com/fxamacker/cbor/v2 v2.9.1 14 14 github.com/gorilla/websocket v1.5.3 15 + github.com/jackc/pgx/v5 v5.9.2 15 16 github.com/mr-tron/base58 v1.3.0 16 17 github.com/prometheus/client_golang v1.23.2 17 18 github.com/segmentio/kafka-go v0.4.50 ··· 33 34 github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect 34 35 github.com/jackc/pgpassfile v1.0.0 // indirect 35 36 github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect 36 - github.com/jackc/pgx/v5 v5.9.2 // indirect 37 37 github.com/jackc/puddle/v2 v2.2.2 // indirect 38 38 github.com/klauspost/compress v1.18.0 // indirect 39 39 github.com/kylelemons/godebug v1.1.0 // indirect
+2
go.sum
··· 4 4 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 5 5 github.com/bluesky-social/indigo v0.0.0-20260417172304-7da09df6081d h1:ThKFUrkm2/IZwbvmIKLJYr0wPHibtCkIVmuZCWmdIHM= 6 6 github.com/bluesky-social/indigo v0.0.0-20260417172304-7da09df6081d/go.mod h1:JqQkz8lrOI6YZivP38GHmtVOTtzsNToITKj1gMpU5Jo= 7 + github.com/bluesky-social/indigo v0.0.0-20260422192121-9bad73ca4cad h1:OWhqcY8bjkTYLSd3lnd2orx8sKaiNGzUH+kdV+JQdkw= 8 + github.com/bluesky-social/indigo v0.0.0-20260422192121-9bad73ca4cad/go.mod h1:JqQkz8lrOI6YZivP38GHmtVOTtzsNToITKj1gMpU5Jo= 7 9 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 8 10 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 9 11 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+11
infra/dns.tf
··· 169 169 } 170 170 171 171 # --------------------------------------------------------------------------- 172 + # dnswl.org — domain ownership verification for allowlist registration 173 + # --------------------------------------------------------------------------- 174 + resource "bunnynet_dns_record" "dnswl_verify" { 175 + zone = bunnynet_dns_zone.atmos_email.id 176 + name = "_token._dnswl" 177 + type = "TXT" 178 + value = "r4ooourdma829nvxd8u3tuqvk37ealih" 179 + ttl = local.dns_ttl 180 + } 181 + 182 + # --------------------------------------------------------------------------- 172 183 # Labeler — public XRPC endpoint on atmos-ops 173 184 # --------------------------------------------------------------------------- 174 185 resource "bunnynet_dns_record" "labeler_a" {
+62
infra/main.tf
··· 109 109 port = "41641" 110 110 source_ips = ["0.0.0.0/0", "::/0"] 111 111 } 112 + 113 + # Firewall rules are load-bearing for email deliverability and service 114 + # availability. Accidental deletion would knock out SMTP + HTTPS. 115 + lifecycle { 116 + prevent_destroy = true 117 + } 112 118 } 113 119 114 120 # --------------------------------------------------------------------------- ··· 225 231 port = "41641" 226 232 source_ips = ["0.0.0.0/0", "::/0"] 227 233 } 234 + 235 + lifecycle { 236 + prevent_destroy = true 237 + } 228 238 } 229 239 230 240 # --------------------------------------------------------------------------- ··· 258 268 } 259 269 } 260 270 } 271 + 272 + # --------------------------------------------------------------------------- 273 + # Backup volumes — encrypted block storage for Restic repositories. 274 + # Separate from the boot disk so backups survive server rebuilds. 275 + # NixOS formats these with ext4 + label on first mount; do NOT set 276 + # the `format` argument here (it uses Hetzner's unattended formatter 277 + # which conflicts with NixOS disk management). 278 + # --------------------------------------------------------------------------- 279 + 280 + resource "hcloud_volume" "ops_backup" { 281 + name = "atmos-ops-backup" 282 + size = 20 283 + location = "ash" 284 + 285 + labels = { 286 + managed_by = "opentofu" 287 + role = "backup" 288 + project = "atmosphere-mail" 289 + } 290 + 291 + lifecycle { 292 + prevent_destroy = true 293 + } 294 + } 295 + 296 + resource "hcloud_volume_attachment" "ops_backup" { 297 + volume_id = hcloud_volume.ops_backup.id 298 + server_id = hcloud_server.atmos_ops.id 299 + automount = false 300 + } 301 + 302 + resource "hcloud_volume" "relay_backup" { 303 + name = "atmos-relay-backup" 304 + size = 10 305 + location = "ash" 306 + 307 + labels = { 308 + managed_by = "opentofu" 309 + role = "backup" 310 + project = "atmosphere-mail" 311 + } 312 + 313 + lifecycle { 314 + prevent_destroy = true 315 + } 316 + } 317 + 318 + resource "hcloud_volume_attachment" "relay_backup" { 319 + volume_id = hcloud_volume.relay_backup.id 320 + server_id = hcloud_server.atmos_relay.id 321 + automount = false 322 + }
+172 -9
infra/nixos/atmos-ops.nix
··· 31 31 boot.loader.grub = { 32 32 enable = true; 33 33 efiSupport = false; 34 + configurationLimit = 20; 34 35 }; 35 36 36 37 boot.initrd.availableKernelModules = [ ··· 117 118 }; 118 119 119 120 sops.secrets.bunny_api_key = {}; 121 + sops.secrets.ntfy_gatus_token = {}; 120 122 121 123 # Environment file for label-api (PG DSN) 122 124 sops.templates."label-api-env" = { ··· 311 313 description = "Write Gatus config for status page"; 312 314 wantedBy = [ "docker-atmos-status.service" ]; 313 315 before = [ "docker-atmos-status.service" ]; 316 + after = [ "sops-nix.service" ]; 314 317 serviceConfig = { 315 318 Type = "oneshot"; 316 319 RemainAfterExit = true; 317 320 }; 318 321 script = '' 319 322 mkdir -p /var/lib/atmos-status 320 - cat > /var/lib/atmos-status/config.yaml << 'EOF' 323 + NTFY_TOKEN=$(cat ${config.sops.secrets.ntfy_gatus_token.path}) 324 + cat > /var/lib/atmos-status/config.yaml << EOF 321 325 storage: 322 326 type: sqlite 323 327 path: /data/gatus.db 324 328 329 + alerting: 330 + ntfy: 331 + url: "https://ntfy.internal.example" 332 + topic: "atmos-ops" 333 + token: "$NTFY_TOKEN" 334 + priority: 5 335 + default-alert: 336 + enabled: true 337 + failure-threshold: 3 338 + success-threshold: 2 339 + send-on-resolved: true 340 + 325 341 ui: 326 342 title: Atmosphere Mail Status 327 343 description: Public service health for the Atmosphere Mail cooperative relay ··· 329 345 endpoints: 330 346 - name: Web 331 347 group: core 332 - url: "https://atmos.email/healthz" 333 - interval: 60s 334 - conditions: 335 - - "[STATUS] == 200" 336 - - "[RESPONSE_TIME] < 5000" 337 - 338 - - name: Marketing Site 339 - group: core 340 348 url: "https://atmospheremail.com" 341 349 interval: 60s 342 350 conditions: 343 351 - "[STATUS] == 200" 344 352 - "[RESPONSE_TIME] < 5000" 353 + alerts: 354 + - type: ntfy 345 355 346 356 - name: Labeler 347 357 group: core ··· 350 360 conditions: 351 361 - "[STATUS] == 200" 352 362 - "[RESPONSE_TIME] < 5000" 363 + alerts: 364 + - type: ntfy 353 365 354 366 - name: SMTP Inbound (Port 25) 355 367 group: core ··· 357 369 interval: 60s 358 370 conditions: 359 371 - "[CONNECTED] == true" 372 + alerts: 373 + - type: ntfy 360 374 361 375 - name: SMTP Submission (Port 587) 362 376 group: core ··· 364 378 interval: 60s 365 379 conditions: 366 380 - "[CONNECTED] == true" 381 + alerts: 382 + - type: ntfy 367 383 368 384 - name: MX Record 369 385 group: dns ··· 450 466 ProtectKernelTunables = true; 451 467 ProtectKernelModules = true; 452 468 ProtectControlGroups = true; 469 + RestrictAddressFamilies = [ "AF_INET" "AF_INET6" "AF_UNIX" ]; 470 + RestrictNamespaces = true; 471 + RestrictRealtime = true; 472 + RestrictSUIDSGID = true; 473 + LockPersonality = true; 474 + MemoryDenyWriteExecute = true; 453 475 }; 454 476 }; 455 477 ··· 508 530 ProtectKernelTunables = true; 509 531 ProtectKernelModules = true; 510 532 ProtectControlGroups = true; 533 + RestrictAddressFamilies = [ "AF_INET" "AF_INET6" "AF_UNIX" ]; 534 + RestrictNamespaces = true; 535 + RestrictRealtime = true; 536 + RestrictSUIDSGID = true; 537 + LockPersonality = true; 538 + MemoryDenyWriteExecute = true; 511 539 ReadWritePaths = [ "/var/lib/atmos-labeler" ]; 512 540 }; 513 541 }; ··· 563 591 log { 564 592 output file /var/log/caddy/access-labeler.atmos.email.log 565 593 } 594 + @internal path /admin/* /metrics 595 + respond @internal 404 566 596 reverse_proxy localhost:8081 567 597 ''; 568 598 }; ··· 595 625 curl 596 626 htop 597 627 jq 628 + sqlite 598 629 ]; 599 630 600 631 # 
------------------------------------------------------------------- ··· 606 637 ''; 607 638 608 639 # ------------------------------------------------------------------- 640 + # Backup — encrypted Restic backups to Hetzner Cloud Volume. 641 + # 642 + # Flow: format-backup-volume (first boot) → mount by label (fstab) 643 + # → restic-password-init → restic timer (every 6h). 644 + # 645 + # Restic repo and password both live on the volume so data survives 646 + # a full server rebuild. Password also on boot disk for access. 647 + # ------------------------------------------------------------------- 648 + systemd.services.format-backup-volume = { 649 + description = "Format Hetzner backup volume if unformatted"; 650 + wantedBy = [ "multi-user.target" ]; 651 + serviceConfig = { 652 + Type = "oneshot"; 653 + RemainAfterExit = true; 654 + }; 655 + path = [ pkgs.util-linux pkgs.e2fsprogs pkgs.systemd ]; 656 + script = '' 657 + DEV="" 658 + for d in /dev/disk/by-id/scsi-0HC_Volume_*; do 659 + [ -b "$d" ] && DEV="$d" && break 660 + done 661 + if [ -z "$DEV" ]; then 662 + echo "No Hetzner Cloud Volume found, skipping" 663 + exit 0 664 + fi 665 + RESOLVED=$(readlink -f "$DEV") 666 + if blkid -o value -s TYPE "$DEV" 2>/dev/null | grep -q .; then 667 + echo "$DEV ($RESOLVED) already formatted" 668 + else 669 + echo "Formatting $DEV ($RESOLVED) as ext4 with label atmos-ops-backup" 670 + mkfs.ext4 -L atmos-ops-backup "$DEV" 671 + fi 672 + # Trigger mount if not yet active (handles hot-attached volumes) 673 + if ! mountpoint -q /var/lib/atmos-backup 2>/dev/null; then 674 + systemctl start var-lib-atmos\\x2dbackup.mount 2>/dev/null || true 675 + fi 676 + ''; 677 + }; 678 + 679 + fileSystems."/var/lib/atmos-backup" = { 680 + device = "/dev/disk/by-label/atmos-ops-backup"; 681 + fsType = "ext4"; 682 + options = [ "nofail" "x-systemd.device-timeout=30" ]; 683 + }; 684 + 685 + systemd.services.restic-password-init = { 686 + description = "Generate restic encryption password if missing"; 687 + after = [ "local-fs.target" ]; 688 + wantedBy = [ "multi-user.target" ]; 689 + serviceConfig = { 690 + Type = "oneshot"; 691 + RemainAfterExit = true; 692 + }; 693 + script = '' 694 + if [ ! -f /root/.restic-password ]; then 695 + ${pkgs.coreutils}/bin/head -c 32 /dev/urandom | ${pkgs.coreutils}/bin/base64 > /root/.restic-password 696 + chmod 0400 /root/.restic-password 697 + fi 698 + if ${pkgs.util-linux}/bin/mountpoint -q /var/lib/atmos-backup && [ ! -f /var/lib/atmos-backup/.restic-password ]; then 699 + cp /root/.restic-password /var/lib/atmos-backup/.restic-password 700 + chmod 0400 /var/lib/atmos-backup/.restic-password 701 + fi 702 + ''; 703 + }; 704 + 705 + services.restic.backups.atmos-ops = { 706 + initialize = true; 707 + repository = "/var/lib/atmos-backup/restic-repo"; 708 + passwordFile = "/root/.restic-password"; 709 + paths = [ 710 + "/var/lib/atmos-backup/dumps" 711 + ]; 712 + backupPrepareCommand = '' 713 + if ! 
${pkgs.util-linux}/bin/mountpoint -q /var/lib/atmos-backup; then 714 + echo "ERROR: backup volume not mounted" 715 + exit 1 716 + fi 717 + mkdir -p /var/lib/atmos-backup/dumps 718 + 719 + # PostgreSQL — consistent dump from running container 720 + PGCONTAINER=$(${pkgs.docker}/bin/docker ps -qf name=osprey-postgres 2>/dev/null || true) 721 + if [ -n "$PGCONTAINER" ]; then 722 + ${pkgs.docker}/bin/docker exec "$PGCONTAINER" \ 723 + pg_dump -U osprey -d osprey -Fc \ 724 + > /var/lib/atmos-backup/dumps/osprey.dump.tmp \ 725 + && mv /var/lib/atmos-backup/dumps/osprey.dump.tmp /var/lib/atmos-backup/dumps/osprey.dump 726 + else 727 + echo "WARN: osprey-postgres not running, skipping pg_dump" 728 + fi 729 + 730 + # Labeler SQLite — hot backup via .backup command 731 + if [ -f /var/lib/atmos-labeler/state/labeler.db ]; then 732 + ${pkgs.sqlite}/bin/sqlite3 /var/lib/atmos-labeler/state/labeler.db \ 733 + ".backup '/var/lib/atmos-backup/dumps/labeler.db'" 734 + fi 735 + 736 + # Gatus SQLite 737 + if [ -f /var/lib/atmos-status/gatus.db ]; then 738 + ${pkgs.sqlite}/bin/sqlite3 /var/lib/atmos-status/gatus.db \ 739 + ".backup '/var/lib/atmos-backup/dumps/gatus.db'" 740 + fi 741 + 742 + # Labeler signing key (also in sops, but belt-and-suspenders) 743 + if [ -f /var/lib/atmos-labeler/state/signing.key ]; then 744 + cp /var/lib/atmos-labeler/state/signing.key /var/lib/atmos-backup/dumps/labeler-signing.key 745 + fi 746 + ''; 747 + timerConfig = { 748 + OnCalendar = "*-*-* 00/6:00:00"; 749 + Persistent = true; 750 + RandomizedDelaySec = "30m"; 751 + }; 752 + pruneOpts = [ 753 + "--keep-daily 7" 754 + "--keep-weekly 4" 755 + "--keep-monthly 3" 756 + ]; 757 + }; 758 + 759 + # ------------------------------------------------------------------- 609 760 # Nix 610 761 # ------------------------------------------------------------------- 611 762 nix.settings = { 612 763 experimental-features = [ "nix-command" "flakes" ]; 613 764 trusted-users = [ "root" ]; 765 + }; 766 + 767 + nix.gc = { 768 + automatic = true; 769 + dates = "daily"; 770 + options = "--delete-older-than 5d"; 771 + persistent = true; 772 + }; 773 + 774 + nix.optimise = { 775 + automatic = true; 776 + dates = [ "weekly" ]; 614 777 }; 615 778 }; 616 779 }
+141 -7
infra/nixos/default.nix
··· 35 35 boot.loader.grub = { 36 36 enable = true; 37 37 efiSupport = false; 38 + configurationLimit = 20; 38 39 }; 39 40 40 41 boot.initrd.availableKernelModules = [ ··· 102 103 # Don't restart tailscaled during deploys — avoids SSH drops 103 104 systemd.services.tailscaled.restartIfChanged = false; 104 105 105 - # Tailscale Serve: proxy HTTPS :443 → admin/dashboard on :8080 106 - # Gives clean URLs: https://atmos-relay.internal.example/ui/ 107 - # 108 - # The relay's public HTTPS listener binds to the public IP 109 - # specifically (detected at startup), so Tailscale Serve can use 110 - # :443 on the Tailscale interface without conflict. 106 + # Tailscale Serve: proxy HTTPS on the Tailscale interface → admin 107 + # dashboard on :8080. The relay binds its public HTTPS listener to 108 + # the detected public IP (not 0.0.0.0), so both can use :443 on 109 + # different interfaces without conflict. Requires AF_NETLINK in the 110 + # relay's RestrictAddressFamilies for the IP detection to work. 111 111 systemd.services.tailscale-serve = { 112 112 description = "Configure Tailscale Serve for admin dashboard"; 113 113 after = [ "tailscaled.service" "atmos-relay.service" ]; ··· 151 151 ADMIN_TOKEN=${config.sops.placeholder.admin_token} 152 152 LABELER_URL=${config.sops.placeholder.labeler_url} 153 153 WARMUP_SEED_ADDRESSES=${config.sops.placeholder.warmup_seed_addresses} 154 + WARMUP_FROM_LOCAL_PARTS=scott,hello 155 + WARMUP_DIDS=did:plc:dy67wyyakm7u4v2lthy5zwbn 154 156 ''; 155 157 }; 156 158 ··· 165 167 email = "postmaster@atmos.email"; 166 168 webroot = "/var/lib/acme/.challenges"; 167 169 group = "atmos-relay"; 168 - reloadServices = [ "atmos-relay.service" ]; 170 + # No reloadServices: the relay uses an in-process 171 + # CertReloader (internal/relay/cert_reload.go) that picks 172 + # up new certs on the next TLS handshake via mtime polling. 173 + # A systemd reload/restart would drop in-flight SMTP/HTTP 174 + # sessions every 60-90 days and trigger the spool-reload 175 + # race that #208 fixed — this is exactly the failure mode 176 + # #216 closed. 177 + reloadServices = [ ]; 169 178 }; 170 179 certs."smtp.atmos.email" = {}; 171 180 certs."atmos.email" = {}; ··· 305 314 ProtectKernelTunables = true; 306 315 ProtectKernelModules = true; 307 316 ProtectControlGroups = true; 317 + RestrictAddressFamilies = [ "AF_INET" "AF_INET6" "AF_UNIX" "AF_NETLINK" ]; 318 + RestrictNamespaces = true; 319 + RestrictRealtime = true; 320 + RestrictSUIDSGID = true; 321 + LockPersonality = true; 322 + MemoryDenyWriteExecute = true; 308 323 ReadWritePaths = [ "/var/lib/atmos-relay" ]; 309 324 ReadOnlyPaths = [ 310 325 "/var/lib/acme/smtp.atmos.email" ··· 344 359 curl 345 360 htop 346 361 jq 362 + sqlite 347 363 ]; 348 364 349 365 # ------------------------------------------------------------------- ··· 355 371 ''; 356 372 357 373 # ------------------------------------------------------------------- 374 + # Backup — encrypted Restic backups to Hetzner Cloud Volume. 375 + # 376 + # Same pattern as atmos-ops: auto-format on first boot, mount by 377 + # label, auto-generate restic password, timer every 6h. 378 + # 379 + # Critical data: relay.sqlite, DKIM signing keys, OAuth key. 
380 + # ------------------------------------------------------------------- 381 + systemd.services.format-backup-volume = { 382 + description = "Format Hetzner backup volume if unformatted"; 383 + wantedBy = [ "multi-user.target" ]; 384 + serviceConfig = { 385 + Type = "oneshot"; 386 + RemainAfterExit = true; 387 + }; 388 + path = [ pkgs.util-linux pkgs.e2fsprogs pkgs.systemd ]; 389 + script = '' 390 + DEV="" 391 + for d in /dev/disk/by-id/scsi-0HC_Volume_*; do 392 + [ -b "$d" ] && DEV="$d" && break 393 + done 394 + if [ -z "$DEV" ]; then 395 + echo "No Hetzner Cloud Volume found, skipping" 396 + exit 0 397 + fi 398 + RESOLVED=$(readlink -f "$DEV") 399 + if blkid -o value -s TYPE "$DEV" 2>/dev/null | grep -q .; then 400 + echo "$DEV ($RESOLVED) already formatted" 401 + else 402 + echo "Formatting $DEV ($RESOLVED) as ext4 with label atmos-relay-backup" 403 + mkfs.ext4 -L atmos-relay-backup "$DEV" 404 + fi 405 + if ! mountpoint -q /var/lib/atmos-backup 2>/dev/null; then 406 + systemctl start var-lib-atmos\\x2dbackup.mount 2>/dev/null || true 407 + fi 408 + ''; 409 + }; 410 + 411 + fileSystems."/var/lib/atmos-backup" = { 412 + device = "/dev/disk/by-label/atmos-relay-backup"; 413 + fsType = "ext4"; 414 + options = [ "nofail" "x-systemd.device-timeout=30" ]; 415 + }; 416 + 417 + systemd.services.restic-password-init = { 418 + description = "Generate restic encryption password if missing"; 419 + after = [ "local-fs.target" ]; 420 + wantedBy = [ "multi-user.target" ]; 421 + serviceConfig = { 422 + Type = "oneshot"; 423 + RemainAfterExit = true; 424 + }; 425 + script = '' 426 + if [ ! -f /root/.restic-password ]; then 427 + ${pkgs.coreutils}/bin/head -c 32 /dev/urandom | ${pkgs.coreutils}/bin/base64 > /root/.restic-password 428 + chmod 0400 /root/.restic-password 429 + fi 430 + if ${pkgs.util-linux}/bin/mountpoint -q /var/lib/atmos-backup && [ ! -f /var/lib/atmos-backup/.restic-password ]; then 431 + cp /root/.restic-password /var/lib/atmos-backup/.restic-password 432 + chmod 0400 /var/lib/atmos-backup/.restic-password 433 + fi 434 + ''; 435 + }; 436 + 437 + services.restic.backups.atmos-relay = { 438 + initialize = true; 439 + repository = "/var/lib/atmos-backup/restic-repo"; 440 + passwordFile = "/root/.restic-password"; 441 + paths = [ 442 + "/var/lib/atmos-backup/dumps" 443 + ]; 444 + backupPrepareCommand = '' 445 + if ! 
${pkgs.util-linux}/bin/mountpoint -q /var/lib/atmos-backup; then 446 + echo "ERROR: backup volume not mounted" 447 + exit 1 448 + fi 449 + mkdir -p /var/lib/atmos-backup/dumps 450 + 451 + # Relay SQLite — hot backup 452 + if [ -f /var/lib/atmos-relay/relay.sqlite ]; then 453 + ${pkgs.sqlite}/bin/sqlite3 /var/lib/atmos-relay/relay.sqlite \ 454 + ".backup '/var/lib/atmos-backup/dumps/relay.sqlite'" 455 + fi 456 + 457 + # DKIM signing keys (generated at first boot, no other copy exists) 458 + if [ -f /var/lib/atmos-relay/operator-dkim-keys.json ]; then 459 + cp /var/lib/atmos-relay/operator-dkim-keys.json /var/lib/atmos-backup/dumps/ 460 + fi 461 + 462 + # OAuth signing key 463 + if [ -f /var/lib/atmos-relay/oauth-signing-key.pem ]; then 464 + cp /var/lib/atmos-relay/oauth-signing-key.pem /var/lib/atmos-backup/dumps/ 465 + fi 466 + ''; 467 + timerConfig = { 468 + OnCalendar = "*-*-* 00/6:00:00"; 469 + Persistent = true; 470 + RandomizedDelaySec = "30m"; 471 + }; 472 + pruneOpts = [ 473 + "--keep-daily 7" 474 + "--keep-weekly 4" 475 + "--keep-monthly 3" 476 + ]; 477 + }; 478 + 479 + # ------------------------------------------------------------------- 358 480 # Nix — enable flakes for nixos-rebuild 359 481 # ------------------------------------------------------------------- 360 482 nix.settings = { 361 483 experimental-features = [ "nix-command" "flakes" ]; 362 484 trusted-users = [ "root" ]; 485 + }; 486 + 487 + nix.gc = { 488 + automatic = true; 489 + dates = "daily"; 490 + options = "--delete-older-than 5d"; 491 + persistent = true; 492 + }; 493 + 494 + nix.optimise = { 495 + automatic = true; 496 + dates = [ "weekly" ]; 363 497 }; 364 498 }; 365 499 }
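Deploy-side verification of the 6-hourly schedule, assuming the NixOS restic module's usual restic-backups-<name> unit naming (the generated units are not shown in this diff):

    systemctl list-timers 'restic-backups-*'
    journalctl -u restic-backups-atmos-relay.service -n 50
    # One-off run before the first scheduled window:
    systemctl start restic-backups-atmos-relay.service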
+14
infra/outputs.tf
··· 85 85 After that, all updates go through git push → CI → deploy. 86 86 EOT 87 87 } 88 + 89 + # --------------------------------------------------------------------------- 90 + # Backup volume outputs 91 + # --------------------------------------------------------------------------- 92 + 93 + output "ops_backup_volume_id" { 94 + description = "Hetzner volume ID of the ops backup volume" 95 + value = hcloud_volume.ops_backup.id 96 + } 97 + 98 + output "relay_backup_volume_id" { 99 + description = "Hetzner volume ID of the relay backup volume" 100 + value = hcloud_volume.relay_backup.id 101 + }
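These outputs make it easy to cross-check that the volume a host auto-formatted is the one Terraform provisioned. A sketch, assuming the hcloud CLI is configured for the same project; the hcloud_volume.ops_backup / relay_backup resources themselves live elsewhere in the module and are not shown here:

    terraform output relay_backup_volume_id
    hcloud volume describe "$(terraform output -raw relay_backup_volume_id)"
    # On the host, the same volume appears as the by-id symlink the
    # format-backup-volume unit scans for:
    ls -l /dev/disk/by-id/ | grep HC_Volume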
+1 -1
infra/providers.tf
··· 8 8 } 9 9 bunnynet = { 10 10 source = "BunnyWay/bunnynet" 11 - version = ">= 0.4" 11 + version = "~> 0.4" 12 12 } 13 13 } 14 14
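The pessimistic constraint pins bunnynet to the 0.4.x series instead of floating to whatever is newest; after a constraint change the dependency lock file needs a refresh:

    terraform init -upgrade
    git diff .terraform.lock.hcl   # review the new provider hashes before committing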
+3 -2
infra/secrets/ops.yaml
··· 3 3 labeler_signing_key: ENC[AES256_GCM,data:TaNp9uM0M/r6g5R61dorNQAQwe29PJQTAD6/H4OL4W1pkt+jZWDRzsyKJf42V3kN9lqjKohkVQZHRNqrCZLOsg==,iv:VU8SU4vq2JRT9qa+XYHAQLl6Vvlp6Sdd2e6mzNDsOWU=,tag:11S1pN7YeQXVXHi0MGQ42Q==,type:str] 4 4 labeler_admin_token: ENC[AES256_GCM,data:cymsm1C1wCFUKHRzVw62ljzLGinPcdUBbUB3MDW62pADLErcBvYkzEvOs94=,iv:QxpDZOjEEs+uUonQTrBVLktbAdqyeMoVoaI/6V8vuY8=,tag:DPpClg2/uxvquWOBYvSKrA==,type:str] 5 5 bunny_api_key: ENC[AES256_GCM,data:OdMpHYdwr7FhikBwMxR0aTqvVHc=,iv:hIKLaB/eD9OlVeIQTKXkvCpGgUdcjNHK+gCADLL6tKI=,tag:agw8TUgPVQ5C7UhK4+pp4A==,type:str] 6 + ntfy_gatus_token: ENC[AES256_GCM,data:YLuQTku52WLOIPp4znYnlx9/MIX08CWrhobP/CEf700=,iv:yyiPlRKzKLfZULcSYJUZPD58ntXedFPzzLmUwrJRZR8=,tag:He9bdy7N9+dKqagyPBoU/g==,type:str] 6 7 sops: 7 8 age: 8 9 - recipient: age1kku4ud0z4h6ujn2qums6tupynqq8dhwpcc27kl00rqyeldgmk4lqhcanma ··· 23 24 ZGxJcGdnT2xzWXlFSXJWSlZRZkpKZTQKwZsL+rlt86a2yz0YJf1s77ASq9rOKHXg 24 25 RFy+AtLt/ErRwo77n5P0g7qiPi+2nuq3E1mJDsLGBPdXX2dB73zV4Q== 25 26 -----END AGE ENCRYPTED FILE----- 26 - lastmodified: "2026-04-22T18:16:22Z" 27 - mac: ENC[AES256_GCM,data:BtiqgLeZy4i82Vd1axvictu43RruEtmMbjAkb2OS8yLqPepqbgniUtTXyFPLb9tWPfqDjElJnaWmLxc6pzoASlGsymJ3evjlqfGg4GvVUj5g+ENoddbQEDPfGZKIdIVFSyjjFDIiQinbOHalaGt8pwIO3KrMjdDOihxHZMK0CNI=,iv:eAtLeBinfMY4Rq91cqscxZfwfLPgymXuIkhlYLAjMK4=,tag:JNB9IvE+GV/LoTBYKwrduw==,type:str] 27 + lastmodified: "2026-04-29T05:37:39Z" 28 + mac: ENC[AES256_GCM,data:IIcSl8hrBHjaQNS9OTv+SLeHDUBGQZQFFCao8Mx/wH3geAdw8318srSqa1RZsgvstK+kFmEWAtS6i8jQ6PcmEC2kkajv+XMkPT8CpTnk7a+Qj+Gfj/XOw2DNglNXnpZUffzR+p+gmXVSD1Q8e3tf5pxkt/EpI+yV10DcFlErQCU=,iv:d+ODakNtJOThSCmltphgt7vX/XSsCJStVvi1oLeGACI=,tag:ZMvH2N8zVuqF9Vqton8/yA==,type:str] 28 29 unencrypted_suffix: _unencrypted 29 30 version: 3.11.0
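The new ntfy_gatus_token rides in the same age-encrypted file, which is why lastmodified and mac churn in this hunk. For reference, editing and spot-reading the value (standard sops usage, nothing project-specific; requires a matching age key):

    # Add or rotate the token; sops re-encrypts on save.
    sops infra/secrets/ops.yaml

    # Read a single key without writing plaintext to disk.
    sops -d --extract '["ntfy_gatus_token"]' infra/secrets/ops.yaml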
+238 -245
internal/admin/api.go
··· 6 6 "context" 7 7 "crypto/rand" 8 8 "crypto/subtle" 9 - "crypto/x509" 10 9 "encoding/hex" 11 10 "encoding/json" 12 - "errors" 13 11 "fmt" 14 12 "io" 15 13 "log" ··· 163 161 // state across all requests — constructed in NewComplete and never 164 162 // swapped out, so no additional locking at the field level. 165 163 enrollStartLimiter *rateLimiter 164 + 165 + // enrollAuthVerifier, when non-nil, gates /admin/enroll-start and 166 + // /admin/enroll on a successful AT Proto OAuth-verified DID. Nil 167 + // preserves legacy behavior (DNS-TXT-only ownership proof). Wired 168 + // via SetEnrollAuthVerifier in cmd/relay after the EnrollHandler 169 + // exists; do NOT enroll new members in production with this nil. 170 + enrollAuthVerifier EnrollAuthVerifier 171 + } 172 + 173 + // EnrollAuthVerifier resolves an OAuth-verified DID from an inbound 174 + // HTTP request (typically by reading an enroll-auth ticket cookie set 175 + // after a successful AT Proto OAuth flow). Returns the verified DID 176 + // and true on success, or "" / false when no proof is present. 177 + // 178 + // The relay package owns the implementation; the admin API only 179 + // consumes the interface so its package boundary stays free of UI 180 + // imports. See internal/admin/ui (EnrollHandler.VerifyAuthCookie). 181 + type EnrollAuthVerifier interface { 182 + VerifyAuthCookie(r *http.Request) (did string, ok bool) 183 + } 184 + 185 + // SetEnrollAuthVerifier installs the OAuth-verifier dependency. Must 186 + // be called before /admin/enroll-start and /admin/enroll receive 187 + // production traffic; safe to call once during wiring. 188 + func (a *API) SetEnrollAuthVerifier(v EnrollAuthVerifier) { 189 + a.enrollAuthVerifier = v 166 190 } 167 191 168 192 // maxDomainsPerMember is the soft limit on how many sending domains a ··· 236 260 // Public (API-key-authenticated) endpoint for members to check their own status. 237 261 // No admin token required — authenticated by the member's SMTP API key. 238 262 a.mux.HandleFunc("/member/status", a.handleMemberSelfStatus) 263 + // Self-service deliverability metrics — API-key-authenticated like /member/status. 264 + a.mux.HandleFunc("/member/deliverability", a.handleMemberSelfDeliverability) 239 265 // Self-service forward_to update — API-key-authenticated like /member/status. 240 266 a.mux.HandleFunc("/member/forward-to", a.handleMemberSelfForwardTo) 241 267 return a ··· 343 369 http.Error(w, "invalid domain format", http.StatusBadRequest) 344 370 return 345 371 } 372 + 373 + // OAuth-verified DID gate. Without this check, any caller can claim 374 + // any DID — the only ownership proof in the legacy flow is the DNS 375 + // TXT record, which only proves *domain* control. An attacker who 376 + // owns example.com can otherwise enroll under a victim DID, and 377 + // any subsequent operator-approved send burns the victim's atproto 378 + // reputation via FBL attribution. Closes #207. 
379 + if a.enrollAuthVerifier != nil { 380 + verifiedDID, ok := a.enrollAuthVerifier.VerifyAuthCookie(r) 381 + if !ok { 382 + log.Printf("admin.enroll_start.no_oauth: claimed_did=%s", did) 383 + http.Error(w, "identity verification required — sign in with your handle before enrolling a domain", http.StatusForbidden) 384 + return 385 + } 386 + if !strings.EqualFold(verifiedDID, did) { 387 + log.Printf("admin.enroll_start.did_mismatch: claimed=%s verified=%s", did, verifiedDID) 388 + http.Error(w, "claimed DID does not match the verified identity from your sign-in", http.StatusForbidden) 389 + return 390 + } 391 + } 346 392 // ContactEmail is optional in the API — the wizard always collects it, 347 393 // but admin-driven callers (force-enroll flows, tests) can skip it. A 348 394 // non-empty value must look like an email so a typoed mailbox doesn't ··· 476 522 Port int `json:"port"` 477 523 } 478 524 525 + // handleEnroll completes an enrollment by token. The handler is a thin 526 + // orchestration over the phase functions in enroll_phases.go (#223): 527 + // 528 + // validate → loadAndVerifyPending → checkDomainAvailable 529 + // → provision → persist → dispatch → respond 530 + // 531 + // Each phase returns either a result or an *enrollHTTPError that the 532 + // orchestrator renders verbatim. Phases are individually unit-testable. 479 533 func (a *API) handleEnroll(w http.ResponseWriter, r *http.Request) { 480 534 if r.Method != http.MethodPost { 481 535 http.Error(w, "method not allowed", http.StatusMethodNotAllowed) ··· 490 544 return 491 545 } 492 546 493 - // Optional: forward_to specifies where inbound mail to this domain 494 - // should be delivered. Empty = inbound disabled for this domain. 495 - // Can be set later via PUT /admin/domain/{domain}/forward-to. 496 - forwardTo := r.URL.Query().Get("forward_to") 497 - if forwardTo != "" && !strings.Contains(forwardTo, "@") { 498 - http.Error(w, "forward_to must be a valid email address", http.StatusBadRequest) 499 - return 500 - } 501 - 502 - // The JSON body supplies a token that we look up in pending_enrollments. 503 - // The DID + domain come from that row, NOT from the request — that's 504 - // what prevents a caller from pairing someone else's verified DNS 505 - // record with their own DID. 506 - body, err := io.ReadAll(io.LimitReader(r.Body, 4096)) 507 - if err != nil { 508 - http.Error(w, "error reading request body", http.StatusBadRequest) 509 - return 510 - } 511 - if len(body) == 0 { 512 - http.Error(w, "enrollment token required: POST JSON body with {\"token\": \"...\"}", http.StatusBadRequest) 513 - return 514 - } 515 - var enrollReq EnrollRequest 516 - if err := json.Unmarshal(body, &enrollReq); err != nil { 517 - http.Error(w, "invalid JSON body", http.StatusBadRequest) 518 - return 519 - } 520 - if enrollReq.Token == "" { 521 - http.Error(w, "token field required", http.StatusBadRequest) 522 - return 523 - } 524 - 525 - pending, err := a.store.GetPendingEnrollment(r.Context(), enrollReq.Token) 526 - if err != nil { 527 - log.Printf("admin.enroll: token_lookup_error=%v", err) 528 - http.Error(w, "internal error", http.StatusInternalServerError) 529 - return 530 - } 531 - if pending == nil { 532 - // Don't distinguish "never existed" from "already consumed" 533 - // to avoid leaking enrollment state to callers. 
534 - http.Error(w, "token not found or already used", http.StatusNotFound) 535 - return 536 - } 537 - if time.Now().UTC().After(pending.ExpiresAt) { 538 - // 410 Gone signals "the thing you're pointing at existed but 539 - // is no longer retrievable" — precisely the pending-expired 540 - // semantic. Clean the row so the same token can't be retried. 541 - _ = a.store.DeletePendingEnrollment(r.Context(), enrollReq.Token) 542 - http.Error(w, "enrollment token expired — start over", http.StatusGone) 543 - return 544 - } 545 - 546 - // DNS verification. The verifier is responsible for distinguishing 547 - // "record absent" from "record present but wrong token" from 548 - // "transient DNS error" — we surface each with a different status 549 - // so the UI can render appropriate guidance. 550 - if err := a.domainVerifier.Verify(r.Context(), pending.Domain, enrollReq.Token); err != nil { 551 - log.Printf("admin.enroll: did=%s domain=%s dns_verify_error=%v", pending.DID, pending.Domain, err) 552 - switch { 553 - case errors.Is(err, enroll.ErrNoTXTRecord): 554 - http.Error(w, "no atmos-verify TXT record found at _atmos-enroll."+pending.Domain+" — publish the record and retry", http.StatusForbidden) 555 - case errors.Is(err, enroll.ErrTokenMismatch): 556 - http.Error(w, "TXT record does not contain the expected token — double-check the value", http.StatusForbidden) 557 - default: 558 - // Transient (SERVFAIL, timeout, etc.) — tell the caller to retry. 559 - http.Error(w, fmt.Sprintf("DNS lookup failed: %v — retry in a moment", err), http.StatusServiceUnavailable) 560 - } 547 + token, forwardTo, herr := validateEnrollRequest(r) 548 + if herr != nil { 549 + http.Error(w, herr.Message, herr.Status) 561 550 return 562 551 } 563 552 564 - did := pending.DID 565 - domain := pending.Domain 566 - log.Printf("admin.enroll: did=%s domain=%s dns_verified=true", did, domain) 567 - 568 - // Consume the pending row now that verification succeeded. Don't 569 - // fail the enrollment if cleanup errors — CleanExpired will sweep 570 - // it later and the unique-domain constraint prevents reuse. 571 - if err := a.store.DeletePendingEnrollment(r.Context(), enrollReq.Token); err != nil { 572 - log.Printf("admin.enroll: did=%s domain=%s pending_cleanup_error=%v", did, domain, err) 573 - } 574 - 575 - // Check for existing domain (must be unique across all members) 576 - existingDomain, err := a.store.GetMemberDomain(r.Context(), domain) 577 - if err != nil { 578 - log.Printf("admin.enroll: did=%s error=%v", did, err) 579 - http.Error(w, "internal error", http.StatusInternalServerError) 580 - return 581 - } 582 - if existingDomain != nil { 583 - if existingDomain.DID == did { 584 - http.Error(w, "You've already enrolled this domain. Sign in at /account to manage it.", http.StatusConflict) 585 - } else { 586 - http.Error(w, "This domain is registered to another account.", http.StatusConflict) 587 - } 553 + pending, herr := a.loadAndVerifyPending(r.Context(), r, token) 554 + if herr != nil { 555 + http.Error(w, herr.Message, herr.Status) 588 556 return 589 557 } 590 558 591 - // Defense-in-depth domain limit check. The primary check lives in 592 - // handleEnrollStart, but a second enrollment could complete between 593 - // start and verify if the DID raced to acquire domains via another 594 - // browser tab or API caller. 
595 - existingDomains, err := a.store.ListMemberDomains(r.Context(), did) 596 - if err != nil { 597 - log.Printf("admin.enroll: did=%s list_domains_error=%v", did, err) 598 - http.Error(w, "internal error", http.StatusInternalServerError) 599 - return 600 - } 601 - if len(existingDomains) >= maxDomainsPerMember { 602 - http.Error(w, fmt.Sprintf("domain limit reached — your account currently supports up to %d sending domains", maxDomainsPerMember), http.StatusConflict) 559 + if herr := a.checkDomainAvailable(r.Context(), pending.DID, pending.Domain); herr != nil { 560 + http.Error(w, herr.Message, herr.Status) 603 561 return 604 562 } 605 563 606 - // Check if the DID already exists — if so, add domain to existing member 607 - existing, err := a.store.GetMember(r.Context(), did) 608 - if err != nil { 609 - log.Printf("admin.enroll: did=%s error=%v", did, err) 610 - http.Error(w, "internal error", http.StatusInternalServerError) 564 + result, herr := a.provisionMemberAndDomain(r.Context(), pending, forwardTo) 565 + if herr != nil { 566 + http.Error(w, herr.Message, herr.Status) 611 567 return 612 568 } 613 569 614 - // Generate API key 615 - apiKey, err := relay.GenerateAPIKey() 616 - if err != nil { 617 - log.Printf("admin.enroll: did=%s error=generate_api_key %v", did, err) 570 + if err := a.store.EnrollMember(r.Context(), result.Member, result.Domain); err != nil { 571 + log.Printf("admin.enroll: did=%s domain=%s error=enroll %v", pending.DID, pending.Domain, err) 618 572 http.Error(w, "internal error", http.StatusInternalServerError) 619 573 return 620 574 } 621 - apiKeyHash, err := relay.HashAPIKey(apiKey) 622 - if err != nil { 623 - log.Printf("admin.enroll: did=%s error=hash_api_key %v", did, err) 624 - http.Error(w, "internal error", http.StatusInternalServerError) 625 - return 626 - } 627 - 628 - // Generate DKIM keypair 629 - selector := fmt.Sprintf("atmos%s", time.Now().UTC().Format("20060102")) 630 - dkimKeys, err := relay.GenerateDKIMKeys(selector) 631 - if err != nil { 632 - log.Printf("admin.enroll: did=%s error=generate_dkim %v", did, err) 633 - http.Error(w, "internal error", http.StatusInternalServerError) 634 - return 635 - } 636 - 637 - // Serialize private keys for storage 638 - rsaBytes, err := x509.MarshalPKCS8PrivateKey(dkimKeys.RSAPriv) 639 - if err != nil { 640 - log.Printf("admin.enroll: did=%s error=marshal_rsa %v", did, err) 641 - http.Error(w, "internal error", http.StatusInternalServerError) 642 - return 643 - } 644 - edBytes, err := x509.MarshalPKCS8PrivateKey(dkimKeys.EdPriv) 645 - if err != nil { 646 - log.Printf("admin.enroll: did=%s error=marshal_ed %v", did, err) 647 - http.Error(w, "internal error", http.StatusInternalServerError) 648 - return 649 - } 650 - 651 - now := time.Now().UTC() 575 + log.Printf("admin.enroll: did=%s domain=%s selector=%s new_did=%v", pending.DID, pending.Domain, result.DKIMSelector, result.IsNewDID) 652 576 653 - // Build member record only if this is a new DID. All enrollments start 654 - // in pending status and cannot SMTP AUTH until an operator approves 655 - // via /admin/member/{did}/approve. DIDVerified stays false until 656 - // Phase 2 atproto OAuth; DNS TXT ownership of the domain is a separate 657 - // signal from DID ownership. 
658 - var memberRecord *relaystore.Member 659 - if existing == nil { 660 - if !pending.TermsAccepted { 661 - http.Error(w, "terms acceptance required", http.StatusBadRequest) 662 - return 663 - } 664 - memberRecord = &relaystore.Member{ 665 - DID: did, 666 - Status: relaystore.StatusPending, 667 - DIDVerified: false, 668 - TermsAcceptedAt: now, 669 - TermsVersion: relaystore.CurrentTermsVersion, 670 - HourlyLimit: 100, 671 - DailyLimit: 1000, 672 - CreatedAt: now, 673 - UpdatedAt: now, 674 - } 675 - } 577 + a.dispatchEnrollNotifications(pending, result.IsNewDID) 676 578 677 - domainRecord := &relaystore.MemberDomain{ 678 - Domain: domain, 679 - DID: did, 680 - APIKeyHash: apiKeyHash, 681 - DKIMRSAPriv: rsaBytes, 682 - DKIMEdPriv: edBytes, 683 - DKIMSelector: selector, 684 - ForwardTo: forwardTo, 685 - ContactEmail: pending.ContactEmail, 686 - CreatedAt: now, 687 - } 688 - 689 - // Atomic insert: member (if new) + domain in one transaction 690 - if err := a.store.EnrollMember(r.Context(), memberRecord, domainRecord); err != nil { 691 - log.Printf("admin.enroll: did=%s domain=%s error=enroll %v", did, domain, err) 692 - http.Error(w, "internal error", http.StatusInternalServerError) 693 - return 694 - } 695 - 696 - log.Printf("admin.enroll: did=%s domain=%s selector=%s new_did=%v", did, domain, selector, existing == nil) 697 - 698 - // Fire the operator-ping email only for truly-new DIDs — re-sending the 699 - // ping every time an approved member adds a domain would train operators 700 - // to ignore it. The webhook notification fires for both cases (different 701 - // kinds) so the operator has visibility without email fatigue. 702 - if existing == nil { 703 - go a.FireOperatorPing(context.Background(), did, domain, pending.ContactEmail) 704 - a.notifyEvent(notify.KindMemberPending, did, domain, "", pending.ContactEmail) 705 - } else { 706 - a.notifyEvent(notify.KindMemberDomainAdded, did, domain, "", pending.ContactEmail) 707 - } 708 - 709 - // Trigger contact email verification if a contact_email was provided. 710 - // Runs in a goroutine so a slow DB/enqueue doesn't block the HTTP response. 711 - if pending.ContactEmail != "" { 712 - go a.TriggerEmailVerification(context.Background(), domain, pending.ContactEmail) 713 - } 714 - 715 - // Check SPF alignment if checker is configured 716 - var spfResult *SPFAlignmentResponse 717 - if a.spfChecker != nil { 718 - result := a.spfChecker.CheckAlignment(r.Context(), domain) 719 - spfResult = &SPFAlignmentResponse{ 720 - Aligned: result.Aligned, 721 - Failures: result.Failures, 722 - } 723 - if !result.Aligned { 724 - log.Printf("admin.enroll.spf_warning: did=%s domain=%s failures=%v", did, domain, result.Failures) 725 - } 726 - } 727 - 728 - resp := EnrollResponse{ 729 - DID: did, 730 - APIKey: apiKey, 731 - DKIM: DKIMResponse{ 732 - Selector: selector, 733 - RSASelector: dkimKeys.RSASelectorName(), 734 - EdSelector: dkimKeys.EdSelectorName(), 735 - RSARecord: dkimKeys.RSADNSRecord(), 736 - EdRecord: dkimKeys.EdDNSRecord(), 737 - RSADNSName: fmt.Sprintf("%s._domainkey.%s", dkimKeys.RSASelectorName(), domain), 738 - EdDNSName: fmt.Sprintf("%s._domainkey.%s", dkimKeys.EdSelectorName(), domain), 739 - }, 740 - SMTP: SMTPResponse{ 741 - Host: "smtp." 
+ a.domain, 742 - Port: 587, 743 - }, 744 - SPFAlignment: spfResult, 745 - apiKeyHash: apiKeyHash, 746 - } 747 - 579 + resp := a.buildEnrollResponse(r.Context(), result, pending.Domain) 748 580 w.Header().Set("Content-Type", "application/json") 749 581 json.NewEncoder(w).Encode(resp) 750 582 } ··· 1082 914 1083 915 // --- Label bypass --- 1084 916 917 + // bypassDefaultTTL is the expiry applied when the request omits ttl_hours. 918 + // 24h matches the typical "let me look at this in the morning" cadence 919 + // for an operator who just disabled enforcement to investigate an 920 + // outage. Anything longer should be deliberate and explicit. 921 + const bypassDefaultTTL = 24 * time.Hour 922 + 923 + // bypassMaxTTL caps the expiry to prevent operator-token compromise 924 + // from creating a permanent T&S bypass. 30 days is long enough for 925 + // legitimate investigation cycles but short enough that an attacker 926 + // can't hide indefinitely. 927 + const bypassMaxTTL = 30 * 24 * time.Hour 928 + 1085 929 func (a *API) handleBypassAdd(w http.ResponseWriter, r *http.Request, did string) { 1086 930 if a.labelChecker == nil { 1087 931 http.Error(w, "label checker not configured", http.StatusInternalServerError) 1088 932 return 1089 933 } 934 + // Optional JSON body: {"ttl_hours": int, "reason": string}. 935 + // Empty body falls back to defaults. 936 + var req struct { 937 + TTLHours int `json:"ttl_hours"` 938 + Reason string `json:"reason"` 939 + } 940 + if r.ContentLength > 0 { 941 + if err := json.NewDecoder(io.LimitReader(r.Body, 4096)).Decode(&req); err != nil { 942 + http.Error(w, "invalid JSON body", http.StatusBadRequest) 943 + return 944 + } 945 + } 946 + ttl := bypassDefaultTTL 947 + if req.TTLHours > 0 { 948 + ttl = time.Duration(req.TTLHours) * time.Hour 949 + } 950 + if ttl > bypassMaxTTL { 951 + http.Error(w, fmt.Sprintf("ttl_hours capped at %d (30d) to prevent permanent T&S disable", int(bypassMaxTTL.Hours())), http.StatusBadRequest) 952 + return 953 + } 954 + expiresAt := time.Now().UTC().Add(ttl) 955 + reason := strings.TrimSpace(req.Reason) 1090 956 a.labelChecker.AddBypassDID(did) 1091 - if err := a.store.InsertBypassDID(r.Context(), did); err != nil { 957 + if err := a.store.InsertBypassDID(r.Context(), did, expiresAt, reason); err != nil { 1092 958 log.Printf("admin.bypass_add: did=%s persist_error=%v", did, err) 959 + http.Error(w, "internal error", http.StatusInternalServerError) 960 + return 1093 961 } 1094 - log.Printf("admin.bypass_add: did=%s", did) 962 + log.Printf("admin.bypass_add: did=%s expires_at=%s reason=%q", did, expiresAt.Format(time.RFC3339), reason) 963 + a.notifyEvent(notify.KindBypassAdded, did, "", reason, "") 1095 964 w.Header().Set("Content-Type", "application/json") 1096 - json.NewEncoder(w).Encode(map[string]string{"status": "bypassed", "did": did}) 965 + json.NewEncoder(w).Encode(map[string]string{ 966 + "status": "bypassed", 967 + "did": did, 968 + "expires_at": expiresAt.Format(time.RFC3339), 969 + "reason": reason, 970 + }) 1097 971 } 1098 972 1099 973 func (a *API) handleBypassRemove(w http.ResponseWriter, r *http.Request, did string) { ··· 1102 976 return 1103 977 } 1104 978 a.labelChecker.RemoveBypassDID(did) 1105 - if err := a.store.DeleteBypassDID(r.Context(), did); err != nil { 979 + if err := a.store.DeleteBypassDID(r.Context(), did, "manual"); err != nil { 1106 980 log.Printf("admin.bypass_remove: did=%s persist_error=%v", did, err) 981 + http.Error(w, "internal error", http.StatusInternalServerError) 982 + return 1107 983 } 1108 984 
log.Printf("admin.bypass_remove: did=%s", did) 985 + a.notifyEvent(notify.KindBypassRemoved, did, "", "manual", "") 1109 986 w.Header().Set("Content-Type", "application/json") 1110 987 json.NewEncoder(w).Encode(map[string]string{"status": "removed", "did": did}) 1111 988 } ··· 1439 1316 func generateEmailVerifyToken() (string, error) { 1440 1317 var b [32]byte 1441 1318 if _, err := rand.Read(b[:]); err != nil { 1442 - return "", fmt.Errorf("generate verify token: %v", err) 1319 + return "", fmt.Errorf("generate verify token: %w", err) 1443 1320 } 1444 1321 return hex.EncodeToString(b[:]), nil 1445 1322 } ··· 1502 1379 </head> 1503 1380 <body><h1>%s</h1><p>%s</p></body> 1504 1381 </html>`, title, title, message) 1382 + } 1383 + 1384 + // handleMemberSelfDeliverability returns a member's own deliverability 1385 + // metrics: sends, bounces, complaints, daily sparkline, and reputation 1386 + // labels. API-key-authenticated, same model as /member/status. 1387 + func (a *API) handleMemberSelfDeliverability(w http.ResponseWriter, r *http.Request) { 1388 + if r.Method != http.MethodGet { 1389 + http.Error(w, `{"error":"method not allowed"}`, http.StatusMethodNotAllowed) 1390 + return 1391 + } 1392 + 1393 + // Authenticate: DID in query + API key in Authorization header. 1394 + did := r.URL.Query().Get("did") 1395 + if did == "" { 1396 + http.Error(w, `{"error":"did query parameter required"}`, http.StatusBadRequest) 1397 + return 1398 + } 1399 + if !validDID.MatchString(did) { 1400 + http.Error(w, `{"error":"invalid DID format"}`, http.StatusBadRequest) 1401 + return 1402 + } 1403 + 1404 + apiKey := "" 1405 + if auth := r.Header.Get("Authorization"); strings.HasPrefix(auth, "Bearer ") { 1406 + apiKey = strings.TrimPrefix(auth, "Bearer ") 1407 + } 1408 + if apiKey == "" { 1409 + http.Error(w, `{"error":"Authorization: Bearer <api_key> header required"}`, http.StatusUnauthorized) 1410 + return 1411 + } 1412 + 1413 + member, domains, err := a.store.GetMemberWithDomains(r.Context(), did) 1414 + if err != nil { 1415 + log.Printf("member.deliverability: did=%s error=%v", did, err) 1416 + http.Error(w, `{"error":"internal error"}`, http.StatusInternalServerError) 1417 + return 1418 + } 1419 + if member == nil { 1420 + equalizeBcryptTiming(apiKey) 1421 + http.Error(w, `{"error":"authentication failed"}`, http.StatusUnauthorized) 1422 + return 1423 + } 1424 + 1425 + authenticated := false 1426 + for _, d := range domains { 1427 + if relay.VerifyAPIKey(apiKey, d.APIKeyHash) { 1428 + authenticated = true 1429 + break 1430 + } 1431 + } 1432 + if !authenticated { 1433 + http.Error(w, `{"error":"authentication failed"}`, http.StatusUnauthorized) 1434 + return 1435 + } 1436 + 1437 + ctx := r.Context() 1438 + since14d := time.Now().UTC().AddDate(0, 0, -14) 1439 + 1440 + total, bounced, err := a.store.GetMessageCounts(ctx, did, since14d) 1441 + if err != nil { 1442 + log.Printf("member.deliverability: GetMessageCounts did=%s error=%v", did, err) 1443 + http.Error(w, `{"error":"internal error"}`, http.StatusInternalServerError) 1444 + return 1445 + } 1446 + 1447 + complaints, err := a.store.GetComplaintCount(ctx, did, since14d) 1448 + if err != nil { 1449 + log.Printf("member.deliverability: GetComplaintCount did=%s error=%v", did, err) 1450 + http.Error(w, `{"error":"internal error"}`, http.StatusInternalServerError) 1451 + return 1452 + } 1453 + 1454 + daily, err := a.store.GetDailySendCounts(ctx, did, 14) 1455 + if err != nil { 1456 + log.Printf("member.deliverability: GetDailySendCounts did=%s error=%v", 
did, err) 1457 + http.Error(w, `{"error":"internal error"}`, http.StatusInternalServerError) 1458 + return 1459 + } 1460 + 1461 + // Fetch labels (best-effort) 1462 + var labels []string 1463 + if a.labelChecker != nil { 1464 + labels, _ = a.labelChecker.QueryLabels(ctx, did) 1465 + } 1466 + 1467 + w.Header().Set("Content-Type", "application/json") 1468 + json.NewEncoder(w).Encode(struct { 1469 + DID string `json:"did"` 1470 + Status string `json:"status"` 1471 + Sent14d int64 `json:"sent_14d"` 1472 + Bounced14d int64 `json:"bounced_14d"` 1473 + Complaints14d int64 `json:"complaints_14d"` 1474 + BounceRate float64 `json:"bounce_rate"` 1475 + DailySends []int64 `json:"daily_sends"` 1476 + HourlyLimit int `json:"hourly_limit"` 1477 + DailyLimit int `json:"daily_limit"` 1478 + Labels []string `json:"labels"` 1479 + }{ 1480 + DID: did, 1481 + Status: member.Status, 1482 + Sent14d: total, 1483 + Bounced14d: bounced, 1484 + Complaints14d: complaints, 1485 + BounceRate: safeBounceRate(total, bounced), 1486 + DailySends: daily, 1487 + HourlyLimit: member.HourlyLimit, 1488 + DailyLimit: member.DailyLimit, 1489 + Labels: labels, 1490 + }) 1491 + } 1492 + 1493 + func safeBounceRate(total, bounced int64) float64 { 1494 + if total == 0 { 1495 + return 0.0 1496 + } 1497 + return float64(bounced) / float64(total) 1505 1498 } 1506 1499 1507 1500 // handleMemberSendVerification is the admin endpoint POST
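For operators, the reshaped bypass endpoint now takes an optional JSON body. A sketch of both calls, using the route shape exercised by the tests below; the host and DID are placeholders and the bearer value is the relay admin token:

    # 4-hour bypass with an audit reason (no body = 24h default, hard cap 30d).
    curl -s -X POST "http://<admin-host>:8080/admin/member/did:plc:EXAMPLE/bypass-labels" \
      -H "Authorization: Bearer $ADMIN_TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"ttl_hours": 4, "reason": "investigating sender flood"}'

    # Manual removal; lands a remove/manual audit row.
    curl -s -X DELETE "http://<admin-host>:8080/admin/member/did:plc:EXAMPLE/bypass-labels" \
      -H "Authorization: Bearer $ADMIN_TOKEN"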
+88
internal/admin/api_test.go
··· 1119 1119 } 1120 1120 } 1121 1121 1122 + func deliverabilityReq(did, apiKey string) *http.Request { 1123 + req := httptest.NewRequest("GET", "/member/deliverability?did="+did, nil) 1124 + if apiKey != "" { 1125 + req.Header.Set("Authorization", "Bearer "+apiKey) 1126 + } 1127 + return req 1128 + } 1129 + 1130 + func TestSelfDeliverabilitySuccess(t *testing.T) { 1131 + api, store := testAdminAPI(t) 1132 + did := "did:plc:deliveraaaaaaaaaaaaaaaaa" 1133 + apiKey := enrollWithAPIKey(t, store, did, "example.com") 1134 + 1135 + // Seed some messages 1136 + ctx := context.Background() 1137 + now := time.Now().UTC() 1138 + for i := 0; i < 5; i++ { 1139 + _, _ = store.InsertMessage(ctx, &relaystore.Message{ 1140 + MemberDID: did, FromAddr: "x@example.com", ToAddr: "y@z.com", 1141 + MessageID: fmt.Sprintf("<m%d>", i), Status: relaystore.MsgSent, CreatedAt: now, 1142 + }) 1143 + } 1144 + _, _ = store.InsertMessage(ctx, &relaystore.Message{ 1145 + MemberDID: did, FromAddr: "x@example.com", ToAddr: "y@z.com", 1146 + MessageID: "<b1>", Status: relaystore.MsgBounced, CreatedAt: now, 1147 + }) 1148 + _, _ = store.InsertFeedbackEvent(ctx, &relaystore.FeedbackEvent{ 1149 + MemberDID: did, EventType: "complaint", CreatedAt: now, 1150 + }) 1151 + 1152 + w := httptest.NewRecorder() 1153 + api.ServeHTTP(w, deliverabilityReq(did, apiKey)) 1154 + 1155 + if w.Code != http.StatusOK { 1156 + t.Fatalf("status = %d, want 200; body: %s", w.Code, w.Body.String()) 1157 + } 1158 + 1159 + var resp struct { 1160 + Sent14d int64 `json:"sent_14d"` 1161 + Bounced14d int64 `json:"bounced_14d"` 1162 + Complaints14d int64 `json:"complaints_14d"` 1163 + BounceRate float64 `json:"bounce_rate"` 1164 + DailySends []int64 `json:"daily_sends"` 1165 + } 1166 + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { 1167 + t.Fatal(err) 1168 + } 1169 + if resp.Sent14d != 6 { 1170 + t.Errorf("Sent14d = %d, want 6 (5 sent + 1 bounced)", resp.Sent14d) 1171 + } 1172 + if resp.Bounced14d != 1 { 1173 + t.Errorf("Bounced14d = %d, want 1", resp.Bounced14d) 1174 + } 1175 + if resp.Complaints14d != 1 { 1176 + t.Errorf("Complaints14d = %d, want 1", resp.Complaints14d) 1177 + } 1178 + if resp.BounceRate != 1.0/6.0 { 1179 + t.Errorf("BounceRate = %f, want %f", resp.BounceRate, 1.0/6.0) 1180 + } 1181 + if len(resp.DailySends) != 14 { 1182 + t.Errorf("DailySends len = %d, want 14", len(resp.DailySends)) 1183 + } 1184 + } 1185 + 1186 + func TestSelfDeliverabilityBadAPIKey(t *testing.T) { 1187 + api, store := testAdminAPI(t) 1188 + did := "did:plc:deliverbbbbbbbbbbbbbbbbb" 1189 + enrollWithAPIKey(t, store, did, "example.com") 1190 + 1191 + w := httptest.NewRecorder() 1192 + api.ServeHTTP(w, deliverabilityReq(did, "wrong-key")) 1193 + 1194 + if w.Code != http.StatusUnauthorized { 1195 + t.Errorf("status = %d, want 401", w.Code) 1196 + } 1197 + } 1198 + 1199 + func TestSelfDeliverabilityMissingAuth(t *testing.T) { 1200 + api, _ := testAdminAPI(t) 1201 + 1202 + req := httptest.NewRequest("GET", "/member/deliverability?did=did:plc:aaaaaaaaaaaaaaaaaaaaaaaa", nil) 1203 + w := httptest.NewRecorder() 1204 + api.ServeHTTP(w, req) 1205 + if w.Code != http.StatusUnauthorized { 1206 + t.Errorf("status = %d, want 401", w.Code) 1207 + } 1208 + } 1209 + 1122 1210 // --- forward_to admin endpoints --- 1123 1211 1124 1212 func TestAdminDomainForwardToSet_RequiresAdminAuth(t *testing.T) {
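Member-side, the new deliverability endpoint uses the same auth shape as /member/status: DID in the query string, a domain API key as the bearer. A sketch; the host placeholder depends on where the deployment exposes the member API:

    curl -s "https://<relay-host>/member/deliverability?did=did:plc:EXAMPLE" \
      -H "Authorization: Bearer $ATMOS_API_KEY" | jq '.bounce_rate, .daily_sends'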
+187
internal/admin/bypass_audit_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package admin 4 + 5 + import ( 6 + "bytes" 7 + "context" 8 + "encoding/json" 9 + "fmt" 10 + "net/http" 11 + "net/http/httptest" 12 + "strings" 13 + "testing" 14 + "time" 15 + 16 + "atmosphere-mail/internal/relay" 17 + "atmosphere-mail/internal/relaystore" 18 + ) 19 + 20 + func newBypassAPI(t *testing.T) (*API, *relay.LabelChecker, *relaystore.Store) { 21 + t.Helper() 22 + store, err := relaystore.New(":memory:") 23 + if err != nil { 24 + t.Fatal(err) 25 + } 26 + t.Cleanup(func() { store.Close() }) 27 + lc := relay.NewLabelChecker("http://127.0.0.1:1", nil) 28 + api := NewWithLabelChecker(store, "tok", "atmos.email", lc) 29 + return api, lc, store 30 + } 31 + 32 + func bypassAddReq(t *testing.T, api *API, did string, body any) *httptest.ResponseRecorder { 33 + t.Helper() 34 + var buf []byte 35 + if body != nil { 36 + var err error 37 + buf, err = json.Marshal(body) 38 + if err != nil { 39 + t.Fatal(err) 40 + } 41 + } 42 + req := httptest.NewRequest("POST", "/admin/member/"+did+"/bypass-labels", bytes.NewReader(buf)) 43 + if buf != nil { 44 + req.Header.Set("Content-Type", "application/json") 45 + } 46 + req.Header.Set("Authorization", "Bearer tok") 47 + w := httptest.NewRecorder() 48 + api.ServeHTTP(w, req) 49 + return w 50 + } 51 + 52 + // TestBypassAdd_DefaultTTLApplied confirms a bare add (no body) lands 53 + // with the 24h default expiry rather than no expiry. 54 + func TestBypassAdd_DefaultTTLApplied(t *testing.T) { 55 + api, _, store := newBypassAPI(t) 56 + did := "did:plc:aaaaaaaabbbbbbbbcccccccc" 57 + 58 + w := bypassAddReq(t, api, did, nil) 59 + if w.Code != http.StatusOK { 60 + t.Fatalf("status = %d body=%s", w.Code, w.Body.String()) 61 + } 62 + var resp map[string]string 63 + _ = json.Unmarshal(w.Body.Bytes(), &resp) 64 + exp, err := time.Parse(time.RFC3339, resp["expires_at"]) 65 + if err != nil { 66 + t.Fatalf("parse expires_at %q: %v", resp["expires_at"], err) 67 + } 68 + dt := time.Until(exp) 69 + if dt < 23*time.Hour || dt > 25*time.Hour { 70 + t.Errorf("default TTL = %s, want ~24h", dt) 71 + } 72 + 73 + // Persisted in store with that expiry. 74 + listed, _ := store.ListBypassDIDs(context.Background()) 75 + if len(listed) != 1 || listed[0] != did { 76 + t.Errorf("ListBypassDIDs = %v, want [%s]", listed, did) 77 + } 78 + } 79 + 80 + // TestBypassAdd_RejectsTTLOverCap pins the security cap. 81 + func TestBypassAdd_RejectsTTLOverCap(t *testing.T) { 82 + api, _, _ := newBypassAPI(t) 83 + w := bypassAddReq(t, api, "did:plc:bbbbbbbbccccccccdddddddd", 84 + map[string]any{"ttl_hours": 24*30 + 1, "reason": "dangerous"}) 85 + if w.Code != http.StatusBadRequest { 86 + t.Fatalf("status = %d, want 400; body=%s", w.Code, w.Body.String()) 87 + } 88 + if !strings.Contains(strings.ToLower(w.Body.String()), "ttl_hours") { 89 + t.Errorf("error message should mention ttl_hours; got %q", w.Body.String()) 90 + } 91 + } 92 + 93 + // TestBypassAdd_PersistsReason — the reason string round-trips. 
94 + func TestBypassAdd_PersistsReason(t *testing.T) {
95 + api, _, store := newBypassAPI(t)
96 + did := "did:plc:ccccccccddddddddeeeeeeee"
97 + w := bypassAddReq(t, api, did, map[string]any{"ttl_hours": 1, "reason": "investigating sender flood"})
98 + if w.Code != http.StatusOK {
99 + t.Fatalf("status = %d body=%s", w.Code, w.Body.String())
100 + }
101 + // Read the reason back through the store rather than trusting the
102 + // handler's echo in the HTTP response.
103 + row := storeRowReason(t, store, did)
104 + if row != "investigating sender flood" {
105 + t.Errorf("persisted reason = %q, want %q", row, "investigating sender flood")
106 + }
107 + }
108 +
109 + // storeRowReason asserts the DID is in the active bypass set, then
110 + // returns the persisted reason without exposing a Store getter.
111 + func storeRowReason(t *testing.T, s *relaystore.Store, did string) string {
112 + t.Helper()
113 + // The Store has no bypass-row getter, so confirm membership in the
114 + // active set via ListBypassDIDs first, then read the reason from
115 + // the 'add' audit row through the test-only accessor. The audit
116 + // table is the source of truth for the reason string; the active
117 + // set only tracks which DIDs are currently bypassed.
118 + rows, err := s.ListBypassDIDs(context.Background())
119 + if err != nil || len(rows) == 0 {
120 + t.Fatalf("expected at least one row, got %v err=%v", rows, err)
121 + }
122 + for _, d := range rows {
123 + if d == did {
124 + return persistedReasonFromAudit(t, s, did)
125 + }
126 + }
127 + return ""
128 + }
129 +
130 + // persistedReasonFromAudit returns the reason from the first 'add'
131 + // audit row for the given DID. The Store doesn't expose the audit
132 + // table publicly, so the lookup goes through the test-only accessor
133 + // to keep the assertion legible.
134 + func persistedReasonFromAudit(t *testing.T, s *relaystore.Store, did string) string {
135 + t.Helper()
136 + rows, err := s.ListBypassAuditForTest(context.Background(), did)
137 + if err != nil {
138 + t.Fatalf("audit query: %v", err)
139 + }
140 + for _, e := range rows {
141 + if e.Action == "add" {
142 + return e.Reason
143 + }
144 + }
145 + return ""
146 + }
147 +
148 + // TestBypassRemove_WritesAuditRow confirms a manual removal lands a
149 + // 'remove'/'manual' audit row so post-hoc analysis can distinguish
150 + // it from janitor-driven 'expired' removals.
151 + func TestBypassRemove_WritesAuditRow(t *testing.T) {
152 + api, _, store := newBypassAPI(t)
153 + did := "did:plc:ddddddddeeeeeeeeffffffff"
154 + if w := bypassAddReq(t, api, did, map[string]any{"ttl_hours": 1}); w.Code != http.StatusOK {
155 + t.Fatalf("add: %d %s", w.Code, w.Body.String())
156 + }
157 + req := httptest.NewRequest("DELETE", "/admin/member/"+did+"/bypass-labels", nil)
158 + req.Header.Set("Authorization", "Bearer tok")
159 + w := httptest.NewRecorder()
160 + api.ServeHTTP(w, req)
161 + if w.Code != http.StatusOK {
162 + t.Fatalf("remove: %d %s", w.Code, w.Body.String())
163 + }
164 + rows, err := store.ListBypassAuditForTest(context.Background(), did)
165 + if err != nil {
166 + t.Fatal(err)
167 + }
168 + var sawRemove bool
169 + for _, e := range rows {
170 + if e.Action == "remove" && e.Reason == "manual" {
171 + sawRemove = true
172 + }
173 + }
174 + if !sawRemove {
175 + t.Errorf("expected audit row action=remove reason=manual, got %+v", rows)
176 + }
177 + }
178 +
179 + // Compile-time guard: ensure *relaystore.Store keeps the test-only
180 + // ListBypassAuditForTest accessor the assertions above depend on.
181 + // Taking the method value (no call, so the nil receiver is harmless)
182 + // makes the build fail the moment the accessor is removed.
183 + var _ = (*relaystore.Store)(nil).ListBypassAuditForTest
184 +
185 + // fmt is referenced only on the next line; it keeps the import block
186 + // satisfied until a helper actually needs Sprintf. Drop both together.
187 + var _ = fmt.Sprintf
+187
internal/admin/enroll_oauth_gate_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package admin 4 + 5 + import ( 6 + "bytes" 7 + "encoding/json" 8 + "net/http" 9 + "net/http/httptest" 10 + "strings" 11 + "testing" 12 + ) 13 + 14 + // fakeAuthVerifier returns a hard-coded verified DID when a "verified" 15 + // cookie is present, simulating the EnrollHandler's OAuth ticket 16 + // lookup without dragging the UI package into the test boundary. 17 + type fakeAuthVerifier struct { 18 + verifiedDID string 19 + } 20 + 21 + func (f *fakeAuthVerifier) VerifyAuthCookie(r *http.Request) (string, bool) { 22 + if r == nil || f == nil { 23 + return "", false 24 + } 25 + if c, _ := r.Cookie("verified"); c != nil && c.Value == "yes" { 26 + return f.verifiedDID, true 27 + } 28 + return "", false 29 + } 30 + 31 + // TestEnrollStart_OAuthGate_RejectsMissingCookie pins #207: when an 32 + // OAuth verifier is wired, /admin/enroll-start must refuse a request 33 + // that does not present a verified-DID cookie. 34 + func TestEnrollStart_OAuthGate_RejectsMissingCookie(t *testing.T) { 35 + api, _, _ := testEnrollAPI(t) 36 + api.SetEnrollAuthVerifier(&fakeAuthVerifier{verifiedDID: "did:plc:aaaaaaaabbbbbbbbcccccccc"}) 37 + 38 + body, _ := json.Marshal(EnrollStartRequest{ 39 + DID: "did:plc:aaaaaaaabbbbbbbbcccccccc", Domain: "ok.example", TermsAccepted: true, 40 + }) 41 + req := httptest.NewRequest(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body)) 42 + // No cookie set. 43 + w := httptest.NewRecorder() 44 + api.ServeHTTP(w, req) 45 + if w.Code != http.StatusForbidden { 46 + t.Fatalf("expected 403 without cookie, got %d body=%s", w.Code, w.Body.String()) 47 + } 48 + if !strings.Contains(strings.ToLower(w.Body.String()), "identity verification") { 49 + t.Errorf("body should mention identity verification, got %q", w.Body.String()) 50 + } 51 + } 52 + 53 + // TestEnrollStart_OAuthGate_RejectsDIDMismatch is the central #207 54 + // scenario: caller proves DID A via OAuth but tries to enroll 55 + // claiming DID B. The mismatch must be refused. 56 + func TestEnrollStart_OAuthGate_RejectsDIDMismatch(t *testing.T) { 57 + api, _, _ := testEnrollAPI(t) 58 + verifier := &fakeAuthVerifier{verifiedDID: "did:plc:bbbbbbbbccccccccdddddddd"} 59 + api.SetEnrollAuthVerifier(verifier) 60 + 61 + // Claimed DID does NOT match the OAuth-verified DID. 62 + body, _ := json.Marshal(EnrollStartRequest{ 63 + DID: "did:plc:zzzzzzzzyyyyyyyyxxxxxxxx", Domain: "ok.example", TermsAccepted: true, 64 + }) 65 + req := httptest.NewRequest(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body)) 66 + req.AddCookie(&http.Cookie{Name: "verified", Value: "yes"}) 67 + w := httptest.NewRecorder() 68 + api.ServeHTTP(w, req) 69 + if w.Code != http.StatusForbidden { 70 + t.Fatalf("expected 403 on DID mismatch, got %d body=%s", w.Code, w.Body.String()) 71 + } 72 + if !strings.Contains(strings.ToLower(w.Body.String()), "does not match") { 73 + t.Errorf("body should mention mismatch, got %q", w.Body.String()) 74 + } 75 + } 76 + 77 + // TestEnrollStart_OAuthGate_AllowsExactMatch confirms the happy path: 78 + // claimed DID == OAuth-verified DID, the start succeeds. 
79 + func TestEnrollStart_OAuthGate_AllowsExactMatch(t *testing.T) { 80 + api, store, _ := testEnrollAPI(t) 81 + did := "did:plc:aaaaaaaabbbbbbbbcccccccc" 82 + api.SetEnrollAuthVerifier(&fakeAuthVerifier{verifiedDID: did}) 83 + 84 + body, _ := json.Marshal(EnrollStartRequest{DID: did, Domain: "ok.example", TermsAccepted: true}) 85 + req := httptest.NewRequest(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body)) 86 + req.AddCookie(&http.Cookie{Name: "verified", Value: "yes"}) 87 + w := httptest.NewRecorder() 88 + api.ServeHTTP(w, req) 89 + if w.Code != http.StatusOK { 90 + t.Fatalf("expected 200 on matching DID, got %d body=%s", w.Code, w.Body.String()) 91 + } 92 + // Pending row must be persisted (existing happy-path invariant). 93 + var resp EnrollStartResponse 94 + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { 95 + t.Fatalf("decode response: %v", err) 96 + } 97 + if resp.Token == "" { 98 + t.Error("expected non-empty token in response") 99 + } 100 + if pending, _ := store.GetPendingEnrollment(req.Context(), resp.Token); pending == nil { 101 + t.Error("pending enrollment row not persisted") 102 + } 103 + } 104 + 105 + // TestEnrollStart_OAuthGate_CaseInsensitiveDIDMatch — DIDs are 106 + // case-insensitive in their method-specific identifier portion for 107 + // did:plc (lowercase base32) but did:web allows mixed case in 108 + // hostnames per RFC 3986. Match must use EqualFold to avoid spurious 109 + // rejection on identity systems that produce mixed-case DIDs. 110 + func TestEnrollStart_OAuthGate_CaseInsensitiveDIDMatch(t *testing.T) { 111 + api, _, _ := testEnrollAPI(t) 112 + api.SetEnrollAuthVerifier(&fakeAuthVerifier{verifiedDID: "did:web:Example.com"}) 113 + 114 + body, _ := json.Marshal(EnrollStartRequest{ 115 + DID: "did:web:example.com", Domain: "ok.example", TermsAccepted: true, 116 + }) 117 + req := httptest.NewRequest(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body)) 118 + req.AddCookie(&http.Cookie{Name: "verified", Value: "yes"}) 119 + w := httptest.NewRecorder() 120 + api.ServeHTTP(w, req) 121 + if w.Code != http.StatusOK { 122 + t.Fatalf("expected 200 with case-folded DID match, got %d body=%s", w.Code, w.Body.String()) 123 + } 124 + } 125 + 126 + // TestEnrollStart_OAuthGate_NilVerifierIsLegacyOpen pins backward- 127 + // compatibility: deployments that haven't wired SetEnrollAuthVerifier 128 + // yet (pre-#207 binaries during rolling deploy) must accept requests 129 + // the same as before. The code path is exercised by every existing 130 + // enroll test, but having a dedicated assertion makes the contract 131 + // explicit so a future refactor can't quietly tighten it. 132 + func TestEnrollStart_OAuthGate_NilVerifierIsLegacyOpen(t *testing.T) { 133 + api, _, _ := testEnrollAPI(t) 134 + // Verifier intentionally NOT set. 
135 + 136 + body, _ := json.Marshal(EnrollStartRequest{ 137 + DID: "did:plc:aaaaaaaabbbbbbbbcccccccc", Domain: "ok.example", TermsAccepted: true, 138 + }) 139 + req := httptest.NewRequest(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body)) 140 + w := httptest.NewRecorder() 141 + api.ServeHTTP(w, req) 142 + if w.Code != http.StatusOK { 143 + t.Fatalf("expected 200 with nil verifier (legacy mode), got %d body=%s", w.Code, w.Body.String()) 144 + } 145 + } 146 + 147 + // TestEnroll_OAuthGate_BlocksDIDSwapAtCompletion exercises the second- 148 + // layer check: even if a pending row was created with a verified DID, 149 + // the /admin/enroll completion step must independently re-verify so a 150 + // stolen token can't be redeemed from a session that does not own the 151 + // pending row's DID. 152 + func TestEnroll_OAuthGate_BlocksDIDSwapAtCompletion(t *testing.T) { 153 + api, store, lk := testEnrollAPI(t) 154 + pendingDID := "did:plc:aaaaaaaabbbbbbbbcccccccc" 155 + domain := "ok.example" 156 + 157 + // Step 1: legitimate user starts enrollment with their verified DID. 158 + api.SetEnrollAuthVerifier(&fakeAuthVerifier{verifiedDID: pendingDID}) 159 + startBody, _ := json.Marshal(EnrollStartRequest{DID: pendingDID, Domain: domain, TermsAccepted: true}) 160 + startReq := httptest.NewRequest(http.MethodPost, "/admin/enroll-start", bytes.NewReader(startBody)) 161 + startReq.AddCookie(&http.Cookie{Name: "verified", Value: "yes"}) 162 + startW := httptest.NewRecorder() 163 + api.ServeHTTP(startW, startReq) 164 + if startW.Code != http.StatusOK { 165 + t.Fatalf("enroll-start: %d %s", startW.Code, startW.Body.String()) 166 + } 167 + var sr EnrollStartResponse 168 + _ = json.Unmarshal(startW.Body.Bytes(), &sr) 169 + lk.records["_atmos-enroll."+domain] = []string{"atmos-verify=" + sr.Token} 170 + 171 + // Step 2: attacker has the token (e.g. captured from DNS) but their 172 + // session is verified as a DIFFERENT DID. The completion must refuse. 173 + api.SetEnrollAuthVerifier(&fakeAuthVerifier{verifiedDID: "did:plc:zzzzzzzzyyyyyyyyxxxxxxxx"}) 174 + completeBody, _ := json.Marshal(EnrollRequest{Token: sr.Token}) 175 + completeReq := httptest.NewRequest(http.MethodPost, "/admin/enroll", bytes.NewReader(completeBody)) 176 + completeReq.AddCookie(&http.Cookie{Name: "verified", Value: "yes"}) 177 + completeW := httptest.NewRecorder() 178 + api.ServeHTTP(completeW, completeReq) 179 + if completeW.Code != http.StatusForbidden { 180 + t.Fatalf("expected 403 on session-DID swap at completion, got %d body=%s", 181 + completeW.Code, completeW.Body.String()) 182 + } 183 + // Member must not exist. 184 + if got, _ := store.GetMember(completeReq.Context(), pendingDID); got != nil { 185 + t.Error("attacker session created a member despite DID-swap rejection") 186 + } 187 + }
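Both layers of the #207 gate (enroll-start and completion) are pinned by these tests; they can be run in isolation from the repo root:

    go test ./internal/admin -run OAuthGate -v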
+330
internal/admin/enroll_phases.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package admin 4 + 5 + import ( 6 + "context" 7 + "crypto/x509" 8 + "encoding/json" 9 + "errors" 10 + "fmt" 11 + "io" 12 + "log" 13 + "net/http" 14 + "strings" 15 + "time" 16 + 17 + "atmosphere-mail/internal/enroll" 18 + "atmosphere-mail/internal/notify" 19 + "atmosphere-mail/internal/relay" 20 + "atmosphere-mail/internal/relaystore" 21 + ) 22 + 23 + // enrollHTTPError is the failure value returned by the enrollment phase 24 + // helpers below. handleEnroll renders it via http.Error and otherwise 25 + // proceeds to the next phase. 26 + // 27 + // Splitting the handler into discrete phases (validate → load+verify → 28 + // authorize → provision → persist → dispatch → respond) makes each step 29 + // individually unit-testable and keeps handleEnroll itself a short 30 + // orchestration function. See #223. 31 + type enrollHTTPError struct { 32 + Status int 33 + Message string 34 + } 35 + 36 + func (e *enrollHTTPError) Error() string { return e.Message } 37 + 38 + func enrollErrf(status int, format string, args ...any) *enrollHTTPError { 39 + return &enrollHTTPError{Status: status, Message: fmt.Sprintf(format, args...)} 40 + } 41 + 42 + // --- Phase 1: validate ------------------------------------------------------ 43 + 44 + // validateEnrollRequest reads the JSON body and query params from the 45 + // public POST /admin/enroll request. Returns the parsed token and the 46 + // optional forward_to address, or an HTTP error to return to the caller. 47 + // 48 + // Body size is capped at 4 KiB; tokens are bounded by the pending row 49 + // they look up so a giant body would be a hostile no-op. 50 + func validateEnrollRequest(r *http.Request) (token, forwardTo string, herr *enrollHTTPError) { 51 + forwardTo = r.URL.Query().Get("forward_to") 52 + if forwardTo != "" && !strings.Contains(forwardTo, "@") { 53 + return "", "", enrollErrf(http.StatusBadRequest, "forward_to must be a valid email address") 54 + } 55 + 56 + body, err := io.ReadAll(io.LimitReader(r.Body, 4096)) 57 + if err != nil { 58 + return "", "", enrollErrf(http.StatusBadRequest, "error reading request body") 59 + } 60 + if len(body) == 0 { 61 + return "", "", enrollErrf(http.StatusBadRequest, "enrollment token required: POST JSON body with {\"token\": \"...\"}") 62 + } 63 + var req EnrollRequest 64 + if err := json.Unmarshal(body, &req); err != nil { 65 + return "", "", enrollErrf(http.StatusBadRequest, "invalid JSON body") 66 + } 67 + if req.Token == "" { 68 + return "", "", enrollErrf(http.StatusBadRequest, "token field required") 69 + } 70 + return req.Token, forwardTo, nil 71 + } 72 + 73 + // --- Phase 2: load + verify ------------------------------------------------- 74 + 75 + // loadAndVerifyPending fetches the pending enrollment by token, runs the 76 + // OAuth-cookie identity gate (#207), enforces the expiry cutoff, and 77 + // re-runs DNS TXT verification. Returns the pending row on success or an 78 + // HTTP error otherwise. 79 + // 80 + // Side effect: deletes the pending row on expiry so the same expired 81 + // token can't be retried. 
82 + func (a *API) loadAndVerifyPending(ctx context.Context, r *http.Request, token string) (*relaystore.PendingEnrollment, *enrollHTTPError) { 83 + pending, err := a.store.GetPendingEnrollment(ctx, token) 84 + if err != nil { 85 + log.Printf("admin.enroll: token_lookup_error=%v", err) 86 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 87 + } 88 + if pending == nil { 89 + // Don't distinguish "never existed" from "already consumed" to 90 + // avoid leaking enrollment state to callers. 91 + return nil, enrollErrf(http.StatusNotFound, "token not found or already used") 92 + } 93 + 94 + // OAuth-verified DID gate, second layer (#207). The pending row was 95 + // created by handleEnrollStart, which already enforces the same 96 + // check, but a stale pending row from before the verifier was wired 97 + // or a path that bypasses /admin/enroll-start altogether (e.g. an 98 + // admin-driven test fixture replay) must not let a verified DID be 99 + // swapped at completion time. Re-check here against pending.DID so 100 + // the *member-creation* moment is also gated. 101 + if a.enrollAuthVerifier != nil { 102 + verifiedDID, ok := a.enrollAuthVerifier.VerifyAuthCookie(r) 103 + if !ok { 104 + log.Printf("admin.enroll.no_oauth: pending_did=%s", pending.DID) 105 + return nil, enrollErrf(http.StatusForbidden, "identity verification required — sign in with your handle before completing enrollment") 106 + } 107 + if !strings.EqualFold(verifiedDID, pending.DID) { 108 + log.Printf("admin.enroll.did_mismatch: pending=%s verified=%s", pending.DID, verifiedDID) 109 + return nil, enrollErrf(http.StatusForbidden, "verified identity does not match the pending enrollment") 110 + } 111 + } 112 + 113 + if time.Now().UTC().After(pending.ExpiresAt) { 114 + // 410 Gone signals "the thing you're pointing at existed but is no 115 + // longer retrievable" — precisely the pending-expired semantic. 116 + // Clean the row so the same token can't be retried. 117 + _ = a.store.DeletePendingEnrollment(ctx, token) 118 + return nil, enrollErrf(http.StatusGone, "enrollment token expired — start over") 119 + } 120 + 121 + if err := a.domainVerifier.Verify(ctx, pending.Domain, token); err != nil { 122 + log.Printf("admin.enroll: did=%s domain=%s dns_verify_error=%v", pending.DID, pending.Domain, err) 123 + switch { 124 + case errors.Is(err, enroll.ErrNoTXTRecord): 125 + return nil, enrollErrf(http.StatusForbidden, "no atmos-verify TXT record found at _atmos-enroll.%s — publish the record and retry", pending.Domain) 126 + case errors.Is(err, enroll.ErrTokenMismatch): 127 + return nil, enrollErrf(http.StatusForbidden, "TXT record does not contain the expected token — double-check the value") 128 + default: 129 + return nil, enrollErrf(http.StatusServiceUnavailable, "DNS lookup failed: %v — retry in a moment", err) 130 + } 131 + } 132 + 133 + log.Printf("admin.enroll: did=%s domain=%s dns_verified=true", pending.DID, pending.Domain) 134 + 135 + // Consume the pending row now that verification succeeded. Don't 136 + // fail the enrollment if cleanup errors — CleanExpired will sweep 137 + // it later and the unique-domain constraint prevents reuse. 
138 + if err := a.store.DeletePendingEnrollment(ctx, token); err != nil { 139 + log.Printf("admin.enroll: did=%s domain=%s pending_cleanup_error=%v", pending.DID, pending.Domain, err) 140 + } 141 + return pending, nil 142 + } 143 + 144 + // --- Phase 3: authorize ----------------------------------------------------- 145 + 146 + // checkDomainAvailable confirms the domain is unclaimed and the DID hasn't 147 + // already maxed out its per-account domain quota. Both checks run on every 148 + // enroll completion because handleEnrollStart's check is racy: a second 149 + // enrollment could complete between start and verify if the DID raced to 150 + // acquire domains via another browser tab or API caller. 151 + func (a *API) checkDomainAvailable(ctx context.Context, did, domain string) *enrollHTTPError { 152 + existing, err := a.store.GetMemberDomain(ctx, domain) 153 + if err != nil { 154 + log.Printf("admin.enroll: did=%s error=%v", did, err) 155 + return enrollErrf(http.StatusInternalServerError, "internal error") 156 + } 157 + if existing != nil { 158 + if existing.DID == did { 159 + return enrollErrf(http.StatusConflict, "You've already enrolled this domain. Sign in at /account to manage it.") 160 + } 161 + return enrollErrf(http.StatusConflict, "This domain is registered to another account.") 162 + } 163 + 164 + owned, err := a.store.ListMemberDomains(ctx, did) 165 + if err != nil { 166 + log.Printf("admin.enroll: did=%s list_domains_error=%v", did, err) 167 + return enrollErrf(http.StatusInternalServerError, "internal error") 168 + } 169 + if len(owned) >= maxDomainsPerMember { 170 + return enrollErrf(http.StatusConflict, "domain limit reached — your account currently supports up to %d sending domains", maxDomainsPerMember) 171 + } 172 + return nil 173 + } 174 + 175 + // --- Phase 4: provision ----------------------------------------------------- 176 + 177 + // enrollProvisionResult bundles the records and key material the persist 178 + // + respond phases need. IsNewDID is true when GetMember returned nil, 179 + // signalling that the persist step should also insert a member row and 180 + // dispatch should fire the operator-ping. 181 + type enrollProvisionResult struct { 182 + Member *relaystore.Member // nil when adding a domain to an existing DID 183 + Domain *relaystore.MemberDomain 184 + APIKey string 185 + APIKeyHash []byte 186 + DKIMKeys *relay.DKIMKeys 187 + DKIMSelector string 188 + IsNewDID bool 189 + } 190 + 191 + // provisionMemberAndDomain generates the API key, DKIM keypair, and 192 + // builds the member + domain records for atomic insert. Pure aside from 193 + // the DID lookup against the store and the random-key generation; the 194 + // returned result is what the persist + dispatch + respond phases need. 
195 + func (a *API) provisionMemberAndDomain(ctx context.Context, pending *relaystore.PendingEnrollment, forwardTo string) (*enrollProvisionResult, *enrollHTTPError) { 196 + existing, err := a.store.GetMember(ctx, pending.DID) 197 + if err != nil { 198 + log.Printf("admin.enroll: did=%s error=%v", pending.DID, err) 199 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 200 + } 201 + 202 + apiKey, err := relay.GenerateAPIKey() 203 + if err != nil { 204 + log.Printf("admin.enroll: did=%s error=generate_api_key %v", pending.DID, err) 205 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 206 + } 207 + apiKeyHash, err := relay.HashAPIKey(apiKey) 208 + if err != nil { 209 + log.Printf("admin.enroll: did=%s error=hash_api_key %v", pending.DID, err) 210 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 211 + } 212 + 213 + selector := fmt.Sprintf("atmos%s", time.Now().UTC().Format("20060102")) 214 + dkimKeys, err := relay.GenerateDKIMKeys(selector) 215 + if err != nil { 216 + log.Printf("admin.enroll: did=%s error=generate_dkim %v", pending.DID, err) 217 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 218 + } 219 + 220 + rsaBytes, err := x509.MarshalPKCS8PrivateKey(dkimKeys.RSAPriv) 221 + if err != nil { 222 + log.Printf("admin.enroll: did=%s error=marshal_rsa %v", pending.DID, err) 223 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 224 + } 225 + edBytes, err := x509.MarshalPKCS8PrivateKey(dkimKeys.EdPriv) 226 + if err != nil { 227 + log.Printf("admin.enroll: did=%s error=marshal_ed %v", pending.DID, err) 228 + return nil, enrollErrf(http.StatusInternalServerError, "internal error") 229 + } 230 + 231 + now := time.Now().UTC() 232 + var member *relaystore.Member 233 + isNewDID := existing == nil 234 + if isNewDID { 235 + if !pending.TermsAccepted { 236 + return nil, enrollErrf(http.StatusBadRequest, "terms acceptance required") 237 + } 238 + member = &relaystore.Member{ 239 + DID: pending.DID, 240 + Status: relaystore.StatusPending, 241 + DIDVerified: false, 242 + TermsAcceptedAt: now, 243 + TermsVersion: relaystore.CurrentTermsVersion, 244 + HourlyLimit: 100, 245 + DailyLimit: 1000, 246 + CreatedAt: now, 247 + UpdatedAt: now, 248 + } 249 + } 250 + 251 + domainRecord := &relaystore.MemberDomain{ 252 + Domain: pending.Domain, 253 + DID: pending.DID, 254 + APIKeyHash: apiKeyHash, 255 + DKIMRSAPriv: rsaBytes, 256 + DKIMEdPriv: edBytes, 257 + DKIMSelector: selector, 258 + ForwardTo: forwardTo, 259 + ContactEmail: pending.ContactEmail, 260 + CreatedAt: now, 261 + } 262 + 263 + return &enrollProvisionResult{ 264 + Member: member, 265 + Domain: domainRecord, 266 + APIKey: apiKey, 267 + APIKeyHash: apiKeyHash, 268 + DKIMKeys: dkimKeys, 269 + DKIMSelector: selector, 270 + IsNewDID: isNewDID, 271 + }, nil 272 + } 273 + 274 + // --- Phase 6: dispatch ------------------------------------------------------ 275 + 276 + // dispatchEnrollNotifications fires the post-persist side effects: 277 + // operator-ping email (only for new DIDs to avoid notification fatigue), 278 + // webhook event, and contact-email verification. Errors are best-effort 279 + // because the enrollment itself has already succeeded. 
280 + func (a *API) dispatchEnrollNotifications(pending *relaystore.PendingEnrollment, isNewDID bool) { 281 + if isNewDID { 282 + go a.FireOperatorPing(context.Background(), pending.DID, pending.Domain, pending.ContactEmail) 283 + a.notifyEvent(notify.KindMemberPending, pending.DID, pending.Domain, "", pending.ContactEmail) 284 + } else { 285 + a.notifyEvent(notify.KindMemberDomainAdded, pending.DID, pending.Domain, "", pending.ContactEmail) 286 + } 287 + 288 + if pending.ContactEmail != "" { 289 + go a.TriggerEmailVerification(context.Background(), pending.Domain, pending.ContactEmail) 290 + } 291 + } 292 + 293 + // --- Phase 7: respond ------------------------------------------------------- 294 + 295 + // buildEnrollResponse assembles the JSON response body, including SPF 296 + // alignment when configured. Pure given inputs; ctx is only used for the 297 + // SPF lookup. 298 + func (a *API) buildEnrollResponse(ctx context.Context, p *enrollProvisionResult, domain string) EnrollResponse { 299 + var spfResult *SPFAlignmentResponse 300 + if a.spfChecker != nil { 301 + result := a.spfChecker.CheckAlignment(ctx, domain) 302 + spfResult = &SPFAlignmentResponse{ 303 + Aligned: result.Aligned, 304 + Failures: result.Failures, 305 + } 306 + if !result.Aligned { 307 + log.Printf("admin.enroll.spf_warning: did=%s domain=%s failures=%v", p.Domain.DID, domain, result.Failures) 308 + } 309 + } 310 + 311 + return EnrollResponse{ 312 + DID: p.Domain.DID, 313 + APIKey: p.APIKey, 314 + DKIM: DKIMResponse{ 315 + Selector: p.DKIMSelector, 316 + RSASelector: p.DKIMKeys.RSASelectorName(), 317 + EdSelector: p.DKIMKeys.EdSelectorName(), 318 + RSARecord: p.DKIMKeys.RSADNSRecord(), 319 + EdRecord: p.DKIMKeys.EdDNSRecord(), 320 + RSADNSName: fmt.Sprintf("%s._domainkey.%s", p.DKIMKeys.RSASelectorName(), domain), 321 + EdDNSName: fmt.Sprintf("%s._domainkey.%s", p.DKIMKeys.EdSelectorName(), domain), 322 + }, 323 + SMTP: SMTPResponse{ 324 + Host: "smtp." + a.domain, 325 + Port: 587, 326 + }, 327 + SPFAlignment: spfResult, 328 + apiKeyHash: p.APIKeyHash, 329 + } 330 + }
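handleEnroll itself is outside this hunk, so the following is only a sketch of how the orchestration could chain the phase helpers above. The persist step (persistEnrollment below) and the final JSON write are assumptions for illustration; only the helper signatures are taken from this file.

// Illustrative orchestration sketch; not the repo's actual handleEnroll.
// persistEnrollment is an assumed name standing in for the store writes
// that make up phase 5.
func (a *API) handleEnrollSketch(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	fail := func(e *enrollHTTPError) { http.Error(w, e.Message, e.Status) }

	token, forwardTo, herr := validateEnrollRequest(r) // phase 1: validate
	if herr != nil {
		fail(herr)
		return
	}
	pending, herr := a.loadAndVerifyPending(ctx, r, token) // phase 2: load + verify
	if herr != nil {
		fail(herr)
		return
	}
	if herr = a.checkDomainAvailable(ctx, pending.DID, pending.Domain); herr != nil { // phase 3: authorize
		fail(herr)
		return
	}
	prov, herr := a.provisionMemberAndDomain(ctx, pending, forwardTo) // phase 4: provision
	if herr != nil {
		fail(herr)
		return
	}
	if herr = a.persistEnrollment(ctx, prov); herr != nil { // phase 5: persist (assumed helper)
		fail(herr)
		return
	}
	a.dispatchEnrollNotifications(pending, prov.IsNewDID) // phase 6: dispatch

	resp := a.buildEnrollResponse(ctx, prov, pending.Domain) // phase 7: respond
	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(resp)
}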
+113
internal/admin/enroll_phases_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package admin 4 + 5 + import ( 6 + "net/http" 7 + "net/http/httptest" 8 + "strings" 9 + "testing" 10 + ) 11 + 12 + func TestValidateEnrollRequest_Valid(t *testing.T) { 13 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll", strings.NewReader(`{"token":"abc123"}`)) 14 + token, forwardTo, herr := validateEnrollRequest(r) 15 + if herr != nil { 16 + t.Fatalf("unexpected error: %v", herr) 17 + } 18 + if token != "abc123" { 19 + t.Errorf("token=%q, want abc123", token) 20 + } 21 + if forwardTo != "" { 22 + t.Errorf("forwardTo=%q, want empty", forwardTo) 23 + } 24 + } 25 + 26 + func TestValidateEnrollRequest_ForwardToValid(t *testing.T) { 27 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll?forward_to=ops@example.com", 28 + strings.NewReader(`{"token":"t"}`)) 29 + token, forwardTo, herr := validateEnrollRequest(r) 30 + if herr != nil { 31 + t.Fatalf("err=%v", herr) 32 + } 33 + if token != "t" { 34 + t.Errorf("token=%q", token) 35 + } 36 + if forwardTo != "ops@example.com" { 37 + t.Errorf("forwardTo=%q, want ops@example.com", forwardTo) 38 + } 39 + } 40 + 41 + func TestValidateEnrollRequest_ForwardToInvalid(t *testing.T) { 42 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll?forward_to=not-an-email", 43 + strings.NewReader(`{"token":"t"}`)) 44 + _, _, herr := validateEnrollRequest(r) 45 + if herr == nil { 46 + t.Fatal("expected forward_to validation error") 47 + } 48 + if herr.Status != http.StatusBadRequest { 49 + t.Errorf("status=%d, want 400", herr.Status) 50 + } 51 + } 52 + 53 + func TestValidateEnrollRequest_EmptyBody(t *testing.T) { 54 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll", strings.NewReader("")) 55 + _, _, herr := validateEnrollRequest(r) 56 + if herr == nil { 57 + t.Fatal("expected empty-body error") 58 + } 59 + if herr.Status != http.StatusBadRequest { 60 + t.Errorf("status=%d, want 400", herr.Status) 61 + } 62 + if !strings.Contains(herr.Message, "token") { 63 + t.Errorf("message %q should mention token", herr.Message) 64 + } 65 + } 66 + 67 + func TestValidateEnrollRequest_InvalidJSON(t *testing.T) { 68 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll", strings.NewReader(`{not json`)) 69 + _, _, herr := validateEnrollRequest(r) 70 + if herr == nil { 71 + t.Fatal("expected JSON error") 72 + } 73 + if herr.Status != http.StatusBadRequest { 74 + t.Errorf("status=%d, want 400", herr.Status) 75 + } 76 + } 77 + 78 + func TestValidateEnrollRequest_TokenMissing(t *testing.T) { 79 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll", strings.NewReader(`{"token":""}`)) 80 + _, _, herr := validateEnrollRequest(r) 81 + if herr == nil { 82 + t.Fatal("expected token-required error") 83 + } 84 + if herr.Status != http.StatusBadRequest { 85 + t.Errorf("status=%d", herr.Status) 86 + } 87 + } 88 + 89 + func TestValidateEnrollRequest_BodyOver4KiB(t *testing.T) { 90 + // 5 KiB of payload — io.LimitReader truncates so the JSON unmarshals 91 + // to whatever fits, which here will be invalid JSON. We just want to 92 + // confirm we don't OOM or read unbounded input. 
93 + huge := `{"token":"` + strings.Repeat("a", 5000) + `"}` 94 + r := httptest.NewRequest(http.MethodPost, "/admin/enroll", strings.NewReader(huge)) 95 + _, _, herr := validateEnrollRequest(r) 96 + if herr == nil { 97 + t.Fatal("expected bounded body to surface a JSON error after truncation") 98 + } 99 + } 100 + 101 + // TestEnrollHTTPError_ImplementsError pins that *enrollHTTPError satisfies 102 + // the error interface so it can be wrapped/unwrapped if a future caller 103 + // needs to do so. 104 + func TestEnrollHTTPError_ImplementsError(t *testing.T) { 105 + var _ error = (*enrollHTTPError)(nil) 106 + herr := enrollErrf(http.StatusBadRequest, "bad %s", "input") 107 + if herr.Error() != "bad input" { 108 + t.Errorf("Error()=%q, want %q", herr.Error(), "bad input") 109 + } 110 + if herr.Status != http.StatusBadRequest { 111 + t.Errorf("Status=%d", herr.Status) 112 + } 113 + }
+9 -2
internal/admin/ui/attest.go
··· 43 43 // when the shared OAuth callback sees a session with no Attestation 44 44 // payload — signalling the flow was initiated for credential recovery 45 45 // rather than enrollment. 46 + // 47 + // UA-binding is required: the only entry point on this interface 48 + // takes the User-Agent of the browser that completed OAuth, so a 49 + // leaked cookie cannot be replayed from a different browser. The 50 + // legacy no-UA helper (IssueRecoveryTicket on *RecoverHandler) is 51 + // retained for tests but deliberately NOT exposed here so production 52 + // callers can't accidentally bypass the binding (#212). 46 53 type RecoveryIssuer interface { 47 - IssueRecoveryTicket(did, domain string) string 54 + IssueRecoveryTicketWithUA(did, domain, ua string) string 48 55 } 49 56 50 57 // DIDHandleResolver resolves a DID to its atproto handle. Used by the ··· 233 240 if h.funnel != nil { 234 241 h.funnel.RecordOAuthCallback("recovery", h.resolveHandle(ctx, sess.AccountDID())) 235 242 } 236 - target := h.recoveryIssuer.IssueRecoveryTicket(sess.AccountDID(), sess.Domain()) 243 + target := h.recoveryIssuer.IssueRecoveryTicketWithUA(sess.AccountDID(), sess.Domain(), r.UserAgent()) 237 244 log.Printf("attest.callback: did=%s domain=%s handoff=recovery target=%s", 238 245 sess.AccountDID(), sess.Domain(), target) 239 246 http.Redirect(w, r, target, http.StatusFound)
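The ticket store behind IssueRecoveryTicketWithUA lives in recover.go and is not part of this hunk. Purely as an illustration of the binding described above (hash the issuing browser's User-Agent into the ticket, refuse lookups whose User-Agent no longer matches), here is a minimal standalone sketch; every name and the 15-minute expiry are assumptions.

// Standalone illustration of UA-bound tickets; not the repo's implementation.
import (
	"crypto/rand"
	"crypto/sha256"
	"encoding/hex"
	"sync"
	"time"
)

type uaBoundTicket struct {
	did, domain string
	uaHash      [32]byte // SHA-256 of the issuing browser's User-Agent
	expiry      time.Time
}

type ticketStore struct {
	mu sync.Mutex
	m  map[string]uaBoundTicket
}

func newTicketStore() *ticketStore {
	return &ticketStore{m: map[string]uaBoundTicket{}}
}

func newTicketID() string {
	b := make([]byte, 16)
	_, _ = rand.Read(b)
	return hex.EncodeToString(b)
}

// Issue binds the ticket to the User-Agent seen at OAuth-callback time.
func (s *ticketStore) Issue(did, domain, ua string) string {
	id := newTicketID()
	s.mu.Lock()
	s.m[id] = uaBoundTicket{
		did:    did,
		domain: domain,
		uaHash: sha256.Sum256([]byte(ua)),
		expiry: time.Now().Add(15 * time.Minute),
	}
	s.mu.Unlock()
	return id
}

// Lookup resolves the ticket only while it is unexpired and the presenting
// browser's User-Agent hashes to the same value it was issued under.
func (s *ticketStore) Lookup(id, ua string) (uaBoundTicket, bool) {
	s.mu.Lock()
	t, ok := s.m[id]
	s.mu.Unlock()
	if !ok || time.Now().After(t.expiry) || t.uaHash != sha256.Sum256([]byte(ua)) {
		return uaBoundTicket{}, false
	}
	return t, true
}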
+80 -13
internal/admin/ui/enroll.go
··· 13 13 "io" 14 14 "log" 15 15 "net/http" 16 - "net/http/httptest" 17 16 "strings" 18 17 "sync" 19 18 "time" ··· 122 121 h.mux.HandleFunc("/privacy", h.handlePrivacy) 123 122 h.mux.HandleFunc("/aup", h.handleAUP) 124 123 h.mux.HandleFunc("/about", h.handleAbout) 124 + h.mux.HandleFunc("/faq", h.handleFAQ) 125 125 return h 126 126 } 127 127 ··· 261 261 }) 262 262 } 263 263 264 + func (h *EnrollHandler) handleFAQ(w http.ResponseWriter, r *http.Request) { 265 + h.staticPage(w, r, func(w http.ResponseWriter, r *http.Request) { 266 + _ = templates.FAQPage().Render(r.Context(), w) 267 + }) 268 + } 269 + 264 270 // handleResolve takes a handle and returns the resolved DID as JSON. 265 271 // Used by the landing-page JS to turn `scottlanoue.com` into 266 272 // `did:plc:…` before the user submits the form. ··· 445 451 "contactEmail": contactEmail, 446 452 "termsAccepted": termsAccepted, 447 453 }) 448 - resp := h.proxyAdminInner(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body)) 454 + resp := h.proxyAdminInner(http.MethodPost, "/admin/enroll-start", bytes.NewReader(body), r) 449 455 if resp.Code != http.StatusOK { 450 456 msg := strings.TrimSpace(resp.Body.String()) 451 457 if msg == "" { ··· 500 506 } 501 507 502 508 body, _ := json.Marshal(map[string]string{"token": token}) 503 - resp := h.proxyAdminInner(http.MethodPost, "/admin/enroll", bytes.NewReader(body)) 509 + resp := h.proxyAdminInner(http.MethodPost, "/admin/enroll", bytes.NewReader(body), r) 504 510 if resp.Code != http.StatusOK { 505 511 msg := strings.TrimSpace(resp.Body.String()) 506 512 if msg == "" { ··· 752 758 _ = templates.EnrollError(message).Render(r.Context(), w) 753 759 } 754 760 755 - // proxyAdminInner invokes the admin API in-process via httptest. We never 756 - // forward the caller's Authorization header — the admin API's enrollment 757 - // endpoints do their own verification (DNS TXT ownership), so forwarding 758 - // caller auth is unnecessary and would risk leaking admin credentials 759 - // from other contexts into the public path. 760 - func (h *EnrollHandler) proxyAdminInner(method, target string, body io.Reader) *httptest.ResponseRecorder { 761 - req := httptest.NewRequest(method, target, body) 761 + // proxyAdminInner invokes the admin API in-process. We never forward the 762 + // caller's Authorization header — the admin API's enrollment endpoints do 763 + // their own verification (DNS TXT ownership), so forwarding caller auth 764 + // is unnecessary and would risk leaking admin credentials from other 765 + // contexts into the public path. 766 + // 767 + // Cookie + User-Agent are forwarded so the inner admin API can look up 768 + // the enroll-auth ticket the public UI set after a successful AT Proto 769 + // OAuth round-trip — the central defense for #207. 770 + // 771 + // RemoteAddr is also forwarded so the admin API's per-IP enroll-start 772 + // rate limiter sees the real public client IP. Without this, every 773 + // public enrollment request would share a single rate-limit bucket and 774 + // a single attacker could exhaust it for all legitimate users from any 775 + // IP — closes #211. 776 + // 777 + // This used to construct an httptest.NewRequest + httptest.ResponseRecorder 778 + // in the production call chain (#222). The dependency on net/http/httptest 779 + // from non-test code masked the rate-limiter bypass that became #211 and 780 + // made the call site inscrutable to readers expecting test-only types not 781 + // to leak. 
We now use http.NewRequestWithContext + an in-package response 782 + // writer (inMemoryResponseWriter) so the type signatures match the rest 783 + // of the production stack. 784 + func (h *EnrollHandler) proxyAdminInner(method, target string, body io.Reader, src *http.Request) *adminProxyResponse { 785 + ctx := context.Background() 786 + if src != nil { 787 + ctx = src.Context() 788 + } 789 + req, err := http.NewRequestWithContext(ctx, method, target, body) 790 + if err != nil { 791 + // method/target are package-internal constants; a build error here 792 + // indicates a programming bug, not a runtime condition. Surface it 793 + // as a 500 so the wrapping handler renders an inline error rather 794 + // than panicking. 795 + return &adminProxyResponse{ 796 + Code: http.StatusInternalServerError, 797 + Body: bytes.NewBufferString("internal error: build admin request"), 798 + header: http.Header{}, 799 + } 800 + } 762 801 req.Header.Set("Content-Type", "application/json") 763 - rr := httptest.NewRecorder() 764 - h.adminAPI.ServeHTTP(rr, req) 765 - return rr 802 + if src != nil { 803 + if cookie := src.Header.Get("Cookie"); cookie != "" { 804 + req.Header.Set("Cookie", cookie) 805 + } 806 + if ua := src.UserAgent(); ua != "" { 807 + req.Header.Set("User-Agent", ua) 808 + } 809 + if src.RemoteAddr != "" { 810 + req.RemoteAddr = src.RemoteAddr 811 + } 812 + } 813 + rw := newInMemoryResponseWriter() 814 + h.adminAPI.ServeHTTP(rw, req) 815 + return rw.snapshot() 816 + } 817 + 818 + // VerifyAuthCookie implements admin.EnrollAuthVerifier. Returns the DID 819 + // proven by the most recent successful AT Proto OAuth round-trip if the 820 + // caller presents a valid enroll-auth ticket cookie, or "" / false 821 + // otherwise. The cookie's UA-binding is also enforced so a stolen 822 + // cookie can't be replayed from a different browser. 823 + func (h *EnrollHandler) VerifyAuthCookie(r *http.Request) (string, bool) { 824 + id, ok := enrollAuthTicketFromCookie(r) 825 + if !ok { 826 + return "", false 827 + } 828 + ticket, ok := h.lookupEnrollAuthTicket(id, r.UserAgent()) 829 + if !ok { 830 + return "", false 831 + } 832 + return ticket.did, true 766 833 }
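The per-IP enroll-start limiter that consumes the forwarded RemoteAddr is implemented on the admin-API side and is not shown in this section. As a rough sketch of why keying on the real client address matters, here is a hypothetical token-bucket map keyed by the host part of RemoteAddr; golang.org/x/time/rate is an assumed dependency here, not necessarily what the repo uses.

// Hypothetical IP-keyed limiter sketch; not the repo's implementation.
import (
	"net"
	"net/http"
	"sync"

	"golang.org/x/time/rate"
)

type ipLimiter struct {
	mu      sync.Mutex
	buckets map[string]*rate.Limiter
	rps     rate.Limit
	burst   int
}

func newIPLimiter(rps rate.Limit, burst int) *ipLimiter {
	return &ipLimiter{buckets: map[string]*rate.Limiter{}, rps: rps, burst: burst}
}

// Allow keys the token bucket by the client IP taken from RemoteAddr. If
// every request arrived with the same synthetic RemoteAddr (the pre-#211
// behavior), all callers would drain one shared bucket.
func (l *ipLimiter) Allow(r *http.Request) bool {
	ip, _, err := net.SplitHostPort(r.RemoteAddr)
	if err != nil {
		ip = r.RemoteAddr // already a bare IP, or unparseable; fall back
	}
	l.mu.Lock()
	lim, ok := l.buckets[ip]
	if !ok {
		lim = rate.NewLimiter(l.rps, l.burst)
		l.buckets[ip] = lim
	}
	l.mu.Unlock()
	return lim.Allow()
}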
+96 -2
internal/admin/ui/enroll_test.go
··· 26 26 lastAuth string 27 27 lastPath string 28 28 lastBody string 29 + lastRemoteAddr string 30 + lastCookie string 29 31 gotEnrollStart bool 30 32 gotEnroll bool 31 33 } ··· 33 35 func (f *fakeAdminAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) { 34 36 f.lastAuth = r.Header.Get("Authorization") 35 37 f.lastPath = r.URL.Path + "?" + r.URL.RawQuery 38 + f.lastRemoteAddr = r.RemoteAddr 39 + f.lastCookie = r.Header.Get("Cookie") 36 40 if r.Body != nil { 37 41 b, _ := io.ReadAll(r.Body) 38 42 f.lastBody = string(b) ··· 561 565 562 566 func TestStaticPage_HEADReturns200(t *testing.T) { 563 567 h := NewEnrollHandler(&fakeAdminAPI{}, nil) 564 - for _, p := range []string{"/", "/terms", "/privacy", "/aup", "/about"} { 568 + for _, p := range []string{"/", "/terms", "/privacy", "/aup", "/about", "/faq"} { 565 569 req := httptest.NewRequest(http.MethodHead, p, nil) 566 570 w := httptest.NewRecorder() 567 571 h.ServeHTTP(w, req) ··· 675 679 } 676 680 } 677 681 682 + func TestFAQPage_ServesHTML(t *testing.T) { 683 + h := NewEnrollHandler(&fakeAdminAPI{}, nil) 684 + req := httptest.NewRequest(http.MethodGet, "/faq", nil) 685 + w := httptest.NewRecorder() 686 + h.ServeHTTP(w, req) 687 + 688 + if w.Code != http.StatusOK { 689 + t.Fatalf("status = %d, want 200", w.Code) 690 + } 691 + body := w.Body.String() 692 + if !strings.Contains(body, "FAQ") { 693 + t.Error("faq page should contain 'FAQ'") 694 + } 695 + if !strings.Contains(body, "Atmosphere Mail LLC") { 696 + t.Error("faq page must identify the legal entity") 697 + } 698 + // The FAQ must answer the three questions prospective members ask most. 699 + for _, required := range []string{"free", "trust", "commercial relay"} { 700 + if !strings.Contains(strings.ToLower(body), required) { 701 + t.Errorf("faq page must mention %q", required) 702 + } 703 + } 704 + } 705 + 678 706 // TestDropCapOnlyOnLanding pins the Round 2 design decision that the 679 707 // drop-cap brand mark is a landing-page-only element. Putting it on every 680 708 // page dilutes the signature; this test guards against regressing. ··· 691 719 692 720 // Legal + about pages must not carry a drop-cap — they are reference 693 721 // documents, not the brand moment. 694 - for _, p := range []string{"/terms", "/privacy", "/aup", "/about"} { 722 + for _, p := range []string{"/terms", "/privacy", "/aup", "/about", "/faq"} { 695 723 req := httptest.NewRequest(http.MethodGet, p, nil) 696 724 w := httptest.NewRecorder() 697 725 h.ServeHTTP(w, req) ··· 954 982 t.Errorf("body should report false flags, got %q", w.Body.String()) 955 983 } 956 984 } 985 + 986 + // TestEnrollStart_ForwardsRemoteAddr proves the public /enroll/start 987 + // path delivers the caller's real RemoteAddr to the inner admin API 988 + // so the per-IP enroll-start rate limiter sees distinct buckets per 989 + // source. Without this, every public enrollment request shares a 990 + // single bucket (the httptest synthetic default), letting one 991 + // attacker exhaust the limit for everyone — closes #211. 
992 + func TestEnrollStart_ForwardsRemoteAddr(t *testing.T) { 993 + fake := &fakeAdminAPI{ 994 + enrollStartStatus: http.StatusOK, 995 + enrollStartBody: `{"token":"tok","dnsName":"_atmos-enroll.x.example","dnsValue":"atmos-verify=tok","expiresAt":"2026-04-17T12:00:00Z"}`, 996 + } 997 + h := NewEnrollHandler(fake, nil) 998 + 999 + form := strings.NewReader("did=did:plc:testtesttesttesttest&domain=x.example&terms_accepted=on") 1000 + req := httptest.NewRequest(http.MethodPost, "/enroll/start", form) 1001 + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 1002 + // Realistic public-side RemoteAddr — what would arrive after 1003 + // Tailscale Serve termination on the public listener. 1004 + req.RemoteAddr = "203.0.113.42:54321" 1005 + w := httptest.NewRecorder() 1006 + h.ServeHTTP(w, req) 1007 + 1008 + if w.Code != http.StatusOK { 1009 + t.Fatalf("status = %d body = %q", w.Code, w.Body.String()) 1010 + } 1011 + if fake.lastRemoteAddr != "203.0.113.42:54321" { 1012 + t.Errorf("admin API saw RemoteAddr=%q, want %q (proxyAdminInner dropped real client IP — rate limiter would treat all callers as one)", 1013 + fake.lastRemoteAddr, "203.0.113.42:54321") 1014 + } 1015 + } 1016 + 1017 + // TestEnrollStart_DistinctIPsRouteToDistinctBuckets demonstrates the 1018 + // per-IP isolation property end-to-end: two requests from different 1019 + // public IPs both reach the admin API with their original RemoteAddr 1020 + // preserved, so an in-process IP-keyed limiter can distinguish them. 1021 + func TestEnrollStart_DistinctIPsRouteToDistinctBuckets(t *testing.T) { 1022 + fake := &fakeAdminAPI{ 1023 + enrollStartStatus: http.StatusOK, 1024 + enrollStartBody: `{"token":"tok","dnsName":"_atmos-enroll.x.example","dnsValue":"atmos-verify=tok","expiresAt":"2026-04-17T12:00:00Z"}`, 1025 + } 1026 + h := NewEnrollHandler(fake, nil) 1027 + 1028 + send := func(remote string) string { 1029 + t.Helper() 1030 + form := strings.NewReader("did=did:plc:testtesttesttesttest&domain=x.example&terms_accepted=on") 1031 + req := httptest.NewRequest(http.MethodPost, "/enroll/start", form) 1032 + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 1033 + req.RemoteAddr = remote 1034 + w := httptest.NewRecorder() 1035 + h.ServeHTTP(w, req) 1036 + if w.Code != http.StatusOK { 1037 + t.Fatalf("status = %d body = %q", w.Code, w.Body.String()) 1038 + } 1039 + return fake.lastRemoteAddr 1040 + } 1041 + 1042 + a := send("198.51.100.1:1111") 1043 + b := send("198.51.100.2:2222") 1044 + if a == b { 1045 + t.Errorf("both requests delivered the same RemoteAddr=%q — limiter cannot distinguish them", a) 1046 + } 1047 + if a != "198.51.100.1:1111" || b != "198.51.100.2:2222" { 1048 + t.Errorf("RemoteAddr forwarding garbled: a=%q b=%q", a, b) 1049 + } 1050 + }
+18
internal/admin/ui/handlers.go
··· 253 253 h.handleMemberRegenerateKeyAction(w, r, did) 254 254 case action == "warmup" && r.Method == http.MethodPost: 255 255 h.handleMemberWarmupAction(w, r, did) 256 + case action == "delete" && r.Method == http.MethodDelete: 257 + h.handleMemberDeleteAction(w, r, did) 256 258 default: 257 259 http.NotFound(w, r) 258 260 } ··· 492 494 w.Header().Set("Content-Type", "text/html; charset=utf-8") 493 495 _ = templates.WarmupResult(sent, failed, errors).Render(r.Context(), w) 494 496 } 497 + 498 + func (h *Handler) handleMemberDeleteAction(w http.ResponseWriter, r *http.Request, did string) { 499 + if err := h.store.DeleteMember(r.Context(), did); err != nil { 500 + log.Printf("ui.delete: did=%s error=%v", did, err) 501 + http.Error(w, "delete failed: "+err.Error(), http.StatusInternalServerError) 502 + return 503 + } 504 + 505 + log.Printf("ui.delete: did=%s permanently_deleted=true", did) 506 + if h.onStateChange != nil { 507 + h.onStateChange("member_deleted", did, "permanently deleted via dashboard") 508 + } 509 + 510 + w.Header().Set("HX-Redirect", "/ui/members") 511 + w.WriteHeader(http.StatusOK) 512 + }
+77
internal/admin/ui/inproc.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package ui 4 + 5 + import ( 6 + "bytes" 7 + "net/http" 8 + ) 9 + 10 + // adminProxyResponse captures the response from invoking the admin API 11 + // in-process. Replaces *httptest.ResponseRecorder so test-only types stay 12 + // out of the production call chain (#222). 13 + // 14 + // Field names mirror the legacy ResponseRecorder API (`Code`, `Body`) so 15 + // callers that read `resp.Code` and `resp.Body.String()` keep working 16 + // without churn. 17 + type adminProxyResponse struct { 18 + Code int 19 + Body *bytes.Buffer 20 + header http.Header 21 + } 22 + 23 + // Header returns the response headers the inner admin handler set. Not 24 + // every caller needs them; exposed for parity with ResponseRecorder. 25 + func (r *adminProxyResponse) Header() http.Header { return r.header } 26 + 27 + // inMemoryResponseWriter is a minimal real http.ResponseWriter the inner 28 + // admin handler writes to. Unlike httptest.ResponseRecorder this lives in 29 + // the regular package, so production code paths no longer depend on 30 + // net/http/httptest just to invoke an in-process handler. 31 + // 32 + // Behavior matches what stdlib serves: WriteHeader is sticky (first call 33 + // wins), Write to a 0-status writer implies 200 OK, and Header() returns 34 + // a mutable map up until WriteHeader fires. 35 + type inMemoryResponseWriter struct { 36 + code int 37 + body *bytes.Buffer 38 + headers http.Header 39 + written bool 40 + } 41 + 42 + func newInMemoryResponseWriter() *inMemoryResponseWriter { 43 + return &inMemoryResponseWriter{ 44 + body: &bytes.Buffer{}, 45 + headers: http.Header{}, 46 + } 47 + } 48 + 49 + func (w *inMemoryResponseWriter) Header() http.Header { return w.headers } 50 + 51 + func (w *inMemoryResponseWriter) WriteHeader(code int) { 52 + if w.written { 53 + return 54 + } 55 + w.code = code 56 + w.written = true 57 + } 58 + 59 + func (w *inMemoryResponseWriter) Write(p []byte) (int, error) { 60 + if !w.written { 61 + w.code = http.StatusOK 62 + w.written = true 63 + } 64 + return w.body.Write(p) 65 + } 66 + 67 + // snapshot freezes the writer state into an adminProxyResponse the 68 + // caller can read without further mutation. 69 + func (w *inMemoryResponseWriter) snapshot() *adminProxyResponse { 70 + code := w.code 71 + if code == 0 { 72 + // Handler returned without writing anything — same convention as 73 + // net/http: empty body, 200. 74 + code = http.StatusOK 75 + } 76 + return &adminProxyResponse{Code: code, Body: w.body, header: w.headers} 77 + }
+75
internal/admin/ui/inproc_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package ui 4 + 5 + import ( 6 + "net/http" 7 + "reflect" 8 + "strings" 9 + "testing" 10 + ) 11 + 12 + func TestInMemoryResponseWriter_DefaultsTo200(t *testing.T) { 13 + w := newInMemoryResponseWriter() 14 + _, _ = w.Write([]byte("hello")) 15 + snap := w.snapshot() 16 + if snap.Code != http.StatusOK { 17 + t.Errorf("Code=%d, want 200 (Write without WriteHeader implies 200)", snap.Code) 18 + } 19 + if got := snap.Body.String(); got != "hello" { 20 + t.Errorf("Body=%q, want %q", got, "hello") 21 + } 22 + } 23 + 24 + func TestInMemoryResponseWriter_WriteHeaderIsSticky(t *testing.T) { 25 + w := newInMemoryResponseWriter() 26 + w.WriteHeader(http.StatusBadRequest) 27 + w.WriteHeader(http.StatusInternalServerError) // ignored 28 + snap := w.snapshot() 29 + if snap.Code != http.StatusBadRequest { 30 + t.Errorf("Code=%d, want 400 (first WriteHeader wins per net/http)", snap.Code) 31 + } 32 + } 33 + 34 + func TestInMemoryResponseWriter_EmptyHandlerReturns200(t *testing.T) { 35 + w := newInMemoryResponseWriter() 36 + snap := w.snapshot() 37 + if snap.Code != http.StatusOK { 38 + t.Errorf("empty handler Code=%d, want 200", snap.Code) 39 + } 40 + if snap.Body.Len() != 0 { 41 + t.Errorf("empty handler body has %d bytes, want 0", snap.Body.Len()) 42 + } 43 + } 44 + 45 + func TestInMemoryResponseWriter_HeaderRoundTrip(t *testing.T) { 46 + w := newInMemoryResponseWriter() 47 + w.Header().Set("X-Test", "value") 48 + w.Header().Add("X-Multi", "a") 49 + w.Header().Add("X-Multi", "b") 50 + snap := w.snapshot() 51 + if snap.Header().Get("X-Test") != "value" { 52 + t.Errorf("X-Test header lost: %v", snap.Header()) 53 + } 54 + if got := snap.Header().Values("X-Multi"); !reflect.DeepEqual(got, []string{"a", "b"}) { 55 + t.Errorf("X-Multi=%v, want [a b]", got) 56 + } 57 + } 58 + 59 + // TestProductionCallChain_FreeOfHttptest pins the #222 fix at compile time: 60 + // adminProxyResponse must NOT be a *httptest.ResponseRecorder. If a future 61 + // change reintroduces test-only types in the production path this test 62 + // breaks before review. 63 + func TestProductionCallChain_FreeOfHttptest(t *testing.T) { 64 + resp := &adminProxyResponse{} 65 + typeName := reflect.TypeOf(resp).String() 66 + if strings.Contains(typeName, "httptest") { 67 + t.Errorf("adminProxyResponse type %q contains 'httptest' — production call chain regressed (#222)", typeName) 68 + } 69 + rw := newInMemoryResponseWriter() 70 + var _ http.ResponseWriter = rw // compile-time interface conformance 71 + rwType := reflect.TypeOf(rw).String() 72 + if strings.Contains(rwType, "httptest") { 73 + t.Errorf("inMemoryResponseWriter type %q contains 'httptest' — production call chain regressed (#222)", rwType) 74 + } 75 + }
+83
internal/admin/ui/recover.go
··· 224 224 mux.Handle("/account", wrap(h.handleLanding)) 225 225 mux.Handle("/account/start", wrap(h.handleStart)) 226 226 mux.Handle("/account/manage", wrap(h.handleManage)) 227 + mux.Handle("/account/deliverability", wrap(h.handleDeliverability)) 227 228 mux.Handle("/account/select-domain", wrap(h.handleSelectDomain)) 228 229 mux.Handle("/account/regenerate", wrap(h.handleRegenerate)) 229 230 mux.Handle("/account/contact-email", wrap(h.handleContactEmail)) ··· 484 485 ContactEmail: memberDomain.ContactEmail, 485 486 EmailVerified: memberDomain.EmailVerified, 486 487 ExpiresAt: ticket.expiry.Format(time.RFC3339), 488 + }).Render(r.Context(), w) 489 + } 490 + 491 + func (h *RecoverHandler) handleDeliverability(w http.ResponseWriter, r *http.Request) { 492 + if r.Method != http.MethodGet { 493 + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) 494 + return 495 + } 496 + id, ok := recoveryTicketFromCookie(r) 497 + if !ok { 498 + h.renderLandingErr(w, r, "Session expired or not found. Start over by signing in.") 499 + return 500 + } 501 + ticket, ok := h.lookupTicket(id, r.UserAgent()) 502 + if !ok { 503 + h.renderLandingErr(w, r, "Session expired or not found. Start over by signing in.") 504 + return 505 + } 506 + if ticket.domain == "" { 507 + http.Redirect(w, r, "/account/manage", http.StatusFound) 508 + return 509 + } 510 + 511 + ctx := r.Context() 512 + member, err := h.store.GetMember(ctx, ticket.did) 513 + if err != nil || member == nil { 514 + log.Printf("account.deliverability: did_hash=%s error=%v", HashForLog(ticket.did), err) 515 + http.Error(w, "internal error", http.StatusInternalServerError) 516 + return 517 + } 518 + 519 + since14d := time.Now().UTC().AddDate(0, 0, -14) 520 + total, bounced, err := h.store.GetMessageCounts(ctx, ticket.did, since14d) 521 + if err != nil { 522 + log.Printf("account.deliverability: GetMessageCounts did=%s error=%v", HashForLog(ticket.did), err) 523 + http.Error(w, "internal error", http.StatusInternalServerError) 524 + return 525 + } 526 + 527 + complaints, err := h.store.GetComplaintCount(ctx, ticket.did, since14d) 528 + if err != nil { 529 + log.Printf("account.deliverability: GetComplaintCount did=%s error=%v", HashForLog(ticket.did), err) 530 + http.Error(w, "internal error", http.StatusInternalServerError) 531 + return 532 + } 533 + 534 + daily, err := h.store.GetDailySendCounts(ctx, ticket.did, 14) 535 + if err != nil { 536 + log.Printf("account.deliverability: GetDailySendCounts did=%s error=%v", HashForLog(ticket.did), err) 537 + http.Error(w, "internal error", http.StatusInternalServerError) 538 + return 539 + } 540 + 541 + warmingTier := relay.MemberTier(relay.DefaultWarmingConfig(), member.CreatedAt, time.Now()) 542 + warmingLabel := "" 543 + switch warmingTier { 544 + case relay.TierWarming: 545 + warmingLabel = "Warming (0–7 days)" 546 + case relay.TierRamping: 547 + warmingLabel = "Ramping (7–14 days)" 548 + } 549 + 550 + bounceRate := 0.0 551 + if total > 0 { 552 + bounceRate = float64(bounced) / float64(total) 553 + } 554 + 555 + w.Header().Set("Content-Type", "text/html; charset=utf-8") 556 + _ = templates.DeliverabilityPage(templates.DeliverabilityData{ 557 + DID: ticket.did, 558 + Domain: ticket.domain, 559 + Status: member.Status, 560 + SuspendReason: member.SuspendReason, 561 + Sent14d: total, 562 + Bounced14d: bounced, 563 + Complaints14d: complaints, 564 + BounceRate: bounceRate, 565 + DailySends: daily, 566 + HourlyLimit: member.HourlyLimit, 567 + DailyLimit: member.DailyLimit, 568 + WarmingTier: 
warmingTier, 569 + WarmingLabel: warmingLabel, 487 570 }).Render(r.Context(), w) 488 571 } 489 572
+68
internal/admin/ui/recover_test.go
··· 5 5 import ( 6 6 "context" 7 7 "errors" 8 + "fmt" 8 9 "net/http" 9 10 "net/http/httptest" 10 11 "net/url" ··· 296 297 } 297 298 if strings.Contains(body, `action="/account/select-domain"`) { 298 299 t.Error("single-domain account should not show the domain picker") 300 + } 301 + } 302 + 303 + func TestRecover_DeliverabilityRendersForValidTicket(t *testing.T) { 304 + store := newRecoverTestStore(t) 305 + did := "did:plc:deliver2222222222222" 306 + domain := "deliver.example.com" 307 + seedRecoverMember(t, store, did, domain) 308 + 309 + ctx := context.Background() 310 + now := time.Now().UTC() 311 + for i := 0; i < 3; i++ { 312 + _, _ = store.InsertMessage(ctx, &relaystore.Message{ 313 + MemberDID: did, FromAddr: "x@deliver.example.com", ToAddr: "y@z.com", 314 + MessageID: fmt.Sprintf("<m%d>", i), Status: relaystore.MsgSent, CreatedAt: now, 315 + }) 316 + } 317 + 318 + h := NewRecoverHandler(&fakePublisher{}, store, "https://example.com", nil) 319 + target := h.IssueRecoveryTicket(did, domain) 320 + ticket := strings.TrimPrefix(target, "/account/manage?ticket=") 321 + 322 + mux := http.NewServeMux() 323 + h.RegisterRoutes(mux) 324 + 325 + req := httptest.NewRequest(http.MethodGet, "/account/deliverability", nil) 326 + req.AddCookie(&http.Cookie{Name: RecoveryCookieName, Value: ticket}) 327 + rec := httptest.NewRecorder() 328 + mux.ServeHTTP(rec, req) 329 + 330 + if rec.Code != http.StatusOK { 331 + t.Fatalf("status = %d, want 200", rec.Code) 332 + } 333 + body := rec.Body.String() 334 + if !strings.Contains(body, "Deliverability") { 335 + t.Error("deliverability page missing heading") 336 + } 337 + if !strings.Contains(body, domain) { 338 + t.Error("deliverability page missing domain") 339 + } 340 + if !strings.Contains(body, "Sent (14d)") { 341 + t.Error("deliverability page missing sent stat") 342 + } 343 + } 344 + 345 + func TestRecover_DeliverabilityRedirectsWithoutDomain(t *testing.T) { 346 + store := newRecoverTestStore(t) 347 + did := "did:plc:deliver3333333333333" 348 + seedRecoverMember(t, store, did, "deliver2.example.com") 349 + 350 + h := NewRecoverHandler(&fakePublisher{}, store, "https://example.com", nil) 351 + target := h.IssueRecoveryTicket(did, "") 352 + ticket := strings.TrimPrefix(target, "/account/manage?ticket=") 353 + 354 + mux := http.NewServeMux() 355 + h.RegisterRoutes(mux) 356 + 357 + req := httptest.NewRequest(http.MethodGet, "/account/deliverability", nil) 358 + req.AddCookie(&http.Cookie{Name: RecoveryCookieName, Value: ticket}) 359 + rec := httptest.NewRecorder() 360 + mux.ServeHTTP(rec, req) 361 + 362 + if rec.Code != http.StatusFound { 363 + t.Fatalf("status = %d, want 302", rec.Code) 364 + } 365 + if loc := rec.Header().Get("Location"); loc != "/account/manage" { 366 + t.Errorf("redirect = %q, want /account/manage", loc) 299 367 } 300 368 } 301 369
+44
internal/admin/ui/recover_uabind_contract_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package ui 4 + 5 + import ( 6 + "reflect" 7 + "testing" 8 + ) 9 + 10 + // TestRecoveryIssuer_RequiresUABinding pins the interface shape so a 11 + // future refactor can't accidentally re-introduce a no-UA entry point 12 + // on the contract that production callers use. The whole point of 13 + // #212 is that the OAuth callback MUST forward the User-Agent into 14 + // the ticket binding — an interface that exposes a no-UA method 15 + // would let that drift back over time. 16 + func TestRecoveryIssuer_RequiresUABinding(t *testing.T) { 17 + typ := reflect.TypeOf((*RecoveryIssuer)(nil)).Elem() 18 + if typ.NumMethod() != 1 { 19 + t.Fatalf("RecoveryIssuer should have exactly 1 method, got %d", typ.NumMethod()) 20 + } 21 + m := typ.Method(0) 22 + if m.Name != "IssueRecoveryTicketWithUA" { 23 + t.Errorf("RecoveryIssuer.method[0] = %q, want IssueRecoveryTicketWithUA — re-introducing a no-UA entry point regresses #212", 24 + m.Name) 25 + } 26 + // The signature is (did, domain, ua string) string — 3 string args 27 + // (plus the receiver), one string return. 28 + if m.Type.NumIn() != 3 { 29 + t.Errorf("IssueRecoveryTicketWithUA should accept 3 args (did, domain, ua), got %d", m.Type.NumIn()) 30 + } 31 + if m.Type.NumOut() != 1 { 32 + t.Errorf("IssueRecoveryTicketWithUA should return 1 value (URL), got %d", m.Type.NumOut()) 33 + } 34 + } 35 + 36 + // TestRecoverHandler_SatisfiesRecoveryIssuer is a compile-time guard: 37 + // if *RecoverHandler ever drops IssueRecoveryTicketWithUA, this test 38 + // won't link. The whole production path (cmd/relay calls 39 + // AttestHandler.SetRecoveryIssuer with a *RecoverHandler) depends on 40 + // this assignment compiling. 41 + func TestRecoverHandler_SatisfiesRecoveryIssuer(t *testing.T) { 42 + var _ RecoveryIssuer = (*RecoverHandler)(nil) 43 + t.Log("*RecoverHandler satisfies RecoveryIssuer; compile-time check OK") 44 + }
+127
internal/admin/ui/templates/deliverability.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package templates 4 + 5 + // DeliverabilityPage is the member-facing view of their own sending 6 + // reputation: bounces, complaints, warming tier, and daily volume trend. 7 + 8 + import ( 9 + "context" 10 + "fmt" 11 + "html" 12 + "io" 13 + "strings" 14 + 15 + "github.com/a-h/templ" 16 + ) 17 + 18 + // DeliverabilityData carries all metrics for the /account/deliverability page. 19 + type DeliverabilityData struct { 20 + DID string 21 + Domain string 22 + Status string 23 + SuspendReason string 24 + 25 + Sent14d int64 26 + Bounced14d int64 27 + Complaints14d int64 28 + BounceRate float64 // 0.0–1.0 29 + 30 + DailySends []int64 // 14 buckets, oldest-to-newest 31 + 32 + HourlyLimit int 33 + DailyLimit int 34 + 35 + WarmingTier string // "warming" | "ramping" | "warmed" | "" 36 + WarmingLabel string // human-readable, e.g. "warming (3/7 days)" 37 + 38 + Labels []string // Osprey + labeler labels 39 + } 40 + 41 + func DeliverabilityPage(d DeliverabilityData) templ.Component { 42 + return templ.ComponentFunc(func(ctx context.Context, w io.Writer) error { 43 + inner := templ.ComponentFunc(func(_ context.Context, w io.Writer) error { 44 + var b strings.Builder 45 + 46 + b.WriteString(`<nav class="topnav" aria-label="breadcrumb"><a href="/account" class="topnav-home">← Account</a></nav>`) 47 + b.WriteString(`<h1 class="masthead masthead-sub">Deliverability</h1>`) 48 + fmt.Fprintf(&b, `<p class="lede">Sending reputation for <code>%s</code>.</p>`, html.EscapeString(d.Domain)) 49 + 50 + // Status banner 51 + if d.Status == "suspended" { 52 + b.WriteString(`<div class="error-note" role="alert"><p style="margin: 0;"><strong>Account suspended.</strong>`) 53 + if d.SuspendReason != "" { 54 + fmt.Fprintf(&b, ` Reason: %s`, html.EscapeString(d.SuspendReason)) 55 + } 56 + b.WriteString(` SMTP submission is currently rejected. Contact the operator to appeal.</p></div>`) 57 + } 58 + 59 + b.WriteString(`<div class="stat-grid" style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 1rem; margin: 1.5rem 0;">`) 60 + b.WriteString(statCard("Sent (14d)", fmt.Sprintf("%d", d.Sent14d))) 61 + b.WriteString(statCard("Bounced", fmt.Sprintf("%d", d.Bounced14d))) 62 + b.WriteString(statCard("Complaints", fmt.Sprintf("%d", d.Complaints14d))) 63 + b.WriteString(statCard("Bounce rate", fmt.Sprintf("%.1f%%", d.BounceRate*100))) 64 + b.WriteString(`</div>`) 65 + 66 + // Sparkline 67 + if len(d.DailySends) > 0 { 68 + b.WriteString(`<section class="section">`) 69 + b.WriteString(`<h2>Sends per day</h2>`) 70 + b.WriteString(sparklineSVG(d.DailySends)) 71 + b.WriteString(`</section>`) 72 + } 73 + 74 + // Warming tier 75 + if d.WarmingLabel != "" { 76 + b.WriteString(`<section class="section">`) 77 + b.WriteString(`<h2>Warming progress</h2>`) 78 + fmt.Fprintf(&b, `<p class="section-lede">%s</p>`, html.EscapeString(d.WarmingLabel)) 79 + b.WriteString(warningNote(d.WarmingTier)) 80 + b.WriteString(`</section>`) 81 + } 82 + 83 + // Limits 84 + b.WriteString(`<section class="section">`) 85 + b.WriteString(`<h2>Current limits</h2>`) 86 + fmt.Fprintf(&b, `<dl class="bullets"><dt>Hourly limit</dt><dd>%d</dd><dt>Daily limit</dt><dd>%d</dd></dl>`, d.HourlyLimit, d.DailyLimit) 87 + b.WriteString(`</section>`) 88 + 89 + // Labels 90 + if len(d.Labels) > 0 { 91 + b.WriteString(`<section class="section">`) 92 + b.WriteString(`<h2>Reputation labels</h2>`) 93 + b.WriteString(`<p class="section-lede">Labels published by the atproto labeler. 
Other services can query these to decide whether to trust mail from your domain.</p>`) 94 + for _, l := range d.Labels { 95 + fmt.Fprintf(&b, `<span class="badge badge-label">%s</span> `, html.EscapeString(l)) 96 + } 97 + b.WriteString(`</section>`) 98 + } 99 + 100 + b.WriteString(`<section class="section">`) 101 + b.WriteString(`<p class="section-lede">These numbers update in real time. Bounce rate above 5% or complaint rate above 0.1% can trigger automatic throttling or suspension. The fix is always the same: send only to engaged recipients who asked for your mail.</p>`) 102 + b.WriteString(`</section>`) 103 + 104 + _, err := io.WriteString(w, b.String()) 105 + return err 106 + }) 107 + return publicLayout("Deliverability — "+d.Domain, false).Render(templ.WithChildren(ctx, inner), w) 108 + }) 109 + } 110 + 111 + func statCard(title, value string) string { 112 + return fmt.Sprintf(`<article style="background: var(--surface); border: 1px solid var(--line); padding: 1rem; border-radius: 2px;"> 113 + <div style="font-size: var(--t-xs); text-transform: uppercase; letter-spacing: 0.1em; color: var(--muted); margin-bottom: 0.5rem;">%s</div> 114 + <div style="font-size: var(--t-2xl); font-family: var(--font-display); color: var(--ink);">%s</div> 115 + </article>`, html.EscapeString(title), html.EscapeString(value)) 116 + } 117 + 118 + func warningNote(tier string) string { 119 + switch tier { 120 + case "warming": 121 + return `<p class="section-lede" style="color: var(--accent-ink);">Your domain is in the warming tier: 5 emails per hour, 20 per day. This protects the shared IP while Gmail learns your sending pattern. The cap lifts automatically after 7 days of clean sending.</p>` 122 + case "ramping": 123 + return `<p class="section-lede" style="color: var(--accent-ink);">Your domain is ramping: 20 emails per hour, 100 per day. Keep engagement high and complaints low. Full limits unlock after 14 days total.</p>` 124 + default: 125 + return "" 126 + }
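sparklineSVG is called by DeliverabilityPage but defined elsewhere in the package. Purely as an illustration of the kind of helper it could be, here is a sketch that scales the daily buckets onto a fixed-size inline SVG polyline; the name, dimensions, and markup are assumptions, not the repo's code.

// Illustrative only; the real sparklineSVG helper is not shown in this diff.
// Assumes fmt and strings from the file's imports.
func sparklineSVGSketch(values []int64) string {
	if len(values) < 2 {
		return ""
	}
	const w, h = 280.0, 40.0
	max := int64(1)
	for _, v := range values {
		if v > max {
			max = v
		}
	}
	var pts strings.Builder
	for i, v := range values {
		x := w * float64(i) / float64(len(values)-1)
		y := h - h*float64(v)/float64(max)
		fmt.Fprintf(&pts, "%.1f,%.1f ", x, y)
	}
	return fmt.Sprintf(
		`<svg viewBox="0 0 %.0f %.0f" width="%.0f" height="%.0f" role="img" aria-label="sends per day">`+
			`<polyline fill="none" stroke="currentColor" stroke-width="1.5" points="%s"/></svg>`,
		w, h, w, h, strings.TrimSpace(pts.String()))
}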
+7 -6
internal/admin/ui/templates/enroll.templ
··· 529 529 <footer> 530 530 Atmosphere Mail LLC · cooperative email infrastructure for atproto 531 531 <br/> 532 - <a href="/terms">Terms</a> · 533 - <a href="/privacy">Privacy</a> · 534 - <a href="/aup">Acceptable use</a> · 535 - <a href="/about">About</a> · 536 - <a href="https://status.atmos.email">Status</a> · 537 - <a href="https://tangled.org/scottlanoue.com/atmosphere-mail">Source</a> 532 + <a href="/terms">Terms</a> · 533 + <a href="/privacy">Privacy</a> · 534 + <a href="/aup">Acceptable use</a> · 535 + <a href="/faq">FAQ</a> · 536 + <a href="/about">About</a> · 537 + <a href="https://status.atmos.email">Status</a> · 538 + <a href="https://tangled.org/scottlanoue.com/atmosphere-mail">Source</a> 538 539 </footer> 539 540 </main> 540 541 </body>
+1 -1
internal/admin/ui/templates/enroll_templ.go
··· 98 98 if templ_7745c5c3_Err != nil { 99 99 return templ_7745c5c3_Err 100 100 } 101 - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "<footer>Atmosphere Mail LLC · cooperative email infrastructure for atproto<br><a href=\"/terms\">Terms</a> · <a href=\"/privacy\">Privacy</a> · <a href=\"/aup\">Acceptable use</a> · <a href=\"/about\">About</a> · <a href=\"https://status.atmos.email\">Status</a> · <a href=\"https://tangled.org/scottlanoue.com/atmosphere-mail\">Source</a></footer></main></body></html>") 101 + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "<footer>Atmosphere Mail LLC · cooperative email infrastructure for atproto<br><a href=\"/terms\">Terms</a> · <a href=\"/privacy\">Privacy</a> · <a href=\"/aup\">Acceptable use</a> · <a href=\"/faq\">FAQ</a> · <a href=\"/about\">About</a> · <a href=\"https://status.atmos.email\">Status</a> · <a href=\"https://tangled.org/scottlanoue.com/atmosphere-mail\">Source</a></footer></main></body></html>") 102 102 if templ_7745c5c3_Err != nil { 103 103 return templ_7745c5c3_Err 104 104 }
+73
internal/admin/ui/templates/faq.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package templates 4 + 5 + // FAQPage answers the questions prospective members ask before they enroll. 6 + // Honest, concise, and written to defuse the obvious objections. 7 + 8 + import ( 9 + "context" 10 + "io" 11 + "strings" 12 + 13 + "github.com/a-h/templ" 14 + ) 15 + 16 + func FAQPage() templ.Component { 17 + return templ.ComponentFunc(func(ctx context.Context, w io.Writer) error { 18 + inner := templ.ComponentFunc(func(_ context.Context, w io.Writer) error { 19 + var b strings.Builder 20 + 21 + b.WriteString(`<h1 class="masthead masthead-sub">FAQ</h1>`) 22 + b.WriteString(`<p class="lede">Questions we expect — answered honestly.</p>`) 23 + 24 + b.WriteString(`<section class="section">`) 25 + b.WriteString(`<span class="step-marker">Pricing</span>`) 26 + b.WriteString(`<h2>Why is this free? Will it stay free?</h2>`) 27 + b.WriteString(`<p class="section-lede">It's free now because the relay needs a diverse, honest sender base to build IP reputation before we can responsibly charge anyone. Once the pool is warm and the first billing system is wired, paid tiers will start at around $10–15 per month per PDS operator. There will always be a generous free tier for low-volume senders.</p>`) 28 + b.WriteString(`<p class="section-lede">If you enroll today, you are not signing up for a future invoice. We will announce pricing changes with at least 30 days' notice, and you can export your reputation or leave at any time.</p>`) 29 + b.WriteString(`</section>`) 30 + 31 + b.WriteString(`<section class="section">`) 32 + b.WriteString(`<span class="step-marker">Trust</span>`) 33 + b.WriteString(`<h2>How can I trust this?</h2>`) 34 + b.WriteString(`<p class="section-lede">You don't have to trust us blindly. The relay source code is open source (AGPL-3.0-or-later), the Osprey reputation rules are published, and the atproto labeler feed is public. You — or your favorite LLM — can audit exactly how deliverability decisions are made.</p>`) 35 + b.WriteString(`<p class="section-lede">On privacy: the relay sees message metadata (sender, recipient, timestamp, size) but never the raw message body. That is the same trust model as Postmark, Mailgun, or Amazon SES, except here the code is open and the operator is a small LLC instead of a public company.</p>`) 36 + b.WriteString(`</section>`) 37 + 38 + b.WriteString(`<section class="section">`) 39 + b.WriteString(`<span class="step-marker">Alternatives</span>`) 40 + b.WriteString(`<h2>Why not use a trusted commercial relay?</h2>`) 41 + b.WriteString(`<p class="section-lede">Commercial relays work well, but your domain reputation lives inside their business. If you switch providers, you start from zero. Atmosphere Mail is designed so your reputation stays with you: your DID, your domain, your attestation record. If you ever want to run your own relay, the code and the reputation layer come with you.</p>`) 42 + b.WriteString(`<p class="section-lede">The long-term goal is a federation of cooperative relays that share a reputation blocklist indexed through atproto. One relay is live today; the architecture is built for many.</p>`) 43 + b.WriteString(`</section>`) 44 + 45 + b.WriteString(`<section class="section">`) 46 + b.WriteString(`<span class="step-marker">Deliverability</span>`) 47 + b.WriteString(`<h2>Will my mail reach the inbox?</h2>`) 48 + b.WriteString(`<p class="section-lede">Maybe not on day one. Gmail treats mail from a new IP as suspicious regardless of authentication cleanliness. 
The relay protects the shared pool with warming tier caps: 5 emails per hour for the first week, graduating as your domain builds reputation. Expect some messages to land in spam initially. The fix is slow, engaged sending — not better DNS records.</p>`) 49 + b.WriteString(`<p class="section-lede">We run pool-level feedback loops with Gmail, Microsoft, and Yahoo so complaints route back to the offending member, not the whole cooperative. That is how shared reputation stays shared instead of collective punishment.</p>`) 50 + b.WriteString(`</section>`) 51 + 52 + b.WriteString(`<section class="section">`) 53 + b.WriteString(`<span class="step-marker">Portability</span>`) 54 + b.WriteString(`<h2>What if I want to leave?</h2>`) 55 + b.WriteString(`<p class="section-lede">Your domain reputation is yours. The DKIM keys are published in your DNS, the attestation record lives on your PDS, and the <code>verified-mail-operator</code> label is signed against your DID. If you graduate to self-hosted delivery, those signals travel with you. If you want your member record deleted, email <a href="mailto:postmaster@atmos.email">postmaster@atmos.email</a> and we will remove it within 14 days.</p>`) 56 + b.WriteString(`</section>`) 57 + 58 + b.WriteString(`<section class="section">`) 59 + b.WriteString(`<span class="step-marker">Scope</span>`) 60 + b.WriteString(`<h2>What can I send through this relay?</h2>`) 61 + b.WriteString(`<p class="section-lede">Transactional and operational mail from your own domain: verification codes, password resets, notifications, personal correspondence. Unsolicited bulk mail, scraped lists, and relaying for third parties will get you suspended quickly. See the <a href="/aup">Acceptable Use Policy</a> for the full list.</p>`) 62 + b.WriteString(`</section>`) 63 + 64 + b.WriteString(`<section class="section">`) 65 + b.WriteString(`<p class="section-lede">Still have questions? Reach the operator at <a href="https://bsky.app/profile/scottlanoue.com">@scottlanoue.com</a> or <a href="mailto:postmaster@atmos.email">postmaster@atmos.email</a>.</p>`) 66 + b.WriteString(`</section>`) 67 + 68 + _, err := io.WriteString(w, b.String()) 69 + return err 70 + }) 71 + return publicLayout("FAQ — Atmosphere Mail", false).Render(templ.WithChildren(ctx, inner), w) 72 + }) 73 + }
+12
internal/admin/ui/templates/member_detail.templ
··· 223 223 </button> 224 224 } 225 225 </div> 226 + <details> 227 + <summary role="button" class="outline contrast" style="margin-top: 1rem; font-size: 0.85rem;">Permanently Delete Member</summary> 228 + <p style="margin: 0.5rem 0;"><small>This permanently removes the member, all domains, DKIM keys, message history, and rate counters. Suppressions are preserved for compliance. This cannot be undone.</small></p> 229 + <button 230 + class="secondary" 231 + style="background: var(--pico-del-color, #c62828); border-color: var(--pico-del-color, #c62828);" 232 + hx-delete={ "/ui/member/" + m.DID + "/delete" } 233 + hx-confirm={ "PERMANENTLY DELETE " + m.Domain + " (" + m.DID + ")? This cannot be undone." } 234 + > 235 + Delete { m.Domain } Forever 236 + </button> 237 + </details> 226 238 }
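The details block above only issues the request from the browser; the admin API still needs a matching DELETE route. A minimal sketch of what that handler could look like using Go 1.22 net/http pattern routing — the registration helper, the memberDeleter interface, and the DeleteMember method name are assumptions for illustration, not the project's actual wiring:

package adminui

import (
	"context"
	"net/http"
)

// memberDeleter is the narrow capability the delete route needs.
// The real store type and method name are assumed for this sketch.
type memberDeleter interface {
	DeleteMember(ctx context.Context, did string) error
}

// registerDeleteRoute wires a handler matching the hx-delete URL used
// by the button above (Go 1.22+ method-and-wildcard patterns).
func registerDeleteRoute(mux *http.ServeMux, store memberDeleter) {
	mux.HandleFunc("DELETE /ui/member/{did}/delete", func(w http.ResponseWriter, r *http.Request) {
		if err := store.DeleteMember(r.Context(), r.PathValue("did")); err != nil {
			http.Error(w, "delete failed: "+err.Error(), http.StatusInternalServerError)
			return
		}
		// htmx follows HX-Redirect instead of swapping the fragment,
		// since the member page no longer exists after a hard delete.
		w.Header().Set("HX-Redirect", "/ui/members")
	})
}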
+70 -33
internal/admin/ui/templates/member_detail_templ.go
··· 1 - // SPDX-License-Identifier: AGPL-3.0-or-later 2 - 3 1 // Code generated by templ - DO NOT EDIT. 4 2 5 3 // templ: version: v0.3.1001 ··· 87 85 var templ_7745c5c3_Var3 string 88 86 templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(m.Domain) 89 87 if templ_7745c5c3_Err != nil { 90 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 44, Col: 18} 88 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 45, Col: 18} 91 89 } 92 90 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) 93 91 if templ_7745c5c3_Err != nil { ··· 139 137 var templ_7745c5c3_Var5 string 140 138 templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(m.Domain) 141 139 if templ_7745c5c3_Err != nil { 142 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 54, Col: 17} 140 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 55, Col: 17} 143 141 } 144 142 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) 145 143 if templ_7745c5c3_Err != nil { ··· 152 150 var templ_7745c5c3_Var6 string 153 151 templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(m.DID) 154 152 if templ_7745c5c3_Err != nil { 155 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 55, Col: 19} 153 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 56, Col: 19} 156 154 } 157 155 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) 158 156 if templ_7745c5c3_Err != nil { ··· 173 171 var templ_7745c5c3_Var7 string 174 172 templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", m.SendCount)) 175 173 if templ_7745c5c3_Err != nil { 176 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 68, Col: 41} 174 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 69, Col: 41} 177 175 } 178 176 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) 179 177 if templ_7745c5c3_Err != nil { ··· 186 184 var templ_7745c5c3_Var8 string 187 185 templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", m.HourlyLimit)) 188 186 if templ_7745c5c3_Err != nil { 189 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 70, Col: 43} 187 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 71, Col: 43} 190 188 } 191 189 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) 192 190 if templ_7745c5c3_Err != nil { ··· 199 197 var templ_7745c5c3_Var9 string 200 198 templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", m.DailyLimit)) 201 199 if templ_7745c5c3_Err != nil { 202 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 72, Col: 42} 200 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 73, Col: 42} 203 201 } 204 202 _, templ_7745c5c3_Err = 
templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) 205 203 if templ_7745c5c3_Err != nil { ··· 212 210 var templ_7745c5c3_Var10 string 213 211 templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(m.CreatedAt) 214 212 if templ_7745c5c3_Err != nil { 215 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 79, Col: 22} 213 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 80, Col: 22} 216 214 } 217 215 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) 218 216 if templ_7745c5c3_Err != nil { ··· 230 228 var templ_7745c5c3_Var11 string 231 229 templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("Domains (%d)", len(m.AllDomains))) 232 230 if templ_7745c5c3_Err != nil { 233 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 85, Col: 60} 231 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 86, Col: 60} 234 232 } 235 233 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11)) 236 234 if templ_7745c5c3_Err != nil { ··· 248 246 var templ_7745c5c3_Var12 string 249 247 templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(d) 250 248 if templ_7745c5c3_Err != nil { 251 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 89, Col: 16} 249 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 90, Col: 16} 252 250 } 253 251 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12)) 254 252 if templ_7745c5c3_Err != nil { ··· 266 264 var templ_7745c5c3_Var13 string 267 265 templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinStringErrs(" · ") 268 266 if templ_7745c5c3_Err != nil { 269 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 91, Col: 23} 267 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 92, Col: 23} 270 268 } 271 269 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13)) 272 270 if templ_7745c5c3_Err != nil { ··· 279 277 var templ_7745c5c3_Var14 string 280 278 templ_7745c5c3_Var14, templ_7745c5c3_Err = templ.JoinStringErrs(m.AllContactEmails[i]) 281 279 if templ_7745c5c3_Err != nil { 282 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 91, Col: 55} 280 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 92, Col: 55} 283 281 } 284 282 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var14)) 285 283 if templ_7745c5c3_Err != nil { 286 284 return templ_7745c5c3_Err 287 285 } 288 - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "</code></small>") 286 + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "</code></small> ") 289 287 if templ_7745c5c3_Err != nil { 290 288 return templ_7745c5c3_Err 291 289 } ··· 310 308 var templ_7745c5c3_Var15 string 311 309 templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinStringErrs(" · no contact email") 312 310 if 
templ_7745c5c3_Err != nil { 313 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 93, Col: 67} 311 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 101, Col: 67} 314 312 } 315 313 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15)) 316 314 if templ_7745c5c3_Err != nil { ··· 357 355 var templ_7745c5c3_Var16 string 358 356 templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinStringErrs(" ") 359 357 if templ_7745c5c3_Err != nil { 360 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 108, Col: 11} 358 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 116, Col: 11} 361 359 } 362 360 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var16)) 363 361 if templ_7745c5c3_Err != nil { ··· 418 416 var templ_7745c5c3_Var18 string 419 417 templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", m.WarmupSeeds)) 420 418 if templ_7745c5c3_Err != nil { 421 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 122, Col: 56} 419 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 130, Col: 56} 422 420 } 423 421 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var18)) 424 422 if templ_7745c5c3_Err != nil { ··· 431 429 var templ_7745c5c3_Var19 string 432 430 templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinStringErrs(m.Domain) 433 431 if templ_7745c5c3_Err != nil { 434 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 122, Col: 94} 432 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 130, Col: 94} 435 433 } 436 434 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19)) 437 435 if templ_7745c5c3_Err != nil { ··· 444 442 var templ_7745c5c3_Var20 string 445 443 templ_7745c5c3_Var20, templ_7745c5c3_Err = templ.JoinStringErrs("/ui/member/" + m.DID + "/warmup") 446 444 if templ_7745c5c3_Err != nil { 447 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 124, Col: 47} 445 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 132, Col: 47} 448 446 } 449 447 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var20)) 450 448 if templ_7745c5c3_Err != nil { ··· 457 455 var templ_7745c5c3_Var21 string 458 456 templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("Send %d warmup emails from %s?", m.WarmupSeeds, m.Domain)) 459 457 if templ_7745c5c3_Err != nil { 460 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 127, Col: 87} 458 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 135, Col: 87} 461 459 } 462 460 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21)) 463 461 if templ_7745c5c3_Err != nil { ··· 500 498 var templ_7745c5c3_Var23 string 501 499 templ_7745c5c3_Var23, 
templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", sent)) 502 500 if templ_7745c5c3_Err != nil { 503 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 138, Col: 35} 501 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 146, Col: 35} 504 502 } 505 503 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23)) 506 504 if templ_7745c5c3_Err != nil { ··· 513 511 var templ_7745c5c3_Var24 string 514 512 templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", failed)) 515 513 if templ_7745c5c3_Err != nil { 516 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 139, Col: 37} 514 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 147, Col: 37} 517 515 } 518 516 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24)) 519 517 if templ_7745c5c3_Err != nil { ··· 536 534 var templ_7745c5c3_Var25 string 537 535 templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(e) 538 536 if templ_7745c5c3_Err != nil { 539 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 144, Col: 18} 537 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 152, Col: 18} 540 538 } 541 539 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25)) 542 540 if templ_7745c5c3_Err != nil { ··· 602 600 var templ_7745c5c3_Var27 string 603 601 templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(e.ReviewedAt) 604 602 if templ_7745c5c3_Err != nil { 605 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 162, Col: 28} 603 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 170, Col: 28} 606 604 } 607 605 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27)) 608 606 if templ_7745c5c3_Err != nil { ··· 615 613 var templ_7745c5c3_Var28 string 616 614 templ_7745c5c3_Var28, templ_7745c5c3_Err = templ.JoinStringErrs(" — ") 617 615 if templ_7745c5c3_Err != nil { 618 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 163, Col: 15} 616 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 171, Col: 15} 619 617 } 620 618 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var28)) 621 619 if templ_7745c5c3_Err != nil { ··· 629 627 var templ_7745c5c3_Var29 string 630 628 templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(" by ") 631 629 if templ_7745c5c3_Err != nil { 632 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 166, Col: 15} 630 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 174, Col: 15} 633 631 } 634 632 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29)) 635 633 if templ_7745c5c3_Err != nil { ··· 642 640 var templ_7745c5c3_Var30 string 643 641 templ_7745c5c3_Var30, templ_7745c5c3_Err = 
templ.JoinStringErrs(e.Actor) 644 642 if templ_7745c5c3_Err != nil { 645 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 167, Col: 22} 643 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 175, Col: 22} 646 644 } 647 645 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30)) 648 646 if templ_7745c5c3_Err != nil { ··· 661 659 var templ_7745c5c3_Var31 string 662 660 templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(e.Note) 663 661 if templ_7745c5c3_Err != nil { 664 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 170, Col: 27} 662 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 178, Col: 27} 665 663 } 666 664 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31)) 667 665 if templ_7745c5c3_Err != nil { ··· 729 727 var templ_7745c5c3_Var33 string 730 728 templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(action) 731 729 if templ_7745c5c3_Err != nil { 732 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 185, Col: 30} 730 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 193, Col: 30} 733 731 } 734 732 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33)) 735 733 if templ_7745c5c3_Err != nil { ··· 785 783 var templ_7745c5c3_Var35 string 786 784 templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(m.SuspendReason) 787 785 if templ_7745c5c3_Err != nil { 788 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 194, Col: 37} 786 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 202, Col: 37} 789 787 } 790 788 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35)) 791 789 if templ_7745c5c3_Err != nil { ··· 808 806 var templ_7745c5c3_Var36 string 809 807 templ_7745c5c3_Var36, templ_7745c5c3_Err = templ.JoinStringErrs("/ui/member/" + m.DID + "/suspend") 810 808 if templ_7745c5c3_Err != nil { 811 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 200, Col: 48} 809 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 208, Col: 48} 812 810 } 813 811 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var36)) 814 812 if templ_7745c5c3_Err != nil { ··· 826 824 var templ_7745c5c3_Var37 string 827 825 templ_7745c5c3_Var37, templ_7745c5c3_Err = templ.JoinStringErrs("/ui/member/" + m.DID + "/reactivate") 828 826 if templ_7745c5c3_Err != nil { 829 - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 209, Col: 51} 827 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 217, Col: 51} 830 828 } 831 829 _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var37)) 832 830 if templ_7745c5c3_Err != nil { ··· 837 835 return templ_7745c5c3_Err 838 836 } 839 837 } 840 - templ_7745c5c3_Err = 
templruntime.WriteString(templ_7745c5c3_Buffer, 68, "</div>") 838 + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 68, "</div><details><summary role=\"button\" class=\"outline contrast\" style=\"margin-top: 1rem; font-size: 0.85rem;\">Permanently Delete Member</summary><p style=\"margin: 0.5rem 0;\"><small>This permanently removes the member, all domains, DKIM keys, message history, and rate counters. Suppressions are preserved for compliance. This cannot be undone.</small></p><button class=\"secondary\" style=\"background: var(--pico-del-color, #c62828); border-color: var(--pico-del-color, #c62828);\" hx-delete=\"") 839 + if templ_7745c5c3_Err != nil { 840 + return templ_7745c5c3_Err 841 + } 842 + var templ_7745c5c3_Var38 string 843 + templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs("/ui/member/" + m.DID + "/delete") 844 + if templ_7745c5c3_Err != nil { 845 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 232, Col: 48} 846 + } 847 + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38)) 848 + if templ_7745c5c3_Err != nil { 849 + return templ_7745c5c3_Err 850 + } 851 + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 69, "\" hx-confirm=\"") 852 + if templ_7745c5c3_Err != nil { 853 + return templ_7745c5c3_Err 854 + } 855 + var templ_7745c5c3_Var39 string 856 + templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs("PERMANENTLY DELETE " + m.Domain + " (" + m.DID + ")? This cannot be undone.") 857 + if templ_7745c5c3_Err != nil { 858 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 233, Col: 93} 859 + } 860 + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39)) 861 + if templ_7745c5c3_Err != nil { 862 + return templ_7745c5c3_Err 863 + } 864 + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 70, "\">Delete ") 865 + if templ_7745c5c3_Err != nil { 866 + return templ_7745c5c3_Err 867 + } 868 + var templ_7745c5c3_Var40 string 869 + templ_7745c5c3_Var40, templ_7745c5c3_Err = templ.JoinStringErrs(m.Domain) 870 + if templ_7745c5c3_Err != nil { 871 + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/admin/ui/templates/member_detail.templ`, Line: 235, Col: 20} 872 + } 873 + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var40)) 874 + if templ_7745c5c3_Err != nil { 875 + return templ_7745c5c3_Err 876 + } 877 + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 71, " Forever</button></details>") 841 878 if templ_7745c5c3_Err != nil { 842 879 return templ_7745c5c3_Err 843 880 }
+7
internal/admin/ui/templates/recover.go
··· 372 372 html.EscapeString(d.DKIMSelector), html.EscapeString(d.Domain)) 373 373 b.WriteString(`</section>`) 374 374 375 + // Deliverability dashboard 376 + b.WriteString(`<section class="section">`) 377 + b.WriteString(`<h2>Deliverability</h2>`) 378 + b.WriteString(`<p class="section-lede">View your sending reputation: bounce rate, complaints, daily volume, and warming progress.</p>`) 379 + b.WriteString(`<a href="/account/deliverability" class="btn">View deliverability →</a>`) 380 + b.WriteString(`</section>`) 381 + 375 382 // API key rotation 376 383 b.WriteString(`<section class="section">`) 377 384 b.WriteString(`<h2>API key</h2>`)
+4 -4
internal/config/config.go
··· 41 41 func Load(path string) (*Config, error) { 42 42 data, err := os.ReadFile(path) 43 43 if err != nil { 44 - return nil, fmt.Errorf("cannot read config %s: %v", path, err) 44 + return nil, fmt.Errorf("cannot read config %s: %w", path, err) 45 45 } 46 46 47 47 data, err = hujson.Standardize(data) 48 48 if err != nil { 49 - return nil, fmt.Errorf("cannot parse config %s: %v", path, err) 49 + return nil, fmt.Errorf("cannot parse config %s: %w", path, err) 50 50 } 51 51 52 52 var raw configJSON 53 53 if err := json.Unmarshal(data, &raw); err != nil { 54 - return nil, fmt.Errorf("cannot unmarshal config %s: %v", path, err) 54 + return nil, fmt.Errorf("cannot unmarshal config %s: %w", path, err) 55 55 } 56 56 57 57 cfg := &Config{ ··· 77 77 if raw.ReverifyInterval != "" { 78 78 d, err := time.ParseDuration(raw.ReverifyInterval) 79 79 if err != nil { 80 - return nil, fmt.Errorf("invalid reverifyInterval %q: %v", raw.ReverifyInterval, err) 80 + return nil, fmt.Errorf("invalid reverifyInterval %q: %w", raw.ReverifyInterval, err) 81 81 } 82 82 cfg.ReverifyInterval = d 83 83 }
+2 -2
internal/jetstream/consumer.go
··· 103 103 } 104 104 conn, _, err := dialer.DialContext(ctx, url, nil) 105 105 if err != nil { 106 - return fmt.Errorf("dial: %v", err) 106 + return fmt.Errorf("dial: %w", err) 107 107 } 108 108 defer conn.Close() 109 109 ··· 121 121 conn.SetReadDeadline(time.Now().Add(30 * time.Second)) 122 122 _, msg, err := conn.ReadMessage() 123 123 if err != nil { 124 - return fmt.Errorf("read: %v", err) 124 + return fmt.Errorf("read: %w", err) 125 125 } 126 126 127 127 var ev Event
+10 -10
internal/label/signer.go
··· 57 57 func GenerateKey(path string) error { 58 58 key, err := secec.GenerateKey() 59 59 if err != nil { 60 - return fmt.Errorf("generate key: %v", err) 60 + return fmt.Errorf("generate key: %w", err) 61 61 } 62 62 encoded := hex.EncodeToString(key.Bytes()) 63 63 return os.WriteFile(path, []byte(encoded+"\n"), 0600) ··· 68 68 func NewSigner(keyPath string) (*Signer, error) { 69 69 info, err := os.Stat(keyPath) 70 70 if err != nil { 71 - return nil, fmt.Errorf("stat key: %v", err) 71 + return nil, fmt.Errorf("stat key: %w", err) 72 72 } 73 73 if mode := info.Mode().Perm(); mode&0077 != 0 { 74 74 return nil, fmt.Errorf("signing key %s has unsafe permissions %o (want 0600, got %o)", keyPath, mode, mode) ··· 76 76 77 77 data, err := os.ReadFile(keyPath) 78 78 if err != nil { 79 - return nil, fmt.Errorf("read key: %v", err) 79 + return nil, fmt.Errorf("read key: %w", err) 80 80 } 81 81 keyBytes, err := hex.DecodeString(strings.TrimSpace(string(data))) 82 82 if err != nil { 83 - return nil, fmt.Errorf("decode key hex: %v", err) 83 + return nil, fmt.Errorf("decode key hex: %w", err) 84 84 } 85 85 key, err := secec.NewPrivateKey(keyBytes) 86 86 if err != nil { 87 - return nil, fmt.Errorf("parse key: %v", err) 87 + return nil, fmt.Errorf("parse key: %w", err) 88 88 } 89 89 90 90 // did:key from compressed public key with secp256k1-pub multicodec prefix (0xe7, 0x01) ··· 94 94 95 95 enc, err := cbor.CoreDetEncOptions().EncMode() 96 96 if err != nil { 97 - return nil, fmt.Errorf("cbor enc mode: %v", err) 97 + return nil, fmt.Errorf("cbor enc mode: %w", err) 98 98 } 99 99 100 100 return &Signer{key: key, did: did, enc: enc}, nil ··· 122 122 123 123 raw, err := s.enc.Marshal(ul) 124 124 if err != nil { 125 - return nil, fmt.Errorf("cbor encode: %v", err) 125 + return nil, fmt.Errorf("cbor encode: %w", err) 126 126 } 127 127 128 128 hash := sha256.Sum256(raw) 129 129 derSig, err := s.key.Sign(nil, hash[:], crypto.SHA256) 130 130 if err != nil { 131 - return nil, fmt.Errorf("sign: %v", err) 131 + return nil, fmt.Errorf("sign: %w", err) 132 132 } 133 133 134 134 compact, err := derToCompactLowS(derSig) 135 135 if err != nil { 136 - return nil, fmt.Errorf("compact sig: %v", err) 136 + return nil, fmt.Errorf("compact sig: %w", err) 137 137 } 138 138 139 139 return &SignedLabel{ ··· 153 153 func derToCompactLowS(der []byte) ([]byte, error) { 154 154 var sig struct{ R, S *big.Int } 155 155 if _, err := asn1.Unmarshal(der, &sig); err != nil { 156 - return nil, fmt.Errorf("unmarshal DER: %v", err) 156 + return nil, fmt.Errorf("unmarshal DER: %w", err) 157 157 } 158 158 159 159 // Low-S normalization: if S > N/2, replace with N - S
+10
internal/notify/webhook.go
··· 65 65 // visibility but no action is required when the member is already 66 66 // approved. 67 67 KindMemberDomainAdded EventKind = "member_domain_added" 68 + 69 + // KindBypassAdded fires when an admin adds a label-bypass entry for 70 + // a DID. High signal: bypass disables T&S enforcement, so operators 71 + // must see every add land in their notification stream (#213). 72 + KindBypassAdded EventKind = "bypass_added" 73 + 74 + // KindBypassRemoved fires when an admin or the expiry janitor 75 + // removes a bypass entry. Distinguish manual vs janitor via the 76 + // Reason field on the event. 77 + KindBypassRemoved EventKind = "bypass_removed" 68 78 ) 69 79 70 80 // Event is the payload shape every webhook call carries. Fields are
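For operators consuming the webhook stream, the new bypass kinds are the ones worth alerting on. A hedged sketch of a receiving endpoint — the JSON field names (kind, did, reason) are assumptions; only the Reason field is confirmed by the comments above, so check the Event struct for the real wire shape:

package main

import (
	"encoding/json"
	"log"
	"net/http"
)

// bypassEvent models only the fields this sketch needs; names are
// illustrative, not the notify.Event wire format.
type bypassEvent struct {
	Kind   string `json:"kind"`
	DID    string `json:"did"`
	Reason string `json:"reason"`
}

// handleNotifyWebhook treats bypass changes as high signal: every
// bypass_added should alert, removals are informational.
func handleNotifyWebhook(w http.ResponseWriter, r *http.Request) {
	var ev bypassEvent
	if err := json.NewDecoder(r.Body).Decode(&ev); err != nil {
		http.Error(w, "bad payload", http.StatusBadRequest)
		return
	}
	switch ev.Kind {
	case "bypass_added":
		log.Printf("ALERT: label bypass added for %s", ev.DID)
	case "bypass_removed":
		log.Printf("info: bypass removed for %s (reason=%s)", ev.DID, ev.Reason)
	}
	w.WriteHeader(http.StatusNoContent)
}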
+105 -5
internal/osprey/emitter.go
··· 26 26 type EmitterMetrics interface { 27 27 IncEmitted(eventType string) 28 28 IncFailed(eventType string) 29 + // IncSpooled fires when an event lands on disk because the 30 + // broker rejected/silently dropped it (#214 DLQ). 31 + IncSpooled(eventType string) 32 + // IncReplayed fires when a previously-spooled event finally 33 + // makes it to the broker on a subsequent retry. 34 + IncReplayed(eventType string) 35 + // IncDropped fires when the spool overflows and an event is 36 + // permanently lost. reason names the trigger ("overflow", 37 + // "corrupt"). 38 + IncDropped(reason string) 39 + // SetSpoolDepth republishes the current spool size as a gauge. 40 + SetSpoolDepth(n int) 29 41 } 30 42 31 43 // Emitter sends relay events to Osprey via Kafka. ··· 35 47 counter atomic.Int64 36 48 enabled bool 37 49 metrics EmitterMetrics 50 + // spool persists events that didn't reach the broker on first 51 + // attempt. The replayer drains it back when Kafka is healthy 52 + // again. nil = no spool wired (legacy fire-and-forget). 53 + spool *EventSpool 38 54 } 39 55 40 56 // NewEmitter creates an emitter that writes to the given Kafka broker. ··· 65 81 // 66 82 // Per-event-type attribution relies on the "event_type" header that Emit 67 83 // attaches to every produced message. 84 + // 85 + // On batch failure with a spool wired, every message in the batch is 86 + // re-spooled so a downstream replayer can retry. Without the spool, the 87 + // failure is logged + counted only — legacy behavior. 68 88 func (e *Emitter) handleCompletion(messages []kafka.Message, err error) { 69 89 if err != nil { 70 90 log.Printf("osprey.kafka_batch_error: messages=%d error=%v", len(messages), err) 71 91 } 72 - if e.metrics == nil { 73 - return 74 - } 75 92 for _, m := range messages { 76 93 et := eventTypeFromHeaders(m.Headers) 77 94 if err != nil { 78 - e.metrics.IncFailed(et) 79 - } else { 95 + if e.metrics != nil { 96 + e.metrics.IncFailed(et) 97 + } 98 + e.spoolEvent(et, string(m.Key), m.Value) 99 + } else if e.metrics != nil { 80 100 e.metrics.IncEmitted(et) 81 101 } 82 102 } ··· 109 129 e.metrics = m 110 130 } 111 131 132 + // SetSpool wires an on-disk dead-letter queue. When set, events that 133 + // fail to write or that the broker rejects asynchronously are landed 134 + // to the spool instead of being silently dropped. Call ReplaySpool 135 + // periodically (cmd/relay drives this from a GoSafe goroutine) to 136 + // drain the queue back to the broker after recovery. Closes #214. 137 + func (e *Emitter) SetSpool(s *EventSpool) { 138 + e.spool = s 139 + if s != nil && e.metrics != nil { 140 + s.SetDropper(spoolDropperBridge{e.metrics}) 141 + } 142 + } 143 + 144 + // spoolDropperBridge adapts EmitterMetrics.IncDropped to the 145 + // SpoolDropper interface so the spool itself doesn't need to import 146 + // the emitter's metrics shape. 147 + type spoolDropperBridge struct{ m EmitterMetrics } 148 + 149 + func (s spoolDropperBridge) IncDropped(reason string) { s.m.IncDropped(reason) } 150 + 151 + // spoolEvent persists a (key, payload) pair for later replay, 152 + // recording the spool/drop metrics as appropriate. Caller MUST have 153 + // already failed a real send attempt; spoolEvent is a recovery path, 154 + // not a primary write. 
155 + func (e *Emitter) spoolEvent(eventType, key string, payload []byte) { 156 + if e.spool == nil { 157 + return 158 + } 159 + if err := e.spool.Write(eventType, key, payload); err != nil { 160 + log.Printf("osprey.spool.write_error: event_type=%s error=%v", eventType, err) 161 + if e.metrics != nil { 162 + e.metrics.IncDropped("spool_write_error") 163 + } 164 + return 165 + } 166 + if e.metrics != nil { 167 + e.metrics.IncSpooled(eventType) 168 + } 169 + } 170 + 171 + // ReplaySpool drains spooled events back to the broker. Call from a 172 + // periodic loop in cmd/relay. Returns (replayed, failed) counts so 173 + // the caller can log a single summary line per pass. Errors from the 174 + // underlying directory listing surface as the third return. 175 + // 176 + // Per-event replay uses the same writer.WriteMessages path as live 177 + // Emit; failures land back on the spool (the entry was never deleted) 178 + // for the next pass. So a sustained Kafka outage manifests as a 179 + // growing spool depth (visible via the gauge) without permanent 180 + // loss until the cap is hit. 181 + func (e *Emitter) ReplaySpool(ctx context.Context) (int, int, error) { 182 + if e.spool == nil || !e.enabled { 183 + return 0, 0, nil 184 + } 185 + replayed, failed, err := e.spool.Walk(func(se SpooledEvent) error { 186 + writeErr := e.writer.WriteMessages(ctx, kafka.Message{ 187 + Key: []byte(se.Key), 188 + Value: se.Payload, 189 + Headers: []kafka.Header{ 190 + {Key: "event_type", Value: []byte(se.EventType)}, 191 + }, 192 + }) 193 + if writeErr != nil { 194 + return writeErr 195 + } 196 + if e.metrics != nil { 197 + e.metrics.IncReplayed(se.EventType) 198 + } 199 + return nil 200 + }) 201 + if e.metrics != nil { 202 + e.metrics.SetSpoolDepth(e.spool.Depth()) 203 + } 204 + return replayed, failed, err 205 + } 206 + 112 207 // Emit sends an event to Osprey. It is non-blocking (async writes) 113 208 // and never returns an error to avoid impacting relay operations. 114 209 func (e *Emitter) Emit(ctx context.Context, data EventData) { ··· 150 245 if e.metrics != nil { 151 246 e.metrics.IncFailed(data.EventType) 152 247 } 248 + // Sync-error spool: same failure mode as the async batch 249 + // case in handleCompletion. Without this branch the buffer- 250 + // full / shutdown class of failures is silently lost even 251 + // when the spool is wired (#214). 252 + e.spoolEvent(data.EventType, data.SenderDID, payload) 153 253 } 154 254 // Happy-path IncEmitted is intentionally NOT here — it fires in 155 255 // handleCompletion once the broker actually confirms the batch. Doing
+41 -4
internal/osprey/emitter_integration_test.go
··· 276 276 // queue-accepted write, so the actual broker success/failure can only be 277 277 // observed from the Completion callback. These tests pin that contract. 278 278 279 - // fakeMetrics records IncEmitted/IncFailed calls for assertion. 279 + // fakeMetrics records EmitterMetrics calls for assertion. 280 280 type fakeMetrics struct { 281 - mu sync.Mutex 282 - emitted map[string]int 283 - failed map[string]int 281 + mu sync.Mutex 282 + emitted map[string]int 283 + failed map[string]int 284 + spooled map[string]int 285 + replayed map[string]int 286 + dropped map[string]int 287 + spoolDepth int 284 288 } 285 289 286 290 func (f *fakeMetrics) IncEmitted(t string) { ··· 299 303 f.failed = map[string]int{} 300 304 } 301 305 f.failed[t]++ 306 + } 307 + 308 + func (f *fakeMetrics) IncSpooled(t string) { 309 + f.mu.Lock() 310 + defer f.mu.Unlock() 311 + if f.spooled == nil { 312 + f.spooled = map[string]int{} 313 + } 314 + f.spooled[t]++ 315 + } 316 + 317 + func (f *fakeMetrics) IncReplayed(t string) { 318 + f.mu.Lock() 319 + defer f.mu.Unlock() 320 + if f.replayed == nil { 321 + f.replayed = map[string]int{} 322 + } 323 + f.replayed[t]++ 324 + } 325 + 326 + func (f *fakeMetrics) IncDropped(reason string) { 327 + f.mu.Lock() 328 + defer f.mu.Unlock() 329 + if f.dropped == nil { 330 + f.dropped = map[string]int{} 331 + } 332 + f.dropped[reason]++ 333 + } 334 + 335 + func (f *fakeMetrics) SetSpoolDepth(n int) { 336 + f.mu.Lock() 337 + f.spoolDepth = n 338 + f.mu.Unlock() 302 339 } 303 340 304 341 // TestEmitAttachesEventTypeHeader: the Completion callback runs long after
+240
internal/osprey/spool.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package osprey 4 + 5 + import ( 6 + "crypto/rand" 7 + "encoding/hex" 8 + "encoding/json" 9 + "fmt" 10 + "log" 11 + "os" 12 + "path/filepath" 13 + "sort" 14 + "strings" 15 + "sync" 16 + "time" 17 + ) 18 + 19 + // EventSpool persists Osprey events to disk when the Kafka emitter 20 + // can't deliver them. A background replayer drains the spool back to 21 + // the broker after reconnect. 22 + // 23 + // Without this, an atmos-ops outage silently loses every relay event 24 + // fired during the window — labels stop propagating, trust scoring 25 + // freezes on stale data, and there is no signal an operator can see 26 + // after-the-fact that says "we lost N events between 03:14 and 04:02." 27 + // Closes #214. 28 + // 29 + // On-disk format: each event is one JSON object per file, named 30 + // {unix-nanos}-{8-hex-rand}.json, stored under dir. Filenames sort 31 + // chronologically so Walk can replay in arrival order. 32 + // 33 + // Bounded: when len(spool) >= maxEntries, the oldest file is dropped 34 + // to make room. Operators MUST graph the drop counter — a non-zero 35 + // drop rate means events were permanently lost. 36 + type EventSpool struct { 37 + dir string 38 + maxEntries int 39 + mu sync.Mutex 40 + dropMetrics interface{ IncDropped(reason string) } 41 + } 42 + 43 + // SpooledEvent is the persisted on-disk representation. Includes the 44 + // event_type so the Kafka header can be reconstructed at replay 45 + // without re-parsing the body. 46 + type SpooledEvent struct { 47 + EventType string `json:"event_type"` 48 + Key string `json:"key"` 49 + Payload json.RawMessage `json:"payload"` 50 + SpooledAt time.Time `json:"spooled_at"` 51 + } 52 + 53 + // SpoolDropper is the narrow interface the spool uses to count 54 + // permanent drops (overflow). Optional; nil-safe. 55 + type SpoolDropper interface { 56 + IncDropped(reason string) 57 + } 58 + 59 + // NewEventSpool creates a spool rooted at dir. Creates the directory 60 + // if it doesn't exist. maxEntries caps the spool depth — when full, 61 + // the oldest entries are dropped to make room. Pass 0 for the 62 + // default cap (10k entries). 63 + func NewEventSpool(dir string, maxEntries int) (*EventSpool, error) { 64 + if maxEntries <= 0 { 65 + maxEntries = 10_000 66 + } 67 + if err := os.MkdirAll(dir, 0o755); err != nil { 68 + return nil, fmt.Errorf("mkdir spool: %w", err) 69 + } 70 + return &EventSpool{dir: dir, maxEntries: maxEntries}, nil 71 + } 72 + 73 + // SetDropper wires a counter that fires when the spool overflows. 74 + // Without it, drops are logged only — fine for tests, insufficient 75 + // for production observability. 76 + func (s *EventSpool) SetDropper(d SpoolDropper) { 77 + s.dropMetrics = d 78 + } 79 + 80 + // Write persists one event. Atomic via tmp+rename + fsync of the 81 + // parent directory so a crash mid-write can't leave a partial file 82 + // that Walk would later choke on. 83 + func (s *EventSpool) Write(eventType, key string, payload []byte) error { 84 + s.mu.Lock() 85 + defer s.mu.Unlock() 86 + 87 + // Enforce cap before write so a flood doesn't blow past it on 88 + // race conditions between checks. 89 + if err := s.enforceCapLocked(); err != nil { 90 + log.Printf("osprey.spool.cap_error: %v", err) 91 + // Continue — losing a write because the cleanup failed is 92 + // strictly worse than tolerating temporary over-cap. 
93 + } 94 + 95 + se := SpooledEvent{ 96 + EventType: eventType, 97 + Key: key, 98 + Payload: payload, 99 + SpooledAt: time.Now().UTC(), 100 + } 101 + data, err := json.Marshal(se) 102 + if err != nil { 103 + return fmt.Errorf("marshal: %w", err) 104 + } 105 + 106 + var raw [4]byte 107 + if _, err := rand.Read(raw[:]); err != nil { 108 + return fmt.Errorf("rand: %w", err) 109 + } 110 + name := fmt.Sprintf("%d-%s.json", time.Now().UTC().UnixNano(), hex.EncodeToString(raw[:])) 111 + path := filepath.Join(s.dir, name) 112 + tmp := path + ".tmp" 113 + 114 + f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600) 115 + if err != nil { 116 + return fmt.Errorf("open tmp: %w", err) 117 + } 118 + if _, err := f.Write(data); err != nil { 119 + f.Close() 120 + os.Remove(tmp) 121 + return fmt.Errorf("write tmp: %w", err) 122 + } 123 + if err := f.Sync(); err != nil { 124 + f.Close() 125 + os.Remove(tmp) 126 + return fmt.Errorf("fsync tmp: %w", err) 127 + } 128 + if err := f.Close(); err != nil { 129 + os.Remove(tmp) 130 + return fmt.Errorf("close tmp: %w", err) 131 + } 132 + if err := os.Rename(tmp, path); err != nil { 133 + os.Remove(tmp) 134 + return fmt.Errorf("rename: %w", err) 135 + } 136 + if d, err := os.Open(s.dir); err == nil { 137 + _ = d.Sync() 138 + _ = d.Close() 139 + } 140 + return nil 141 + } 142 + 143 + // Walk invokes fn for each spooled event, in arrival (filename) order. 144 + // On fn returning nil, the entry is removed. On fn returning an error, 145 + // the entry is left in place and the walk continues to the next entry 146 + // — failures are per-entry, not fatal to the loop. 147 + // 148 + // Returns (replayed, failed, err) where err is non-nil only on a 149 + // directory-listing failure (permission, missing dir, etc.). 150 + func (s *EventSpool) Walk(fn func(SpooledEvent) error) (int, int, error) { 151 + s.mu.Lock() 152 + defer s.mu.Unlock() 153 + 154 + entries, err := s.listLocked() 155 + if err != nil { 156 + return 0, 0, err 157 + } 158 + var replayed, failed int 159 + for _, name := range entries { 160 + path := filepath.Join(s.dir, name) 161 + raw, err := os.ReadFile(path) 162 + if err != nil { 163 + log.Printf("osprey.spool.read_error: file=%s error=%v", name, err) 164 + failed++ 165 + continue 166 + } 167 + var se SpooledEvent 168 + if err := json.Unmarshal(raw, &se); err != nil { 169 + // Corrupt entry — drop so it doesn't block the queue forever. 170 + log.Printf("osprey.spool.corrupt: file=%s error=%v — dropping", name, err) 171 + os.Remove(path) 172 + failed++ 173 + continue 174 + } 175 + if err := fn(se); err != nil { 176 + failed++ 177 + continue // leave in place; replayer will retry next pass 178 + } 179 + if err := os.Remove(path); err != nil { 180 + log.Printf("osprey.spool.remove_error: file=%s error=%v", name, err) 181 + } 182 + replayed++ 183 + } 184 + return replayed, failed, nil 185 + } 186 + 187 + // Depth returns the current number of spooled entries. Cheap; called 188 + // every replay tick to update the depth gauge. 189 + func (s *EventSpool) Depth() int { 190 + s.mu.Lock() 191 + defer s.mu.Unlock() 192 + entries, err := s.listLocked() 193 + if err != nil { 194 + return 0 195 + } 196 + return len(entries) 197 + } 198 + 199 + // listLocked returns the sorted list of spool filenames. Caller MUST 200 + // hold s.mu. 
201 + func (s *EventSpool) listLocked() ([]string, error) { 202 + dirEntries, err := os.ReadDir(s.dir) 203 + if err != nil { 204 + return nil, err 205 + } 206 + var names []string 207 + for _, de := range dirEntries { 208 + if de.IsDir() { 209 + continue 210 + } 211 + n := de.Name() 212 + if !strings.HasSuffix(n, ".json") { 213 + continue 214 + } 215 + names = append(names, n) 216 + } 217 + sort.Strings(names) // chronological by Unix-nanos prefix 218 + return names, nil 219 + } 220 + 221 + // enforceCapLocked drops oldest entries until the spool is below cap. 222 + // Caller MUST hold s.mu. 223 + func (s *EventSpool) enforceCapLocked() error { 224 + entries, err := s.listLocked() 225 + if err != nil { 226 + return err 227 + } 228 + for len(entries) >= s.maxEntries { 229 + oldest := entries[0] 230 + if err := os.Remove(filepath.Join(s.dir, oldest)); err != nil { 231 + return err 232 + } 233 + entries = entries[1:] 234 + if s.dropMetrics != nil { 235 + s.dropMetrics.IncDropped("overflow") 236 + } 237 + log.Printf("osprey.spool.dropped: file=%s reason=overflow cap=%d", oldest, s.maxEntries) 238 + } 239 + return nil 240 + }
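Putting the pieces together: the spool is constructed under the state dir, attached to the emitter, and drained on a timer. A sketch of that wiring, written as an osprey-package helper to sidestep module import paths — the directory name, the 30-second cadence, and the plain goroutine (the relay presumably wraps this in its GoSafe helper) are assumptions for illustration:

package osprey

import (
	"context"
	"log"
	"path/filepath"
	"time"
)

// WireSpool attaches an on-disk DLQ to the emitter and starts a
// periodic replay loop that runs until ctx is cancelled.
func WireSpool(ctx context.Context, e *Emitter, stateDir string) error {
	spool, err := NewEventSpool(filepath.Join(stateDir, "osprey-spool"), 0) // 0 = default 10k cap
	if err != nil {
		return err
	}
	e.SetSpool(spool)

	go func() {
		t := time.NewTicker(30 * time.Second)
		defer t.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-t.C:
				r, f, err := e.ReplaySpool(ctx)
				if err != nil {
					log.Printf("osprey.spool.replay_error: %v", err)
					continue
				}
				if r > 0 || f > 0 {
					// One summary line per pass, as ReplaySpool's docs intend.
					log.Printf("osprey.spool.replay: replayed=%d failed=%d depth=%d", r, f, spool.Depth())
				}
			}
		}
	}()
	return nil
}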
+244
internal/osprey/spool_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package osprey 4 + 5 + import ( 6 + "context" 7 + "errors" 8 + "os" 9 + "path/filepath" 10 + "strings" 11 + "sync" 12 + "testing" 13 + 14 + "github.com/segmentio/kafka-go" 15 + ) 16 + 17 + func TestEventSpool_WriteAndWalkRoundTrip(t *testing.T) { 18 + dir := t.TempDir() 19 + s, err := NewEventSpool(dir, 0) 20 + if err != nil { 21 + t.Fatalf("NewEventSpool: %v", err) 22 + } 23 + 24 + if err := s.Write("relay_attempt", "did:plc:a", []byte(`{"k":"v"}`)); err != nil { 25 + t.Fatalf("Write: %v", err) 26 + } 27 + if err := s.Write("delivery_result", "did:plc:b", []byte(`{"k":2}`)); err != nil { 28 + t.Fatalf("Write: %v", err) 29 + } 30 + 31 + if d := s.Depth(); d != 2 { 32 + t.Errorf("Depth = %d, want 2", d) 33 + } 34 + 35 + var seen []string 36 + r, fail, err := s.Walk(func(se SpooledEvent) error { 37 + seen = append(seen, se.EventType) 38 + return nil 39 + }) 40 + if err != nil { 41 + t.Fatalf("Walk: %v", err) 42 + } 43 + if r != 2 || fail != 0 { 44 + t.Errorf("replay count: replayed=%d failed=%d, want 2/0", r, fail) 45 + } 46 + if len(seen) != 2 || seen[0] != "relay_attempt" || seen[1] != "delivery_result" { 47 + t.Errorf("Walk order = %v, want [relay_attempt delivery_result]", seen) 48 + } 49 + if d := s.Depth(); d != 0 { 50 + t.Errorf("post-Walk depth = %d, want 0", d) 51 + } 52 + } 53 + 54 + func TestEventSpool_WalkLeavesFailedInPlace(t *testing.T) { 55 + dir := t.TempDir() 56 + s, _ := NewEventSpool(dir, 0) 57 + _ = s.Write("a", "k", []byte(`{}`)) 58 + _ = s.Write("b", "k", []byte(`{}`)) 59 + 60 + // First pass: fail the first entry, succeed the rest. 61 + failOnce := true 62 + r, fail, _ := s.Walk(func(se SpooledEvent) error { 63 + if failOnce { 64 + failOnce = false 65 + return errors.New("simulated broker outage") 66 + } 67 + return nil 68 + }) 69 + if r != 1 || fail != 1 { 70 + t.Errorf("first walk replayed=%d failed=%d, want 1/1", r, fail) 71 + } 72 + if d := s.Depth(); d != 1 { 73 + t.Errorf("depth after partial walk = %d, want 1 (failed entry retained)", d) 74 + } 75 + 76 + // Second pass: everything succeeds. 
77 + r, fail, _ = s.Walk(func(se SpooledEvent) error { return nil }) 78 + if r != 1 || fail != 0 { 79 + t.Errorf("second walk replayed=%d failed=%d, want 1/0", r, fail) 80 + } 81 + if d := s.Depth(); d != 0 { 82 + t.Errorf("post-recovery depth = %d, want 0", d) 83 + } 84 + } 85 + 86 + type countingDropper struct { 87 + mu sync.Mutex 88 + calls map[string]int 89 + } 90 + 91 + func (c *countingDropper) IncDropped(reason string) { 92 + c.mu.Lock() 93 + defer c.mu.Unlock() 94 + if c.calls == nil { 95 + c.calls = map[string]int{} 96 + } 97 + c.calls[reason]++ 98 + } 99 + func (c *countingDropper) count(r string) int { 100 + c.mu.Lock() 101 + defer c.mu.Unlock() 102 + return c.calls[r] 103 + } 104 + 105 + func TestEventSpool_OverflowDropsOldest(t *testing.T) { 106 + dir := t.TempDir() 107 + s, _ := NewEventSpool(dir, 3) // tiny cap to make overflow easy 108 + d := &countingDropper{} 109 + s.SetDropper(d) 110 + 111 + for i := 0; i < 5; i++ { 112 + if err := s.Write("t", "k", []byte(`{}`)); err != nil { 113 + t.Fatalf("Write %d: %v", i, err) 114 + } 115 + } 116 + if got := s.Depth(); got != 3 { 117 + t.Errorf("Depth = %d, want 3 (cap)", got) 118 + } 119 + if d.count("overflow") != 2 { 120 + t.Errorf("overflow count = %d, want 2", d.count("overflow")) 121 + } 122 + } 123 + 124 + func TestEventSpool_CorruptEntryDroppedNotBlocking(t *testing.T) { 125 + dir := t.TempDir() 126 + s, _ := NewEventSpool(dir, 0) 127 + if err := s.Write("good", "k", []byte(`{}`)); err != nil { 128 + t.Fatal(err) 129 + } 130 + // Inject a corrupt file that bypasses Write — simulate a partial 131 + // pre-fsync write that survived a crash. 132 + corrupt := filepath.Join(dir, "0-deadbeef.json") 133 + if err := os.WriteFile(corrupt, []byte("not-json{"), 0o600); err != nil { 134 + t.Fatal(err) 135 + } 136 + 137 + r, fail, err := s.Walk(func(se SpooledEvent) error { return nil }) 138 + if err != nil { 139 + t.Fatalf("Walk: %v", err) 140 + } 141 + // Corrupt entry counts as failed AND is removed; good entry replays. 142 + if r != 1 { 143 + t.Errorf("replayed = %d, want 1", r) 144 + } 145 + if fail != 1 { 146 + t.Errorf("failed = %d, want 1 (corrupt)", fail) 147 + } 148 + // Corrupt file must be gone — otherwise it blocks the queue forever. 149 + if _, err := os.Stat(corrupt); !os.IsNotExist(err) { 150 + t.Errorf("corrupt file survived: stat err=%v", err) 151 + } 152 + } 153 + 154 + // stubWriter is a messageWriter that captures messages and lets tests 155 + // flip between success and failure modes. 156 + type stubWriter struct { 157 + mu sync.Mutex 158 + written []kafka.Message 159 + failNow bool 160 + } 161 + 162 + func (w *stubWriter) WriteMessages(_ context.Context, msgs ...kafka.Message) error { 163 + w.mu.Lock() 164 + defer w.mu.Unlock() 165 + if w.failNow { 166 + return errors.New("broker unreachable") 167 + } 168 + w.written = append(w.written, msgs...) 169 + return nil 170 + } 171 + func (w *stubWriter) Close() error { return nil } 172 + func (w *stubWriter) count() int { 173 + w.mu.Lock() 174 + defer w.mu.Unlock() 175 + return len(w.written) 176 + } 177 + 178 + // TestEmitter_FailedSyncWriteSpoolsAndReplays exercises the 179 + // integration between Emit's sync-error branch and ReplaySpool. 
180 + func TestEmitter_FailedSyncWriteSpoolsAndReplays(t *testing.T) { 181 + dir := t.TempDir() 182 + spool, _ := NewEventSpool(dir, 0) 183 + 184 + w := &stubWriter{failNow: true} 185 + e := newEmitterWithWriter(w) 186 + m := &fakeMetrics{} 187 + e.SetMetrics(m) 188 + e.SetSpool(spool) 189 + 190 + // First emit: writer fails, event lands on disk. 191 + e.Emit(context.Background(), EventData{EventType: "relay_attempt", SenderDID: "did:plc:test"}) 192 + if got := spool.Depth(); got != 1 { 193 + t.Fatalf("post-failure spool depth = %d, want 1", got) 194 + } 195 + if got := m.spooled["relay_attempt"]; got != 1 { 196 + t.Errorf("spooled[relay_attempt] = %d, want 1", got) 197 + } 198 + 199 + // Recover the broker, replay. 200 + w.mu.Lock() 201 + w.failNow = false 202 + w.mu.Unlock() 203 + r, fail, err := e.ReplaySpool(context.Background()) 204 + if err != nil { 205 + t.Fatalf("ReplaySpool: %v", err) 206 + } 207 + if r != 1 || fail != 0 { 208 + t.Errorf("replay r=%d fail=%d, want 1/0", r, fail) 209 + } 210 + if got := spool.Depth(); got != 0 { 211 + t.Errorf("post-replay spool depth = %d, want 0", got) 212 + } 213 + if got := m.replayed["relay_attempt"]; got != 1 { 214 + t.Errorf("replayed[relay_attempt] = %d, want 1", got) 215 + } 216 + if got := w.count(); got != 1 { 217 + t.Errorf("writer received %d messages, want 1", got) 218 + } 219 + } 220 + 221 + // TestEmitter_NoSpoolFallsBackToLegacy confirms backward-compat: an 222 + // emitter with no spool wired drops failed events the same way as 223 + // before #214 (logged + IncFailed only). 224 + func TestEmitter_NoSpoolFallsBackToLegacy(t *testing.T) { 225 + w := &stubWriter{failNow: true} 226 + e := newEmitterWithWriter(w) 227 + m := &fakeMetrics{} 228 + e.SetMetrics(m) 229 + // SetSpool intentionally NOT called. 230 + 231 + e.Emit(context.Background(), EventData{EventType: "x", SenderDID: "k"}) 232 + if m.failed["x"] != 1 { 233 + t.Errorf("failed[x] = %d, want 1", m.failed["x"]) 234 + } 235 + if m.spooled["x"] != 0 { 236 + t.Errorf("spooled[x] = %d, want 0 (no spool wired)", m.spooled["x"]) 237 + } 238 + } 239 + 240 + // hasSubstr is a tiny helper to keep the corrupt-file test resilient 241 + // to error-string drift. 242 + func hasSubstr(s, sub string) bool { return strings.Contains(s, sub) } 243 + 244 + var _ = hasSubstr // keep import used if a future test wants it
+1 -1
internal/relay/auth.go
··· 14 14 func GenerateAPIKey() (string, error) { 15 15 b := make([]byte, 32) 16 16 if _, err := rand.Read(b); err != nil { 17 - return "", fmt.Errorf("generate random bytes: %v", err) 17 + return "", fmt.Errorf("generate random bytes: %w", err) 18 18 } 19 19 return "atmos_" + base64.RawURLEncoding.EncodeToString(b), nil 20 20 }
+2 -2
internal/relay/bounce.go
··· 77 77 // Get bounce stats for this member 78 78 stats, err := bp.GetBounceStats(ctx, memberDID) 79 79 if err != nil { 80 - return "", fmt.Errorf("get bounce stats: %v", err) 80 + return "", fmt.Errorf("get bounce stats: %w", err) 81 81 } 82 82 83 83 // Not enough sends to evaluate ··· 93 93 stats.BounceRate*100, bp.suspendBounceRate*100, bp.bounceWindowHours, stats.TotalSent, stats.TotalBounced) 94 94 95 95 if err := bp.store.UpdateMemberStatus(ctx, memberDID, relaystore.StatusSuspended, reason); err != nil { 96 - return "", fmt.Errorf("suspend member: %v", err) 96 + return "", fmt.Errorf("suspend member: %w", err) 97 97 } 98 98 99 99 log.Printf("bounce.auto_suspend: did=%s rate=%.4f threshold=%.4f reason=%q",
+131
internal/relay/cert_reload.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "crypto/tls" 7 + "fmt" 8 + "log" 9 + "os" 10 + "sync" 11 + "time" 12 + ) 13 + 14 + // CertReloader serves TLS certificates from disk with mtime-based 15 + // reload. Plug it into a *tls.Config via the GetCertificate callback; 16 + // the underlying cert is automatically refreshed when ACME (or any 17 + // other process) replaces the files on disk. 18 + // 19 + // Without this, every cert renewal forced a full relay restart via 20 + // systemd's reloadServices hook — dropping in-flight SMTP/HTTP 21 + // sessions and triggering the spool-reload race in #208. The 22 + // GetCertificate callback is invoked per TLS handshake, which is 23 + // many orders of magnitude cheaper than a process restart. 24 + // 25 + // Concurrency: safe for concurrent handshakes. Cert reads are 26 + // serialized via a mutex; the cached *tls.Certificate is shared 27 + // across all callers. 28 + // 29 + // Closes #216. 30 + type CertReloader struct { 31 + certPath string 32 + keyPath string 33 + 34 + mu sync.RWMutex 35 + cert *tls.Certificate 36 + loadedAt time.Time 37 + certMtime time.Time 38 + keyMtime time.Time 39 + } 40 + 41 + // NewCertReloader builds a reloader for the given cert/key pair. 42 + // Loads the cert immediately so callers can fail fast on bad paths. 43 + // On a missing file (first deploy before ACME has minted a cert), 44 + // returns a non-nil reloader with no cached cert; GetCertificate 45 + // returns an error in that state. Callers can keep calling and the 46 + // reloader picks up the cert as soon as it lands. 47 + func NewCertReloader(certPath, keyPath string) (*CertReloader, error) { 48 + r := &CertReloader{certPath: certPath, keyPath: keyPath} 49 + if err := r.reload(); err != nil { 50 + // Don't fail construction — first-boot ACME timing means the 51 + // file may not exist yet. Log and keep going; the next 52 + // GetCertificate invocation will retry. 53 + log.Printf("cert.reload.initial_load_failed: cert=%s key=%s error=%v", 54 + certPath, keyPath, err) 55 + } 56 + return r, nil 57 + } 58 + 59 + // GetCertificate is the tls.Config.GetCertificate callback. Returns 60 + // the cached cert, reloading from disk first if the underlying file 61 + // mtime has changed since the last read. 62 + func (r *CertReloader) GetCertificate(_ *tls.ClientHelloInfo) (*tls.Certificate, error) { 63 + if r.changed() { 64 + if err := r.reload(); err != nil { 65 + // Reload failed — fall back to the existing cached cert 66 + // rather than fail the handshake. ACME mid-renewal can 67 + // briefly leave the file inconsistent; surfacing that as 68 + // a TLS handshake failure would be worse than serving the 69 + // previous cert for one more poll cycle. 70 + log.Printf("cert.reload.error: %v (continuing with cached cert)", err) 71 + } 72 + } 73 + r.mu.RLock() 74 + defer r.mu.RUnlock() 75 + if r.cert == nil { 76 + return nil, fmt.Errorf("cert.reload: no certificate available (cert=%s key=%s)", r.certPath, r.keyPath) 77 + } 78 + return r.cert, nil 79 + } 80 + 81 + // changed reports whether either file's mtime has moved since the 82 + // last successful reload. Read-locked so concurrent handshakes 83 + // coalesce on a single mtime stat without serializing. 84 + func (r *CertReloader) changed() bool { 85 + cs, cerr := os.Stat(r.certPath) 86 + ks, kerr := os.Stat(r.keyPath) 87 + if cerr != nil || kerr != nil { 88 + // File disappeared — pretend nothing changed; the existing 89 + // cached cert is still usable. 
A real disappearance gets 90 + // surfaced on the next reload attempt. 91 + return false 92 + } 93 + r.mu.RLock() 94 + defer r.mu.RUnlock() 95 + return !cs.ModTime().Equal(r.certMtime) || !ks.ModTime().Equal(r.keyMtime) 96 + } 97 + 98 + // reload reads cert+key from disk and atomically swaps the cached 99 + // *tls.Certificate. Holds the write lock briefly; returning an 100 + // error leaves the previous cache untouched. 101 + func (r *CertReloader) reload() error { 102 + cs, err := os.Stat(r.certPath) 103 + if err != nil { 104 + return fmt.Errorf("stat cert: %w", err) 105 + } 106 + ks, err := os.Stat(r.keyPath) 107 + if err != nil { 108 + return fmt.Errorf("stat key: %w", err) 109 + } 110 + cert, err := tls.LoadX509KeyPair(r.certPath, r.keyPath) 111 + if err != nil { 112 + return fmt.Errorf("load keypair: %w", err) 113 + } 114 + r.mu.Lock() 115 + r.cert = &cert 116 + r.loadedAt = time.Now() 117 + r.certMtime = cs.ModTime() 118 + r.keyMtime = ks.ModTime() 119 + r.mu.Unlock() 120 + log.Printf("cert.reload: loaded cert=%s key=%s mtime=%s", 121 + r.certPath, r.keyPath, cs.ModTime().Format(time.RFC3339)) 122 + return nil 123 + } 124 + 125 + // LoadedAt returns the wall-clock time of the most recent successful 126 + // reload. Used by metrics + the cert-age dashboard panel. 127 + func (r *CertReloader) LoadedAt() time.Time { 128 + r.mu.RLock() 129 + defer r.mu.RUnlock() 130 + return r.loadedAt 131 + }
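Wiring the reloader into a listener is a one-line change to the tls.Config. A sketch, written as if it lived inside internal/relay so CertReloader is in scope — the listen address and certificate paths are illustrative, not the relay's actual configuration:

package relay

import (
	"crypto/tls"
	"log"
	"net/http"
)

// serveAdminTLS starts an HTTPS listener whose certificate is
// refreshed per handshake via the reloader, so ACME renewals take
// effect without a process restart.
func serveAdminTLS(handler http.Handler) error {
	r, err := NewCertReloader("/etc/atmos/tls/fullchain.pem", "/etc/atmos/tls/privkey.pem")
	if err != nil {
		return err
	}
	srv := &http.Server{
		Addr:    ":8443",
		Handler: handler,
		TLSConfig: &tls.Config{
			MinVersion:     tls.VersionTLS12,
			GetCertificate: r.GetCertificate, // consulted on every handshake; picks up renewals
		},
	}
	log.Printf("admin.tls.listen: addr=%s", srv.Addr)
	// Empty cert/key paths: GetCertificate supplies the certificate.
	return srv.ListenAndServeTLS("", "")
}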
+204
internal/relay/cert_reload_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "crypto/ecdsa" 7 + "crypto/elliptic" 8 + "crypto/rand" 9 + "crypto/tls" 10 + "crypto/x509" 11 + "crypto/x509/pkix" 12 + "encoding/pem" 13 + "math/big" 14 + "os" 15 + "path/filepath" 16 + "testing" 17 + "time" 18 + ) 19 + 20 + // writeTestCertPair generates a self-signed cert pair into the given 21 + // dir, returning the cert and key paths. CommonName is set to the 22 + // supplied identifier so two successive calls produce visibly 23 + // different certificates and the reload-on-mtime-change test can 24 + // distinguish them after a refresh. 25 + func writeTestCertPair(t *testing.T, dir, identifier string) (string, string) { 26 + t.Helper() 27 + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) 28 + if err != nil { 29 + t.Fatalf("genkey: %v", err) 30 + } 31 + template := &x509.Certificate{ 32 + SerialNumber: big.NewInt(1), 33 + Subject: pkix.Name{Organization: []string{identifier}, CommonName: identifier}, 34 + NotBefore: time.Now(), 35 + NotAfter: time.Now().Add(time.Hour), 36 + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, 37 + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, 38 + DNSNames: []string{"localhost"}, 39 + } 40 + certDER, err := x509.CreateCertificate(rand.Reader, template, template, &key.PublicKey, key) 41 + if err != nil { 42 + t.Fatalf("create cert: %v", err) 43 + } 44 + keyDER, err := x509.MarshalECPrivateKey(key) 45 + if err != nil { 46 + t.Fatalf("marshal key: %v", err) 47 + } 48 + certPath := filepath.Join(dir, identifier+".crt") 49 + keyPath := filepath.Join(dir, identifier+".key") 50 + certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: certDER}) 51 + keyPEM := pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER}) 52 + if err := os.WriteFile(certPath, certPEM, 0o600); err != nil { 53 + t.Fatalf("write cert: %v", err) 54 + } 55 + if err := os.WriteFile(keyPath, keyPEM, 0o600); err != nil { 56 + t.Fatalf("write key: %v", err) 57 + } 58 + return certPath, keyPath 59 + } 60 + 61 + // commonNameOf parses the leaf cert from a tls.Certificate and 62 + // returns its Subject Common Name. Used by tests to confirm the 63 + // reloader served the new cert (different CN) rather than the 64 + // cached old one. 65 + func commonNameOf(t *testing.T, c *tls.Certificate) string { 66 + t.Helper() 67 + if c == nil || len(c.Certificate) == 0 { 68 + t.Fatal("nil or empty certificate") 69 + } 70 + leaf, err := x509.ParseCertificate(c.Certificate[0]) 71 + if err != nil { 72 + t.Fatalf("parse leaf: %v", err) 73 + } 74 + return leaf.Subject.CommonName 75 + } 76 + 77 + func TestCertReloader_LoadsOnConstruction(t *testing.T) { 78 + dir := t.TempDir() 79 + certPath, keyPath := writeTestCertPair(t, dir, "v1") 80 + 81 + r, err := NewCertReloader(certPath, keyPath) 82 + if err != nil { 83 + t.Fatalf("NewCertReloader: %v", err) 84 + } 85 + got, err := r.GetCertificate(nil) 86 + if err != nil { 87 + t.Fatalf("GetCertificate: %v", err) 88 + } 89 + if cn := commonNameOf(t, got); cn != "v1" { 90 + t.Errorf("CommonName = %q, want v1", cn) 91 + } 92 + } 93 + 94 + // TestCertReloader_ReloadsOnMtimeChange is the core invariant of 95 + // #216: rewriting the cert on disk causes the next handshake to 96 + // serve the new cert, with no process restart. 
97 + func TestCertReloader_ReloadsOnMtimeChange(t *testing.T) { 98 + dir := t.TempDir() 99 + certPath, keyPath := writeTestCertPair(t, dir, "v1") 100 + r, err := NewCertReloader(certPath, keyPath) 101 + if err != nil { 102 + t.Fatalf("NewCertReloader: %v", err) 103 + } 104 + 105 + // First handshake: v1. 106 + got, _ := r.GetCertificate(nil) 107 + if cn := commonNameOf(t, got); cn != "v1" { 108 + t.Fatalf("first CN = %q, want v1", cn) 109 + } 110 + 111 + // Sleep briefly so mtime is observably different across 112 + // filesystems with second-granularity stat (older macOS, ext3). 113 + time.Sleep(1100 * time.Millisecond) 114 + 115 + // Rewrite cert+key with a new identifier. 116 + v2Cert, v2Key := writeTestCertPair(t, dir, "v2") 117 + // Move the v2 files into the v1 paths so the reloader sees the 118 + // SAME paths but DIFFERENT mtimes/contents — which is exactly 119 + // what ACME does when it renews in-place. 120 + if err := os.Rename(v2Cert, certPath); err != nil { 121 + t.Fatal(err) 122 + } 123 + if err := os.Rename(v2Key, keyPath); err != nil { 124 + t.Fatal(err) 125 + } 126 + 127 + got, err = r.GetCertificate(nil) 128 + if err != nil { 129 + t.Fatalf("post-rotation GetCertificate: %v", err) 130 + } 131 + if cn := commonNameOf(t, got); cn != "v2" { 132 + t.Errorf("post-rotation CN = %q, want v2 (reloader didn't pick up new file)", cn) 133 + } 134 + } 135 + 136 + // TestCertReloader_FallsBackOnReadError confirms that a transient 137 + // rename-in-flight (where the cert file briefly disappears or is 138 + // corrupted) doesn't fail the next TLS handshake — we keep serving 139 + // the previously-cached cert. 140 + func TestCertReloader_FallsBackOnReadError(t *testing.T) { 141 + dir := t.TempDir() 142 + certPath, keyPath := writeTestCertPair(t, dir, "v1") 143 + r, err := NewCertReloader(certPath, keyPath) 144 + if err != nil { 145 + t.Fatalf("NewCertReloader: %v", err) 146 + } 147 + 148 + // Prime the cache. 149 + if _, err := r.GetCertificate(nil); err != nil { 150 + t.Fatal(err) 151 + } 152 + 153 + // Corrupt the cert file mid-flight. 154 + time.Sleep(1100 * time.Millisecond) 155 + if err := os.WriteFile(certPath, []byte("not a pem"), 0o600); err != nil { 156 + t.Fatal(err) 157 + } 158 + 159 + // GetCertificate should fall back to the cached v1 rather than 160 + // fail the handshake. 161 + got, err := r.GetCertificate(nil) 162 + if err != nil { 163 + t.Fatalf("expected fallback to cached cert, got err=%v", err) 164 + } 165 + if cn := commonNameOf(t, got); cn != "v1" { 166 + t.Errorf("fallback CN = %q, want v1", cn) 167 + } 168 + } 169 + 170 + // TestCertReloader_FirstBootMissingFile pins the first-deploy 171 + // behavior: the cert may not exist yet (ACME hasn't minted it). 172 + // NewCertReloader must not fail; GetCertificate returns an error 173 + // the TLS layer can surface, and a later GetCertificate after the 174 + // file lands picks it up. 175 + func TestCertReloader_FirstBootMissingFile(t *testing.T) { 176 + dir := t.TempDir() 177 + certPath := filepath.Join(dir, "missing.crt") 178 + keyPath := filepath.Join(dir, "missing.key") 179 + 180 + r, err := NewCertReloader(certPath, keyPath) 181 + if err != nil { 182 + t.Fatalf("NewCertReloader should not fail on missing files, got %v", err) 183 + } 184 + if _, err := r.GetCertificate(nil); err == nil { 185 + t.Fatal("expected error from GetCertificate when no cert exists") 186 + } 187 + 188 + // ACME mints the cert post-startup. 
189 + cp, kp := writeTestCertPair(t, dir, "fresh") 190 + if err := os.Rename(cp, certPath); err != nil { 191 + t.Fatal(err) 192 + } 193 + if err := os.Rename(kp, keyPath); err != nil { 194 + t.Fatal(err) 195 + } 196 + 197 + got, err := r.GetCertificate(nil) 198 + if err != nil { 199 + t.Fatalf("post-mint GetCertificate: %v", err) 200 + } 201 + if cn := commonNameOf(t, got); cn != "fresh" { 202 + t.Errorf("CN = %q, want fresh", cn) 203 + } 204 + }
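For wiring the reloader into a listener: a minimal sketch assuming only the NewCertReloader / GetCertificate surface shown in this diff. The listen address, certificate paths, and module import path are placeholders, not values from this repository.

package main

import (
	"crypto/tls"
	"log"
	"net/http"

	// Assumed import path; internal/ packages are only importable from
	// this repository's own binaries.
	"example.com/atmos/internal/relay"
)

func main() {
	// Illustrative paths; point these at wherever ACME writes the pair.
	reloader, err := relay.NewCertReloader("/etc/atmos/tls/fullchain.pem", "/etc/atmos/tls/privkey.pem")
	if err != nil {
		log.Fatalf("cert reloader: %v", err)
	}
	srv := &http.Server{
		Addr: ":8443",
		TLSConfig: &tls.Config{
			// Every handshake consults the reloader, so an in-place
			// renewal is served without restarting the process.
			GetCertificate: reloader.GetCertificate,
			MinVersion:     tls.VersionTLS12,
		},
	}
	// Empty cert/key arguments: the certificate comes from GetCertificate.
	log.Fatal(srv.ListenAndServeTLS("", ""))
}

The same hook fits any tls.Config that exposes GetCertificate.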
+4 -4
internal/relay/didresolver.go
··· 76 76 77 77 req, err := http.NewRequestWithContext(ctx, "GET", url, nil) 78 78 if err != nil { 79 - return nil, fmt.Errorf("create request: %v", err) 79 + return nil, fmt.Errorf("create request: %w", err) 80 80 } 81 81 req.Header.Set("Accept", "application/json") 82 82 83 83 resp, err := r.client.Do(req) 84 84 if err != nil { 85 - return nil, fmt.Errorf("fetch DID document: %v", err) 85 + return nil, fmt.Errorf("fetch DID document: %w", err) 86 86 } 87 87 defer resp.Body.Close() 88 88 ··· 92 92 93 93 body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) 94 94 if err != nil { 95 - return nil, fmt.Errorf("read DID document: %v", err) 95 + return nil, fmt.Errorf("read DID document: %w", err) 96 96 } 97 97 98 98 var doc DIDDocument 99 99 if err := json.Unmarshal(body, &doc); err != nil { 100 - return nil, fmt.Errorf("parse DID document: %v", err) 100 + return nil, fmt.Errorf("parse DID document: %w", err) 101 101 } 102 102 103 103 return &doc, nil
+6 -6
internal/relay/dkim.go
··· 28 28 func GenerateDKIMKeys(selector string) (*DKIMKeys, error) { 29 29 rsaKey, err := rsa.GenerateKey(rand.Reader, 2048) 30 30 if err != nil { 31 - return nil, fmt.Errorf("generate RSA key: %v", err) 31 + return nil, fmt.Errorf("generate RSA key: %w", err) 32 32 } 33 33 34 34 _, edKey, err := ed25519.GenerateKey(rand.Reader) 35 35 if err != nil { 36 - return nil, fmt.Errorf("generate Ed25519 key: %v", err) 36 + return nil, fmt.Errorf("generate Ed25519 key: %w", err) 37 37 } 38 38 39 39 return &DKIMKeys{ ··· 109 109 func (s *DKIMSigner) Sign(r io.Reader) ([]byte, error) { 110 110 msgBytes, err := io.ReadAll(r) 111 111 if err != nil { 112 - return nil, fmt.Errorf("read message: %v", err) 112 + return nil, fmt.Errorf("read message: %w", err) 113 113 } 114 114 return s.signBytes(msgBytes) 115 115 } ··· 128 128 129 129 var edBuf bytes.Buffer 130 130 if err := dkim.Sign(&edBuf, bytes.NewReader(msgBytes), edOpts); err != nil { 131 - return nil, fmt.Errorf("Ed25519 DKIM sign: %v", err) 131 + return nil, fmt.Errorf("Ed25519 DKIM sign: %w", err) 132 132 } 133 133 134 134 // Sign with RSA on top of the Ed25519-signed message ··· 142 142 143 143 var rsaBuf bytes.Buffer 144 144 if err := dkim.Sign(&rsaBuf, bytes.NewReader(edBuf.Bytes()), rsaOpts); err != nil { 145 - return nil, fmt.Errorf("RSA DKIM sign: %v", err) 145 + return nil, fmt.Errorf("RSA DKIM sign: %w", err) 146 146 } 147 147 148 148 return rsaBuf.Bytes(), nil ··· 184 184 func (s *DualDomainSigner) Sign(r io.Reader) ([]byte, error) { 185 185 msgBytes, err := io.ReadAll(r) 186 186 if err != nil { 187 - return nil, fmt.Errorf("read message: %v", err) 187 + return nil, fmt.Errorf("read message: %w", err) 188 188 } 189 189 primarySigned, err := s.primary.signBytes(msgBytes) 190 190 if err != nil {
+15 -6
internal/relay/events_consumer.go
··· 39 39 // the consumer is reading the right place. 40 40 const OspreyOutputTopic = "osprey.execution_results" 41 41 42 + const maxOspreyMessageBytes = 1 << 20 // 1 MiB 43 + 42 44 // ospreyConsumerGroupID identifies the relay's consumer in Kafka so 43 45 // rebalances stay stable across restarts. 44 46 const ospreyConsumerGroupID = "atmos-relay-events" ··· 139 141 continue 140 142 } 141 143 144 + // Skip oversized or malformed messages rather than block the topic. 145 + // kafka-go auto-commits on ReadMessage success (it already returned), 146 + // so the offset advances past skipped messages on the next poll. 147 + if len(msg.Value) > maxOspreyMessageBytes { 148 + log.Printf("relay_events.consumer.oversize: offset=%d bytes=%d limit=%d", msg.Offset, len(msg.Value), maxOspreyMessageBytes) 149 + continue 150 + } 151 + 142 152 evt, err := decodeOspreyMessage(msg) 143 153 if err != nil { 144 154 log.Printf("relay_events.consumer.decode_error: offset=%d error=%v", msg.Offset, err) 145 - // Skip the bad message rather than block the topic. kafka-go 146 - // auto-commits on ReadMessage success (it already returned), 147 - // but since we use consumer-group mode the commit happens 148 - // on the next ReadMessage anyway. 149 155 continue 150 156 } 151 157 ··· 204 210 func decodeOspreyMessage(msg kafka.Message) (*relaystore.RelayEvent, error) { 205 211 var om ospreyMessage 206 212 if err := json.Unmarshal(msg.Value, &om); err != nil { 207 - return nil, fmt.Errorf("unmarshal osprey message: %v", err) 213 + return nil, fmt.Errorf("unmarshal osprey message: %w", err) 208 214 } 209 215 210 216 // Osprey's action_name drives everything downstream. Refuse to ··· 213 219 if strings.TrimSpace(om.ActionName) == "" { 214 220 return nil, fmt.Errorf("osprey message missing ActionName") 215 221 } 222 + if om.SenderDID != "" && !strings.HasPrefix(om.SenderDID, "did:") { 223 + return nil, fmt.Errorf("osprey message has invalid SenderDID %q", om.SenderDID) 224 + } 216 225 217 226 eventTS, err := parseOspreyTimestamp(om.Timestamp) 218 227 if err != nil { 219 - return nil, fmt.Errorf("parse __timestamp: %v", err) 228 + return nil, fmt.Errorf("parse __timestamp: %w", err) 220 229 } 221 230 222 231 return &relaystore.RelayEvent{
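The skip-instead-of-block pattern above is the generic consumer-group shape; here is a standalone sketch with segmentio/kafka-go under stated assumptions: the broker address is a placeholder, the topic and group ID mirror the constants in this file, the 1 MiB cap mirrors maxOspreyMessageBytes, and the store handoff is elided.

package main

import (
	"context"
	"encoding/json"
	"log"

	"github.com/segmentio/kafka-go"
)

const maxMessageBytes = 1 << 20 // mirrors maxOspreyMessageBytes

func main() {
	r := kafka.NewReader(kafka.ReaderConfig{
		Brokers: []string{"localhost:9092"}, // placeholder broker
		GroupID: "atmos-relay-events",
		Topic:   "osprey.execution_results",
	})
	defer r.Close()

	for {
		msg, err := r.ReadMessage(context.Background())
		if err != nil {
			log.Printf("read: %v", err)
			return
		}
		// Oversized or malformed records are logged and skipped; the
		// group offset still advances, so one poison record cannot
		// wedge the topic.
		if len(msg.Value) > maxMessageBytes {
			log.Printf("skip oversize: offset=%d bytes=%d", msg.Offset, len(msg.Value))
			continue
		}
		var payload map[string]any
		if err := json.Unmarshal(msg.Value, &payload); err != nil {
			log.Printf("skip malformed: offset=%d err=%v", msg.Offset, err)
			continue
		}
		// hand payload to the store here
	}
}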
+47
internal/relay/events_consumer_test.go
··· 165 165 } 166 166 } 167 167 168 + func TestDecodeOspreyMessageRejectsInvalidDID(t *testing.T) { 169 + _, err := decodeOspreyMessage(kafka.Message{ 170 + Value: []byte(`{"__action_id":1,"__timestamp":"2026-04-13T23:59:14+00:00","ActionName":"relay_attempt","SenderDID":"not-a-did"}`), 171 + Offset: 1, 172 + }) 173 + if err == nil { 174 + t.Fatal("want error for SenderDID without did: prefix") 175 + } 176 + } 177 + 178 + func TestDecodeOspreyMessageAllowsEmptyDID(t *testing.T) { 179 + _, err := decodeOspreyMessage(kafka.Message{ 180 + Value: []byte(`{"__action_id":1,"__timestamp":"2026-04-13T23:59:14+00:00","ActionName":"relay_attempt","SenderDID":""}`), 181 + Offset: 1, 182 + }) 183 + if err != nil { 184 + t.Fatalf("empty SenderDID should be allowed: %v", err) 185 + } 186 + } 187 + 188 + func TestConsumerSkipsOversizeMessages(t *testing.T) { 189 + store := newMemStore(t) 190 + 191 + oversized := make([]byte, maxOspreyMessageBytes+1) 192 + for i := range oversized { 193 + oversized[i] = 'x' 194 + } 195 + 196 + reader := &fakeReader{messages: []kafka.Message{ 197 + {Value: oversized, Offset: 1}, 198 + {Value: []byte(sampleRelayAttempt), Offset: 2}, 199 + }} 200 + 201 + c := newOspreyEventConsumerWithReader(reader, store) 202 + if err := c.Run(context.Background()); err != nil { 203 + t.Fatalf("Run: %v", err) 204 + } 205 + 206 + events, err := store.ListRelayEvents(context.Background(), relaystore.ListRelayEventsFilter{}) 207 + if err != nil { 208 + t.Fatalf("ListRelayEvents: %v", err) 209 + } 210 + if len(events) != 1 { 211 + t.Fatalf("want 1 event (oversize skipped), got %d", len(events)) 212 + } 213 + } 214 + 168 215 // --- Consumer integration (with a fake reader) --- 169 216 170 217 // fakeReader replays a fixed batch of messages then returns io.EOF.
+64
internal/relay/gosafe.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "log" 7 + "runtime/debug" 8 + ) 9 + 10 + // PanicRecorder is the narrow interface GoSafe needs to count 11 + // recovered panics, so callers can wire metrics.GoroutineCrashes 12 + // without the relay package taking a hard dependency on Prometheus 13 + // types here. Implementations must be safe for concurrent use. 14 + type PanicRecorder interface { 15 + IncGoroutineCrash(name string) 16 + } 17 + 18 + // goSafePanicRecorder is the package-level recorder. nil means 19 + // panics are still recovered + logged but not counted. Set via 20 + // SetPanicRecorder during cmd/relay wiring. 21 + var goSafePanicRecorder PanicRecorder 22 + 23 + // SetPanicRecorder installs a metrics recorder used by GoSafe to 24 + // count recovered panics. Calling more than once replaces the 25 + // previous recorder. Safe to call before any GoSafe invocation. 26 + func SetPanicRecorder(r PanicRecorder) { 27 + goSafePanicRecorder = r 28 + } 29 + 30 + // GoSafe runs fn in a new goroutine with a deferred recover that 31 + // converts a panic into a log line + stack trace + metric increment 32 + // instead of process termination. 33 + // 34 + // Without this wrapper, every long-lived background goroutine in the 35 + // relay (queue worker, inbound server, public listener, events 36 + // consumer, health probe, hourly cleanups, notify worker, warmup 37 + // scheduler, ...) crashes the entire relay process on any panic. 38 + // A malformed inbound ARF report or a poison Kafka record is enough 39 + // to take the SMTP service down indefinitely. The deferred recover 40 + // here turns those into observable, contained failures the operator 41 + // can investigate without an outage. Closes #209. 42 + // 43 + // name is a stable label suitable for Prometheus and grep — keep it 44 + // short and stable across deploys ("queue.run", "inbound.serve", 45 + // etc.). Empty names are never used as the label: if a 46 + // caller passes "", GoSafe still runs fn, but recovered panics 47 + // report as name="unnamed" so the metric label stays non-empty. 48 + func GoSafe(name string, fn func()) { 49 + if name == "" { 50 + name = "unnamed" 51 + } 52 + go func() { 53 + defer func() { 54 + if r := recover(); r != nil { 55 + log.Printf("goroutine.panic: name=%s recovered=%v\n%s", 56 + name, r, string(debug.Stack())) 57 + if goSafePanicRecorder != nil { 58 + goSafePanicRecorder.IncGoroutineCrash(name) 59 + } 60 + } 61 + }() 62 + fn() 63 + }() 64 + }
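A self-contained sketch of the intended call pattern, assuming only the PanicRecorder / SetPanicRecorder / GoSafe surface above. The recorder type, goroutine name, and import path are illustrative; in the relay the recorder is *Metrics via IncGoroutineCrash.

package main

import (
	"log"
	"time"

	"example.com/atmos/internal/relay" // assumed module path
)

// demoRecorder satisfies relay.PanicRecorder for the sketch.
type demoRecorder struct{}

func (demoRecorder) IncGoroutineCrash(name string) {
	log.Printf("panic counted for goroutine %q", name)
}

func main() {
	relay.SetPanicRecorder(demoRecorder{})

	// A panicking background worker: the panic is recovered, logged
	// with a stack trace, and counted. The process keeps running.
	relay.GoSafe("demo.worker", func() {
		panic("poison record")
	})

	time.Sleep(100 * time.Millisecond)
	log.Println("still alive after the worker panicked")
}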
+142
internal/relay/gosafe_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "sync" 7 + "sync/atomic" 8 + "testing" 9 + "time" 10 + ) 11 + 12 + // fakePanicRecorder captures IncGoroutineCrash calls so tests can 13 + // assert the panic was both recovered and counted. 14 + type fakePanicRecorder struct { 15 + mu sync.Mutex 16 + calls map[string]int 17 + } 18 + 19 + func newFakePanicRecorder() *fakePanicRecorder { 20 + return &fakePanicRecorder{calls: map[string]int{}} 21 + } 22 + 23 + func (f *fakePanicRecorder) IncGoroutineCrash(name string) { 24 + f.mu.Lock() 25 + f.calls[name]++ 26 + f.mu.Unlock() 27 + } 28 + 29 + func (f *fakePanicRecorder) count(name string) int { 30 + f.mu.Lock() 31 + defer f.mu.Unlock() 32 + return f.calls[name] 33 + } 34 + 35 + // withRecorder swaps the package-global recorder for the duration of 36 + // a test and restores the previous value on cleanup. Tests run in a 37 + // single process so we serialize via a mutex to avoid races between 38 + // parallel tests that might otherwise see each other's recorders. 39 + var goSafeTestMu sync.Mutex 40 + 41 + func withRecorder(t *testing.T, r PanicRecorder) { 42 + t.Helper() 43 + goSafeTestMu.Lock() 44 + prev := goSafePanicRecorder 45 + SetPanicRecorder(r) 46 + t.Cleanup(func() { 47 + SetPanicRecorder(prev) 48 + goSafeTestMu.Unlock() 49 + }) 50 + } 51 + 52 + // awaitCount polls until f.count(name) reaches want or timeout. 53 + // Failing fast on a missing increment is more useful than a 54 + // timeout after the deferred recover ate the panic invisibly. 55 + func awaitCount(t *testing.T, f *fakePanicRecorder, name string, want int) { 56 + t.Helper() 57 + deadline := time.Now().Add(2 * time.Second) 58 + for time.Now().Before(deadline) { 59 + if f.count(name) >= want { 60 + return 61 + } 62 + time.Sleep(5 * time.Millisecond) 63 + } 64 + t.Fatalf("expected %s count >= %d, got %d", name, want, f.count(name)) 65 + } 66 + 67 + func TestGoSafe_RunsFnNoPanic(t *testing.T) { 68 + rec := newFakePanicRecorder() 69 + withRecorder(t, rec) 70 + var ran atomic.Bool 71 + done := make(chan struct{}) 72 + GoSafe("happy", func() { 73 + ran.Store(true) 74 + close(done) 75 + }) 76 + select { 77 + case <-done: 78 + case <-time.After(time.Second): 79 + t.Fatal("fn never ran") 80 + } 81 + if !ran.Load() { 82 + t.Fatal("ran flag not set") 83 + } 84 + if got := rec.count("happy"); got != 0 { 85 + t.Errorf("crash count = %d, want 0 on no-panic path", got) 86 + } 87 + } 88 + 89 + func TestGoSafe_RecoversPanicAndCounts(t *testing.T) { 90 + rec := newFakePanicRecorder() 91 + withRecorder(t, rec) 92 + GoSafe("crashy", func() { 93 + panic("intentional test panic") 94 + }) 95 + awaitCount(t, rec, "crashy", 1) 96 + } 97 + 98 + // TestGoSafe_ProcessSurvivesPanic — the load-bearing assertion of 99 + // #209: a panicking goroutine must NOT terminate the process. Test 100 + // runs the panicking GoSafe and then verifies a subsequent line of 101 + // test code executes (which it cannot if the runtime crashed). 102 + func TestGoSafe_ProcessSurvivesPanic(t *testing.T) { 103 + rec := newFakePanicRecorder() 104 + withRecorder(t, rec) 105 + GoSafe("survivor", func() { panic("boom") }) 106 + awaitCount(t, rec, "survivor", 1) 107 + // Reaching this line proves the runtime is still alive. If GoSafe 108 + // regresses to a no-recover, the panic propagates and the test 109 + // process dies before this assertion can run. 
110 + if t.Failed() { 111 + t.Fatal("unreachable failure") 112 + } 113 + } 114 + 115 + func TestGoSafe_NilRecorderStillRecovers(t *testing.T) { 116 + withRecorder(t, nil) // explicit nil 117 + // Should not panic the test process. 118 + GoSafe("orphan", func() { panic("no recorder, no problem") }) 119 + // Allow the goroutine time to schedule and recover. 120 + time.Sleep(50 * time.Millisecond) 121 + } 122 + 123 + func TestGoSafe_EmptyNameFallsBackToUnnamed(t *testing.T) { 124 + rec := newFakePanicRecorder() 125 + withRecorder(t, rec) 126 + GoSafe("", func() { panic("anon") }) 127 + awaitCount(t, rec, "unnamed", 1) 128 + } 129 + 130 + // TestGoSafe_MultiplePanicsCounted ensures the metric is per-name 131 + // and accumulates correctly under load — covers the "poison record 132 + // in a tight loop" scenario where the same goroutine panics 133 + // repeatedly while a supervisor restarts it. 134 + func TestGoSafe_MultiplePanicsCounted(t *testing.T) { 135 + rec := newFakePanicRecorder() 136 + withRecorder(t, rec) 137 + const n = 5 138 + for i := 0; i < n; i++ { 139 + GoSafe("burst", func() { panic("repeat") }) 140 + } 141 + awaitCount(t, rec, "burst", n) 142 + }
+41 -3
internal/relay/inbound.go
··· 24 24 ListenAddr string // default ":25" 25 25 Domain string // relay domain (e.g. "atmos.email") 26 26 MaxMsgSize int64 // default 10MB (replies can include larger bodies than DSNs) 27 + 28 + // RateLimitMsgsPerMinute caps the per-source-IP message rate at MAIL 29 + // FROM. Zero or negative disables rate limiting (legacy behavior). A 30 + // reasonable production default is 30 (50% headroom over the highest 31 + // rate any legitimate single-source provider has been observed at). 32 + RateLimitMsgsPerMinute float64 33 + // RateLimitBurst is the token-bucket capacity. Zero defaults to 10. 34 + // Bursts above this size from a single IP get a 421 retry-later; 35 + // over a sustained window the IP is held to RateLimitMsgsPerMinute. 36 + RateLimitBurst int 27 37 } 28 38 29 39 // BounceHandler is called when a valid bounce DSN is received and matched. ··· 56 66 type InboundMetrics interface { 57 67 RecordInbound(classification string) 58 68 RecordForward(status string) 69 + // RecordRejected fires when an inbound session is rejected before 70 + // classification (e.g. rate-limited). reason is a short stable 71 + // identifier ("rate_limit", ...) suitable for a Prometheus label. 72 + RecordRejected(reason string) 59 73 } 60 74 61 75 // InboundLogEntry is a single structured record of an accepted inbound ··· 112 126 // Without this, provider authorization emails (Microsoft SNDS, 113 127 // Yahoo CFL) and ops-team mail never reach a human. 114 128 operatorForwardTo string 129 + 130 + // rateLimiter, when non-nil, enforces a per-source-IP rate limit at 131 + // MAIL FROM. nil means rate limiting is disabled. 132 + rateLimiter *inboundRateLimiter 115 133 } 116 134 117 135 // NewInboundServer creates an inbound SMTP server. domainLookup, forwarder, ··· 130 148 domain: cfg.Domain, 131 149 onBounce: onBounce, 132 150 memberLookup: memberLookup, 151 + rateLimiter: newInboundRateLimiter(cfg.RateLimitMsgsPerMinute, cfg.RateLimitBurst, 0), 133 152 } 134 153 135 154 smtpSrv := smtp.NewServer(s) ··· 191 210 192 211 // Close shuts down the inbound SMTP server. 193 212 func (s *InboundServer) Close() error { 213 + if s.rateLimiter != nil { 214 + s.rateLimiter.Close() 215 + } 194 216 return s.server.Close() 195 217 } 196 218 197 219 // NewSession implements smtp.Backend for the inbound server. 198 220 func (s *InboundServer) NewSession(c *smtp.Conn) (smtp.Session, error) { 199 - return &inboundSession{server: s}, nil 221 + var ip string 222 + if conn := c.Conn(); conn != nil { 223 + ip = remoteIP(conn.RemoteAddr().String()) 224 + } 225 + return &inboundSession{server: s, remoteIP: ip}, nil 200 226 } 201 227 202 228 // inboundSession handles a single inbound SMTP connection. 203 229 type inboundSession struct { 204 - server *InboundServer 205 - from string 230 + server *InboundServer 231 + remoteIP string // captured at NewSession; "" if unavailable 232 + from string 206 233 // rcpts holds per-recipient classification so Data() can route each 207 234 // recipient to the right handler. 
208 235 rcpts []inboundRcpt ··· 232 259 } 233 260 234 261 func (s *inboundSession) Mail(from string, opts *smtp.MailOptions) error { 262 + if !s.server.rateLimiter.Allow(s.remoteIP) { 263 + log.Printf("inbound.rate_limited: ip=%s from=%s", s.remoteIP, from) 264 + if s.server.metrics != nil { 265 + s.server.metrics.RecordRejected("rate_limit") 266 + } 267 + return &smtp.SMTPError{ 268 + Code: 421, 269 + EnhancedCode: smtp.EnhancedCode{4, 7, 0}, 270 + Message: "rate limit exceeded; please retry later", 271 + } 272 + } 235 273 s.from = from 236 274 return nil 237 275 }
+156
internal/relay/inbound_ratelimit.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "net" 7 + "sync" 8 + "time" 9 + 10 + "golang.org/x/time/rate" 11 + ) 12 + 13 + // inboundRateLimiter enforces a per-source-IP token-bucket rate limit on 14 + // inbound SMTP MAIL FROM commands. It exists to prevent the inbound 15 + // listener (port 25) from being used as an open-relay-shaped amplifier: 16 + // a single attacker IP can otherwise burn through the relay's outbound 17 + // reputation by flooding member forward_to mailboxes or dumping noise 18 + // at the VERP/FBL/postmaster handlers. 19 + // 20 + // The limiter is keyed on the remote IP only (no port). It does not 21 + // distinguish between recipient types — bounces, FBL reports, replies, 22 + // and postmaster mail share the same budget per source. Legitimate 23 + // providers send from many IPs and never approach the per-IP rate; an 24 + // abuse source coming from a single IP burns its budget quickly and 25 + // gets a 421 deferral. 26 + // 27 + // A background goroutine evicts entries that haven't been touched in 28 + // idleTimeout, keeping the map bounded under sustained scanning traffic. 29 + type inboundRateLimiter struct { 30 + mu sync.Mutex 31 + buckets map[string]*ipBucket 32 + 33 + rate rate.Limit // tokens per second 34 + burst int // bucket capacity 35 + idleTimeout time.Duration // evict entries idle this long 36 + 37 + stop chan struct{} 38 + stopWG sync.WaitGroup 39 + } 40 + 41 + type ipBucket struct { 42 + limiter *rate.Limiter 43 + lastSeen time.Time 44 + } 45 + 46 + // newInboundRateLimiter constructs a limiter and starts the cleanup 47 + // goroutine. msgsPerMinute <= 0 returns nil — callers must handle the 48 + // nil case as "rate limiting disabled". Negative or zero burst is 49 + // clamped to a sane default. 50 + func newInboundRateLimiter(msgsPerMinute float64, burst int, idleTimeout time.Duration) *inboundRateLimiter { 51 + if msgsPerMinute <= 0 { 52 + return nil 53 + } 54 + if burst <= 0 { 55 + burst = 10 56 + } 57 + if idleTimeout <= 0 { 58 + idleTimeout = 10 * time.Minute 59 + } 60 + rl := &inboundRateLimiter{ 61 + buckets: make(map[string]*ipBucket), 62 + rate: rate.Limit(msgsPerMinute / 60.0), 63 + burst: burst, 64 + idleTimeout: idleTimeout, 65 + stop: make(chan struct{}), 66 + } 67 + rl.stopWG.Add(1) 68 + go rl.cleanupLoop() 69 + return rl 70 + } 71 + 72 + // Allow returns true if a message from ip is permitted. The empty string 73 + // is allowed (no rate limit applied) so unit tests and tools that bypass 74 + // network plumbing aren't blocked. Production callers always pass a real 75 + // remote IP because the smtp.Conn carries one. 76 + func (rl *inboundRateLimiter) Allow(ip string) bool { 77 + if rl == nil || ip == "" { 78 + return true 79 + } 80 + rl.mu.Lock() 81 + b, ok := rl.buckets[ip] 82 + if !ok { 83 + b = &ipBucket{limiter: rate.NewLimiter(rl.rate, rl.burst)} 84 + rl.buckets[ip] = b 85 + } 86 + b.lastSeen = time.Now() 87 + rl.mu.Unlock() 88 + return b.limiter.Allow() 89 + } 90 + 91 + // Close stops the cleanup goroutine. Safe to call multiple times. 92 + func (rl *inboundRateLimiter) Close() { 93 + if rl == nil { 94 + return 95 + } 96 + select { 97 + case <-rl.stop: 98 + return // already closed 99 + default: 100 + close(rl.stop) 101 + } 102 + rl.stopWG.Wait() 103 + } 104 + 105 + // cleanupLoop evicts buckets that haven't been seen within idleTimeout. 106 + // Runs at idleTimeout/2 cadence so an entry never lingers more than 107 + // 1.5×idleTimeout after its last use. 
108 + func (rl *inboundRateLimiter) cleanupLoop() { 109 + defer rl.stopWG.Done() 110 + tick := time.NewTicker(rl.idleTimeout / 2) 111 + defer tick.Stop() 112 + for { 113 + select { 114 + case <-rl.stop: 115 + return 116 + case now := <-tick.C: 117 + rl.evictIdle(now) 118 + } 119 + } 120 + } 121 + 122 + func (rl *inboundRateLimiter) evictIdle(now time.Time) { 123 + cutoff := now.Add(-rl.idleTimeout) 124 + rl.mu.Lock() 125 + for ip, b := range rl.buckets { 126 + if b.lastSeen.Before(cutoff) { 127 + delete(rl.buckets, ip) 128 + } 129 + } 130 + rl.mu.Unlock() 131 + } 132 + 133 + // size returns the number of tracked IPs. Used by tests to verify 134 + // cleanup; not exported. 135 + func (rl *inboundRateLimiter) size() int { 136 + if rl == nil { 137 + return 0 138 + } 139 + rl.mu.Lock() 140 + defer rl.mu.Unlock() 141 + return len(rl.buckets) 142 + } 143 + 144 + // remoteIP extracts the IP portion of a "host:port" string. Returns the 145 + // input unchanged if it doesn't parse as a host:port (e.g. unix sockets 146 + // in tests). IPv6 literals come back without their brackets. 147 + func remoteIP(addr string) string { 148 + if addr == "" { 149 + return "" 150 + } 151 + host, _, err := net.SplitHostPort(addr) 152 + if err != nil { 153 + return addr 154 + } 155 + return host 156 + }
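To make the defaults concrete: 30 messages per minute is 0.5 tokens per second, so with burst 10 a single IP gets ten back-to-back MAIL FROMs and then roughly one every two seconds. A standalone sketch with golang.org/x/time/rate, the same primitive the limiter wraps; the flood size is arbitrary and the printed split is the expected outcome under those defaults, not measured relay traffic.

package main

import (
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	// Defaults discussed in this diff: 30 msgs/min, burst 10.
	const msgsPerMinute = 30.0
	const burst = 10

	lim := rate.NewLimiter(rate.Limit(msgsPerMinute/60.0), burst)

	allowed, denied := 0, 0
	// 100 MAIL FROMs from one IP arriving effectively at once: the
	// first `burst` pass, the rest would get the 421 deferral path.
	for i := 0; i < 100; i++ {
		if lim.Allow() {
			allowed++
		} else {
			denied++
		}
	}
	// Typically prints allowed=10 denied=90.
	fmt.Printf("instant flood of 100: allowed=%d denied=%d\n", allowed, denied)
}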
+286
internal/relay/inbound_ratelimit_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "context" 7 + "fmt" 8 + "net" 9 + gosmtp "net/smtp" 10 + "strings" 11 + "sync" 12 + "sync/atomic" 13 + "testing" 14 + "time" 15 + ) 16 + 17 + func TestInboundRateLimiter_AllowsWithinBurst(t *testing.T) { 18 + rl := newInboundRateLimiter(60, 5, time.Minute) 19 + defer rl.Close() 20 + for i := 0; i < 5; i++ { 21 + if !rl.Allow("1.2.3.4") { 22 + t.Fatalf("call %d denied within burst", i+1) 23 + } 24 + } 25 + } 26 + 27 + func TestInboundRateLimiter_BlocksOnceBurstExhausted(t *testing.T) { 28 + // Very low refill so the burst is the only budget within the test window. 29 + rl := newInboundRateLimiter(1, 3, time.Minute) 30 + defer rl.Close() 31 + for i := 0; i < 3; i++ { 32 + if !rl.Allow("1.2.3.4") { 33 + t.Fatalf("call %d denied within burst", i+1) 34 + } 35 + } 36 + if rl.Allow("1.2.3.4") { 37 + t.Fatal("4th call allowed; expected rate-limit denial") 38 + } 39 + } 40 + 41 + func TestInboundRateLimiter_PerIPIsolation(t *testing.T) { 42 + rl := newInboundRateLimiter(1, 2, time.Minute) 43 + defer rl.Close() 44 + // Exhaust IP A's burst. 45 + rl.Allow("1.1.1.1") 46 + rl.Allow("1.1.1.1") 47 + if rl.Allow("1.1.1.1") { 48 + t.Fatal("IP A still allowed after burst") 49 + } 50 + // IP B has its own bucket. 51 + if !rl.Allow("2.2.2.2") { 52 + t.Fatal("IP B denied — buckets leaked across IPs") 53 + } 54 + } 55 + 56 + func TestInboundRateLimiter_NilIsAlwaysAllow(t *testing.T) { 57 + var rl *inboundRateLimiter 58 + if !rl.Allow("anything") { 59 + t.Fatal("nil limiter should allow") 60 + } 61 + // Close on nil must not panic. 62 + rl.Close() 63 + } 64 + 65 + func TestInboundRateLimiter_ZeroDisables(t *testing.T) { 66 + if rl := newInboundRateLimiter(0, 10, time.Minute); rl != nil { 67 + t.Fatal("zero rate should return nil limiter") 68 + } 69 + if rl := newInboundRateLimiter(-5, 10, time.Minute); rl != nil { 70 + t.Fatal("negative rate should return nil limiter") 71 + } 72 + } 73 + 74 + func TestInboundRateLimiter_EmptyIPAlwaysAllowed(t *testing.T) { 75 + rl := newInboundRateLimiter(60, 1, time.Minute) 76 + defer rl.Close() 77 + // Burst is 1; if "" mapped to a bucket it'd run out fast. It shouldn't. 78 + for i := 0; i < 100; i++ { 79 + if !rl.Allow("") { 80 + t.Fatalf("empty IP denied at call %d", i) 81 + } 82 + } 83 + } 84 + 85 + func TestInboundRateLimiter_EvictsIdleEntries(t *testing.T) { 86 + rl := newInboundRateLimiter(60, 5, 50*time.Millisecond) 87 + defer rl.Close() 88 + rl.Allow("9.9.9.9") 89 + if rl.size() != 1 { 90 + t.Fatalf("size = %d, want 1", rl.size()) 91 + } 92 + // Force an eviction with a synthetic future time. 93 + rl.evictIdle(time.Now().Add(time.Hour)) 94 + if rl.size() != 0 { 95 + t.Fatalf("size after eviction = %d, want 0", rl.size()) 96 + } 97 + } 98 + 99 + func TestRemoteIP_StripsPort(t *testing.T) { 100 + cases := []struct { 101 + in, want string 102 + }{ 103 + {"1.2.3.4:567", "1.2.3.4"}, 104 + {"[::1]:567", "::1"}, 105 + {"[2001:db8::1]:25", "2001:db8::1"}, 106 + {"127.0.0.1:0", "127.0.0.1"}, 107 + {"", ""}, 108 + {"not-a-host-port", "not-a-host-port"}, 109 + } 110 + for _, c := range cases { 111 + if got := remoteIP(c.in); got != c.want { 112 + t.Errorf("remoteIP(%q) = %q, want %q", c.in, got, c.want) 113 + } 114 + } 115 + } 116 + 117 + // recordingMetrics captures InboundMetrics calls for tests that need to 118 + // observe rate-limit rejections without exposing internal state. 
119 + type recordingMetrics struct { 120 + mu sync.Mutex 121 + rejected map[string]int 122 + inbound map[string]int 123 + forward map[string]int 124 + } 125 + 126 + func newRecordingMetrics() *recordingMetrics { 127 + return &recordingMetrics{ 128 + rejected: make(map[string]int), 129 + inbound: make(map[string]int), 130 + forward: make(map[string]int), 131 + } 132 + } 133 + 134 + func (m *recordingMetrics) RecordInbound(c string) { 135 + m.mu.Lock() 136 + m.inbound[c]++ 137 + m.mu.Unlock() 138 + } 139 + func (m *recordingMetrics) RecordForward(s string) { 140 + m.mu.Lock() 141 + m.forward[s]++ 142 + m.mu.Unlock() 143 + } 144 + func (m *recordingMetrics) RecordRejected(r string) { 145 + m.mu.Lock() 146 + m.rejected[r]++ 147 + m.mu.Unlock() 148 + } 149 + func (m *recordingMetrics) rejectedCount(reason string) int { 150 + m.mu.Lock() 151 + defer m.mu.Unlock() 152 + return m.rejected[reason] 153 + } 154 + 155 + // TestInbound_RateLimitRejects421OverWire wires the limiter into a real 156 + // SMTP server bound to 127.0.0.1, sends N+1 messages over the burst, and 157 + // asserts the (N+1)th gets a 421 with the expected enhanced code. 158 + func TestInbound_RateLimitRejects421OverWire(t *testing.T) { 159 + ln, err := net.Listen("tcp", "127.0.0.1:0") 160 + if err != nil { 161 + t.Fatalf("listen: %v", err) 162 + } 163 + addr := ln.Addr().String() 164 + ln.Close() 165 + 166 + memberHash := MemberHashFromDID("did:plc:testmember") 167 + memberLookup := func(ctx context.Context, hash string) (string, bool) { 168 + if hash == memberHash { 169 + return "did:plc:testmember", true 170 + } 171 + return "", false 172 + } 173 + 174 + srv := NewInboundServer(InboundConfig{ 175 + ListenAddr: addr, 176 + Domain: "atmos.email", 177 + // Tight rate so the test runs fast: 60/min = 1/sec, burst=2. 178 + // Three messages in immediate succession exhausts the bucket 179 + // before the next token is minted. 180 + RateLimitMsgsPerMinute: 60, 181 + RateLimitBurst: 2, 182 + }, func(ctx context.Context, did, rcpt, btype, details string) {}, memberLookup) 183 + metrics := newRecordingMetrics() 184 + srv.SetMetrics(metrics) 185 + 186 + go srv.ListenAndServe() 187 + defer srv.Close() 188 + 189 + for i := 0; i < 50; i++ { 190 + c, err := net.DialTimeout("tcp", addr, 100*time.Millisecond) 191 + if err == nil { 192 + c.Close() 193 + break 194 + } 195 + time.Sleep(10 * time.Millisecond) 196 + } 197 + 198 + rcptHash := RecipientHashFromAddr("user@example.com") 199 + verp := fmt.Sprintf("bounces+%s+%s@atmos.email", memberHash, rcptHash) 200 + body := []byte("From: a@b\r\nTo: " + verp + "\r\n\r\nbody\r\n") 201 + 202 + // Each call opens a fresh TCP session; same client IP (127.0.0.1). 203 + // The first two should succeed, the third should get 421. 204 + send := func() error { 205 + return gosmtp.SendMail(addr, nil, "a@b.example", []string{verp}, body) 206 + } 207 + 208 + if err := send(); err != nil { 209 + t.Fatalf("call 1: %v", err) 210 + } 211 + if err := send(); err != nil { 212 + t.Fatalf("call 2: %v", err) 213 + } 214 + err = send() 215 + if err == nil { 216 + t.Fatal("call 3: expected rate-limit error, got success") 217 + } 218 + // net/smtp surfaces the server reply verbatim in err.Error(); 219 + // substring-check the 421 code rather than asserting on a private type. 220 + if !strings.Contains(err.Error(), "421") { 221 + t.Errorf("err = %v, want to contain 421", err) 222 + } 223 + // Metric must have been incremented for the rejection. 
224 + if got := metrics.rejectedCount("rate_limit"); got != 1 { 225 + t.Errorf("rejected[rate_limit] = %d, want 1", got) 226 + } 227 + } 228 + 229 + // TestInbound_RateLimitDisabledByDefault verifies legacy behavior: when 230 + // RateLimitMsgsPerMinute is 0 (default), no rejections occur regardless 231 + // of burst. 232 + func TestInbound_RateLimitDisabledByDefault(t *testing.T) { 233 + ln, err := net.Listen("tcp", "127.0.0.1:0") 234 + if err != nil { 235 + t.Fatalf("listen: %v", err) 236 + } 237 + addr := ln.Addr().String() 238 + ln.Close() 239 + 240 + memberHash := MemberHashFromDID("did:plc:testmember") 241 + memberLookup := func(ctx context.Context, hash string) (string, bool) { 242 + if hash == memberHash { 243 + return "did:plc:testmember", true 244 + } 245 + return "", false 246 + } 247 + var bounces atomic.Int32 248 + onBounce := func(ctx context.Context, did, rcpt, btype, details string) { 249 + bounces.Add(1) 250 + } 251 + 252 + srv := NewInboundServer(InboundConfig{ 253 + ListenAddr: addr, 254 + Domain: "atmos.email", 255 + // Rate-limit explicitly NOT set → disabled. 256 + }, onBounce, memberLookup) 257 + go srv.ListenAndServe() 258 + defer srv.Close() 259 + 260 + for i := 0; i < 50; i++ { 261 + c, err := net.DialTimeout("tcp", addr, 100*time.Millisecond) 262 + if err == nil { 263 + c.Close() 264 + break 265 + } 266 + time.Sleep(10 * time.Millisecond) 267 + } 268 + 269 + rcptHash := RecipientHashFromAddr("user@example.com") 270 + verp := fmt.Sprintf("bounces+%s+%s@atmos.email", memberHash, rcptHash) 271 + dsn := "Content-Type: multipart/report; report-type=delivery-status; boundary=\"b1\"\r\n" + 272 + "From: mailer-daemon@example.com\r\n" + 273 + "To: " + verp + "\r\n" + 274 + "\r\n" + 275 + "--b1\r\n" + 276 + "Content-Type: text/plain\r\n\r\n" + 277 + "failed\r\n--b1\r\n" + 278 + "Content-Type: message/delivery-status\r\n\r\n" + 279 + "Final-Recipient: rfc822; user@example.com\r\nAction: failed\r\nStatus: 5.1.1\r\n\r\n--b1--\r\n" 280 + 281 + for i := 0; i < 10; i++ { 282 + if err := gosmtp.SendMail(addr, nil, "mailer-daemon@example.com", []string{verp}, []byte(dsn)); err != nil { 283 + t.Fatalf("send %d: %v", i+1, err) 284 + } 285 + } 286 + }
+3 -3
internal/relay/labelcheck.go
··· 111 111 verified, err := lc.queryLabeler(ctx, did) 112 112 if err != nil { 113 113 // Fail-closed: if labeler is unreachable and cache is expired, return error 114 - return false, fmt.Errorf("label check failed (fail-closed): %v", err) 114 + return false, fmt.Errorf("label check failed (fail-closed): %w", err) 115 115 } 116 116 117 117 // Update cache ··· 214 214 215 215 resp, err := lc.client.Do(req) 216 216 if err != nil { 217 - return false, fmt.Errorf("labeler request: %v", err) 217 + return false, fmt.Errorf("labeler request: %w", err) 218 218 } 219 219 defer resp.Body.Close() 220 220 ··· 224 224 225 225 var result queryLabelsResponse 226 226 if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&result); err != nil { 227 - return false, fmt.Errorf("decode labeler response: %v", err) 227 + return false, fmt.Errorf("decode labeler response: %w", err) 228 228 } 229 229 230 230 // Check that all required labels are present and not negated
+239
internal/relay/memberhash_cache.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "context" 7 + "sync" 8 + "time" 9 + ) 10 + 11 + // MemberHashCache answers VERP "is this hash a member?" queries from a 12 + // process-local cache. The previous implementation rebuilt the cache from the 13 + // full members table on every miss, so a sender pumping random VERP local 14 + // parts at port 25 could trigger an O(N) full-table scan per inbound message 15 + // and DoS the relay. See #218. 16 + // 17 + // This cache adds two defenses: 18 + // 19 + // 1. Negative cache. A hash that resolved to "no such member" stays 20 + // non-existent for negTTL (default 5 min) — repeat misses for the same 21 + // fake hash become O(1). 22 + // 2. Rebuild rate limit. The positive cache only rebuilds every 23 + // rebuildInterval (default 30 s). Repeated misses for *different* fake 24 + // hashes can no longer trigger a stampede of full-table scans. New 25 + // enrollments are also picked up by a periodic background rebuild 26 + // (PeriodicRebuild) so the rebuild-on-miss path is no longer the only 27 + // freshness mechanism. 28 + type MemberHashCache struct { 29 + mu sync.RWMutex 30 + positive map[string]string // hash → DID 31 + negative map[string]time.Time // hash → expiry 32 + lastRebuild time.Time 33 + 34 + rebuildInterval time.Duration 35 + negTTL time.Duration 36 + maxNeg int 37 + 38 + rebuild func() (map[string]string, error) 39 + now func() time.Time 40 + metrics MemberHashMetrics 41 + } 42 + 43 + // MemberHashMetrics is the narrow metrics surface used by MemberHashCache. 44 + // Implementations record counts to Prometheus; nil-safe in tests. 45 + type MemberHashMetrics interface { 46 + IncMemberHashHit() // positive cache hit 47 + IncMemberHashNegHit() // negative cache hit (DoS short-circuit) 48 + IncMemberHashMiss() // confirmed miss after rebuild 49 + IncMemberHashRebuild() // a rebuild ran 50 + IncMemberHashRebuildSkip() // rebuild rate-limited 51 + SetMemberHashSize(positive, negative int) 52 + } 53 + 54 + // MemberHashCacheConfig configures a MemberHashCache. 55 + type MemberHashCacheConfig struct { 56 + // Rebuild loads the current positive cache from the source of truth. 57 + // Required. 58 + Rebuild func() (map[string]string, error) 59 + // RebuildInterval is the minimum gap between successive rebuilds. Default 30s. 60 + RebuildInterval time.Duration 61 + // NegTTL is how long a hash stays in the negative cache. Default 5min. 62 + NegTTL time.Duration 63 + // MaxNegative caps the negative-cache size. Default 10000. 64 + MaxNegative int 65 + // Now overrides time.Now (for tests). Default time.Now. 66 + Now func() time.Time 67 + // Metrics receives counters/gauges. nil → no-op. 68 + Metrics MemberHashMetrics 69 + } 70 + 71 + // NewMemberHashCache builds a lookup from cfg. It performs the initial 72 + // rebuild synchronously so the cache is warm before the first request. 
73 + func NewMemberHashCache(cfg MemberHashCacheConfig) *MemberHashCache { 74 + if cfg.Rebuild == nil { 75 + panic("MemberHashCache: Rebuild is required") 76 + } 77 + if cfg.RebuildInterval == 0 { 78 + cfg.RebuildInterval = 30 * time.Second 79 + } 80 + if cfg.NegTTL == 0 { 81 + cfg.NegTTL = 5 * time.Minute 82 + } 83 + if cfg.MaxNegative == 0 { 84 + cfg.MaxNegative = 10000 85 + } 86 + if cfg.Now == nil { 87 + cfg.Now = time.Now 88 + } 89 + if cfg.Metrics == nil { 90 + cfg.Metrics = noopMemberHashMetrics{} 91 + } 92 + h := &MemberHashCache{ 93 + positive: map[string]string{}, 94 + negative: map[string]time.Time{}, 95 + rebuildInterval: cfg.RebuildInterval, 96 + negTTL: cfg.NegTTL, 97 + maxNeg: cfg.MaxNegative, 98 + rebuild: cfg.Rebuild, 99 + now: cfg.Now, 100 + metrics: cfg.Metrics, 101 + } 102 + // Initial warm-up — block until the first load completes so we don't 103 + // serve traffic with an empty positive cache. 104 + h.runRebuild(true) 105 + return h 106 + } 107 + 108 + // Lookup returns (DID, true) for known members, ("", false) otherwise. 109 + // Negative-cached misses short-circuit without touching the store. 110 + func (h *MemberHashCache) Lookup(hash string) (string, bool) { 111 + now := h.now() 112 + 113 + h.mu.RLock() 114 + if did, ok := h.positive[hash]; ok { 115 + h.mu.RUnlock() 116 + h.metrics.IncMemberHashHit() 117 + return did, true 118 + } 119 + if exp, ok := h.negative[hash]; ok && now.Before(exp) { 120 + h.mu.RUnlock() 121 + h.metrics.IncMemberHashNegHit() 122 + return "", false 123 + } 124 + mayRebuild := now.Sub(h.lastRebuild) >= h.rebuildInterval 125 + h.mu.RUnlock() 126 + 127 + if mayRebuild { 128 + h.runRebuild(false) 129 + h.mu.RLock() 130 + if did, ok := h.positive[hash]; ok { 131 + h.mu.RUnlock() 132 + h.metrics.IncMemberHashHit() 133 + return did, true 134 + } 135 + h.mu.RUnlock() 136 + } 137 + 138 + h.recordMiss(hash, now) 139 + h.metrics.IncMemberHashMiss() 140 + return "", false 141 + } 142 + 143 + // runRebuild reloads the positive cache. force=true bypasses the interval 144 + // gate (used at construction). When the gate fires, the rebuild is skipped. 145 + func (h *MemberHashCache) runRebuild(force bool) { 146 + h.mu.Lock() 147 + if !force && h.now().Sub(h.lastRebuild) < h.rebuildInterval { 148 + h.mu.Unlock() 149 + h.metrics.IncMemberHashRebuildSkip() 150 + return 151 + } 152 + h.lastRebuild = h.now() 153 + h.mu.Unlock() 154 + 155 + newMap, err := h.rebuild() 156 + if err != nil { 157 + // Keep the old positive map; a transient store error shouldn't 158 + // blow away cached members. The next interval will retry. 159 + return 160 + } 161 + 162 + h.mu.Lock() 163 + h.positive = newMap 164 + // Drop negative entries that are now positive — happens when a member 165 + // enrolls between our last rebuild and now. 166 + for hash := range newMap { 167 + delete(h.negative, hash) 168 + } 169 + posLen, negLen := len(h.positive), len(h.negative) 170 + h.mu.Unlock() 171 + 172 + h.metrics.IncMemberHashRebuild() 173 + h.metrics.SetMemberHashSize(posLen, negLen) 174 + } 175 + 176 + // recordMiss inserts a negative-cache entry, evicting if at capacity. 177 + func (h *MemberHashCache) recordMiss(hash string, now time.Time) { 178 + h.mu.Lock() 179 + defer h.mu.Unlock() 180 + 181 + if len(h.negative) >= h.maxNeg { 182 + // First sweep: drop expired entries. 183 + for k, exp := range h.negative { 184 + if !exp.After(now) { 185 + delete(h.negative, k) 186 + } 187 + } 188 + // Still full? Drop ~10% via Go's randomized map iteration. 
Not a 189 + // perfect LRU but the negative cache is purely an optimization — 190 + // any eviction simply means the next miss for that hash takes the 191 + // rebuild-rate-limited slow path. 192 + if len(h.negative) >= h.maxNeg { 193 + toDrop := h.maxNeg / 10 194 + if toDrop < 1 { 195 + toDrop = 1 196 + } 197 + for k := range h.negative { 198 + delete(h.negative, k) 199 + toDrop-- 200 + if toDrop <= 0 { 201 + break 202 + } 203 + } 204 + } 205 + } 206 + h.negative[hash] = now.Add(h.negTTL) 207 + h.metrics.SetMemberHashSize(len(h.positive), len(h.negative)) 208 + } 209 + 210 + // PeriodicRebuild runs in a goroutine and rebuilds the positive cache on 211 + // each tick, picking up newly enrolled members without waiting for a miss. 212 + func (h *MemberHashCache) PeriodicRebuild(ctx context.Context, interval time.Duration) { 213 + ticker := time.NewTicker(interval) 214 + defer ticker.Stop() 215 + for { 216 + select { 217 + case <-ctx.Done(): 218 + return 219 + case <-ticker.C: 220 + h.runRebuild(true) 221 + } 222 + } 223 + } 224 + 225 + // Sizes returns (positive, negative) counts. Test/debug helper. 226 + func (h *MemberHashCache) Sizes() (positive, negative int) { 227 + h.mu.RLock() 228 + defer h.mu.RUnlock() 229 + return len(h.positive), len(h.negative) 230 + } 231 + 232 + type noopMemberHashMetrics struct{} 233 + 234 + func (noopMemberHashMetrics) IncMemberHashHit() {} 235 + func (noopMemberHashMetrics) IncMemberHashNegHit() {} 236 + func (noopMemberHashMetrics) IncMemberHashMiss() {} 237 + func (noopMemberHashMetrics) IncMemberHashRebuild() {} 238 + func (noopMemberHashMetrics) IncMemberHashRebuildSkip() {} 239 + func (noopMemberHashMetrics) SetMemberHashSize(_ int, _ int) {}
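A sketch of how the cache slots in as the inbound server's memberLookup, assuming only the constructor, Lookup, and PeriodicRebuild surface above. The store query, hash values, and import path are placeholders.

package main

import (
	"context"
	"time"

	"example.com/atmos/internal/relay" // assumed module path
)

// listMembers stands in for the real store query the Rebuild callback
// would wrap: hash -> DID for every enrolled member.
func listMembers() (map[string]string, error) {
	return map[string]string{
		"3f9ac2": "did:plc:exampleaaa", // illustrative values
	}, nil
}

func main() {
	cache := relay.NewMemberHashCache(relay.MemberHashCacheConfig{
		Rebuild:         listMembers,
		RebuildInterval: 30 * time.Second,
		NegTTL:          5 * time.Minute,
	})
	// Pick up new enrollments without waiting for a VERP miss.
	go cache.PeriodicRebuild(context.Background(), time.Minute)

	// Adapter matching the memberLookup signature the inbound server
	// takes in this diff: func(ctx, hash) (did, ok).
	memberLookup := func(ctx context.Context, hash string) (string, bool) {
		return cache.Lookup(hash)
	}
	_ = memberLookup // wire into NewInboundServer(cfg, onBounce, memberLookup)
}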
+255
internal/relay/memberhash_cache_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "errors" 7 + "sync" 8 + "sync/atomic" 9 + "testing" 10 + "time" 11 + ) 12 + 13 + type fakeClock struct { 14 + mu sync.Mutex 15 + t time.Time 16 + } 17 + 18 + func (c *fakeClock) Now() time.Time { c.mu.Lock(); defer c.mu.Unlock(); return c.t } 19 + func (c *fakeClock) Advance(d time.Duration) { 20 + c.mu.Lock() 21 + defer c.mu.Unlock() 22 + c.t = c.t.Add(d) 23 + } 24 + 25 + type memberHashCacheMetrics struct { 26 + hit, neg, miss, rebuild, rebuildSkip atomic.Int64 27 + } 28 + 29 + func (m *memberHashCacheMetrics) IncMemberHashHit() { m.hit.Add(1) } 30 + func (m *memberHashCacheMetrics) IncMemberHashNegHit() { m.neg.Add(1) } 31 + func (m *memberHashCacheMetrics) IncMemberHashMiss() { m.miss.Add(1) } 32 + func (m *memberHashCacheMetrics) IncMemberHashRebuild() { m.rebuild.Add(1) } 33 + func (m *memberHashCacheMetrics) IncMemberHashRebuildSkip(){ m.rebuildSkip.Add(1) } 34 + func (m *memberHashCacheMetrics) SetMemberHashSize(_ int, _ int) {} 35 + 36 + func newCacheForTest(t *testing.T, members map[string]string, clock *fakeClock) (*MemberHashCache, *atomic.Int64, *memberHashCacheMetrics) { 37 + t.Helper() 38 + var rebuildCalls atomic.Int64 39 + mu := sync.Mutex{} 40 + current := members 41 + rebuild := func() (map[string]string, error) { 42 + rebuildCalls.Add(1) 43 + mu.Lock() 44 + defer mu.Unlock() 45 + out := make(map[string]string, len(current)) 46 + for k, v := range current { 47 + out[k] = v 48 + } 49 + return out, nil 50 + } 51 + mx := &memberHashCacheMetrics{} 52 + h := NewMemberHashCache(MemberHashCacheConfig{ 53 + Rebuild: rebuild, 54 + RebuildInterval: 30 * time.Second, 55 + NegTTL: 5 * time.Minute, 56 + MaxNegative: 100, 57 + Now: clock.Now, 58 + Metrics: mx, 59 + }) 60 + return h, &rebuildCalls, mx 61 + } 62 + 63 + func TestMemberHashCache_Hit(t *testing.T) { 64 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 65 + h, _, mx := newCacheForTest(t, map[string]string{"hash-a": "did:plc:aaa"}, clock) 66 + 67 + did, ok := h.Lookup("hash-a") 68 + if !ok || did != "did:plc:aaa" { 69 + t.Fatalf("got (%q,%v), want (did:plc:aaa,true)", did, ok) 70 + } 71 + if mx.hit.Load() != 1 { 72 + t.Errorf("hit counter=%d, want 1", mx.hit.Load()) 73 + } 74 + } 75 + 76 + func TestMemberHashCache_NegativeCacheShortCircuits(t *testing.T) { 77 + // The DoS regression fix: a fake hash should NOT trigger a full-table 78 + // rebuild on every subsequent miss within the negative TTL. 79 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 80 + h, rebuildCalls, mx := newCacheForTest(t, map[string]string{}, clock) 81 + 82 + initialRebuilds := rebuildCalls.Load() // 1 from constructor 83 + 84 + // First miss: triggers a rebuild (the bucket allowed it because 85 + // lastRebuild is "now" but constructor used force=true so let's wait 86 + // past the interval first). 87 + clock.Advance(31 * time.Second) 88 + if _, ok := h.Lookup("attacker-fake-1"); ok { 89 + t.Fatal("unexpected hit for fake hash") 90 + } 91 + if mx.miss.Load() != 1 { 92 + t.Errorf("miss counter=%d, want 1", mx.miss.Load()) 93 + } 94 + if rebuildCalls.Load() != initialRebuilds+1 { 95 + t.Errorf("rebuild calls=%d, want %d (one rebuild after interval)", rebuildCalls.Load(), initialRebuilds+1) 96 + } 97 + 98 + // 1000 follow-up lookups for the SAME fake hash within the negTTL must 99 + // short-circuit on the negative cache. 
100 + for i := 0; i < 1000; i++ { 101 + if _, ok := h.Lookup("attacker-fake-1"); ok { 102 + t.Fatal("unexpected hit for fake hash on repeat lookup") 103 + } 104 + } 105 + if mx.neg.Load() != 1000 { 106 + t.Errorf("negative-cache hits=%d, want 1000", mx.neg.Load()) 107 + } 108 + if got := rebuildCalls.Load(); got != initialRebuilds+1 { 109 + t.Errorf("rebuild fired during negative-cached lookups: got=%d, want %d", got, initialRebuilds+1) 110 + } 111 + } 112 + 113 + func TestMemberHashCache_RebuildIsRateLimited(t *testing.T) { 114 + // 100 distinct fake hashes within a single 30s window should produce at 115 + // most 1 rebuild — the rate limit prevents stampede. 116 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 117 + h, rebuildCalls, _ := newCacheForTest(t, map[string]string{}, clock) 118 + initial := rebuildCalls.Load() 119 + 120 + // First lookup post-construction has lastRebuild = now, so within 121 + // interval — rebuild should be skipped on the first call too. 122 + for i := 0; i < 100; i++ { 123 + hash := "fake-" + string(rune('A'+i%26)) + string(rune('0'+i/26)) 124 + h.Lookup(hash) 125 + } 126 + 127 + rebuilds := rebuildCalls.Load() - initial 128 + if rebuilds > 1 { 129 + t.Errorf("rebuild fired %d times within interval, want ≤1", rebuilds) 130 + } 131 + } 132 + 133 + func TestMemberHashCache_NegativeTTLExpiresAndRebuildAdmitsNewMember(t *testing.T) { 134 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 135 + members := map[string]string{} 136 + mu := sync.Mutex{} 137 + rebuild := func() (map[string]string, error) { 138 + mu.Lock() 139 + defer mu.Unlock() 140 + out := map[string]string{} 141 + for k, v := range members { 142 + out[k] = v 143 + } 144 + return out, nil 145 + } 146 + h := NewMemberHashCache(MemberHashCacheConfig{ 147 + Rebuild: rebuild, 148 + RebuildInterval: 30 * time.Second, 149 + NegTTL: 5 * time.Minute, 150 + MaxNegative: 100, 151 + Now: clock.Now, 152 + }) 153 + 154 + // 1) Member doesn't exist yet — first lookup misses + caches negatively. 155 + clock.Advance(31 * time.Second) 156 + if _, ok := h.Lookup("hash-late"); ok { 157 + t.Fatal("unexpected hit before enrollment") 158 + } 159 + 160 + // 2) Member enrolls (in the source) and the negative cache is still hot. 161 + mu.Lock() 162 + members["hash-late"] = "did:plc:late" 163 + mu.Unlock() 164 + 165 + // Within negTTL but past rebuildInterval: lookup should hit the 166 + // negative cache and NOT see the new member yet. 167 + clock.Advance(31 * time.Second) 168 + if _, ok := h.Lookup("hash-late"); ok { 169 + t.Error("negative cache failed to short-circuit during TTL") 170 + } 171 + 172 + // 3) After negTTL expires AND rebuild interval has passed, the lookup 173 + // should rebuild and admit the new member. 174 + clock.Advance(6 * time.Minute) // past 5min negTTL 175 + did, ok := h.Lookup("hash-late") 176 + if !ok || did != "did:plc:late" { 177 + t.Errorf("late enrollment not picked up: did=%q ok=%v", did, ok) 178 + } 179 + } 180 + 181 + func TestMemberHashCache_RebuildErrorPreservesPositiveCache(t *testing.T) { 182 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 183 + rebuild := func() (map[string]string, error) { 184 + return nil, errors.New("transient db error") 185 + } 186 + // Construct with a working rebuild first, so we have something cached. 
187 + cfg := MemberHashCacheConfig{ 188 + Rebuild: func() (map[string]string, error) { 189 + return map[string]string{"hash-a": "did:plc:aaa"}, nil 190 + }, 191 + RebuildInterval: 30 * time.Second, 192 + NegTTL: 5 * time.Minute, 193 + MaxNegative: 100, 194 + Now: clock.Now, 195 + } 196 + h := NewMemberHashCache(cfg) 197 + 198 + // Swap in a failing rebuild and force it to run. 199 + h.rebuild = rebuild 200 + clock.Advance(31 * time.Second) 201 + h.runRebuild(true) 202 + 203 + // Positive cache must still work despite the rebuild error. 204 + did, ok := h.Lookup("hash-a") 205 + if !ok || did != "did:plc:aaa" { 206 + t.Errorf("positive cache lost on rebuild error: did=%q ok=%v", did, ok) 207 + } 208 + } 209 + 210 + func TestMemberHashCache_NegativeCapEvictsExpiredFirst(t *testing.T) { 211 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 212 + rebuild := func() (map[string]string, error) { return map[string]string{}, nil } 213 + h := NewMemberHashCache(MemberHashCacheConfig{ 214 + Rebuild: rebuild, 215 + RebuildInterval: 30 * time.Second, 216 + NegTTL: 1 * time.Minute, 217 + MaxNegative: 10, 218 + Now: clock.Now, 219 + }) 220 + 221 + // Fill negative cache with 10 entries. 222 + clock.Advance(31 * time.Second) 223 + for i := 0; i < 10; i++ { 224 + h.Lookup("fake-" + string(rune('A'+i))) 225 + } 226 + _, neg := h.Sizes() 227 + if neg != 10 { 228 + t.Fatalf("negative size=%d after fill, want 10", neg) 229 + } 230 + 231 + // All entries expire. 232 + clock.Advance(2 * time.Minute) 233 + 234 + // Inserting one more should sweep expired and end up well below cap. 235 + h.Lookup("fake-NEW") 236 + _, neg = h.Sizes() 237 + if neg > 1 { 238 + t.Errorf("expired entries not swept: negative size=%d, want 1", neg) 239 + } 240 + } 241 + 242 + func TestMemberHashCache_HitDoesNotTriggerRebuild(t *testing.T) { 243 + clock := &fakeClock{t: time.Unix(1_700_000_000, 0)} 244 + h, rebuildCalls, _ := newCacheForTest(t, map[string]string{"hash-a": "did:plc:aaa"}, clock) 245 + initial := rebuildCalls.Load() 246 + 247 + // 1000 hits should never re-rebuild. 248 + clock.Advance(10 * time.Minute) 249 + for i := 0; i < 1000; i++ { 250 + h.Lookup("hash-a") 251 + } 252 + if rebuildCalls.Load() != initial { 253 + t.Errorf("rebuild fired during pure hit traffic: %d→%d", initial, rebuildCalls.Load()) 254 + } 255 + }
+230 -4
internal/relay/metrics.go
··· 21 21 BouncesTotal *prometheus.CounterVec // type: hard, soft 22 22 AuthAttempts *prometheus.CounterVec // result: success, failure 23 23 RateLimitHits *prometheus.CounterVec // limit_type: hourly, daily, global 24 + OrphanDeliveries *prometheus.CounterVec // status: sent, bounced — delivery callbacks for missing DB rows (#208) 25 + OrphanReconciled prometheus.Counter // status=queued rows the janitor closed because no spool file exists (#208) 26 + GoroutineCrashes *prometheus.CounterVec // name — recovered panics in background goroutines (#209) 27 + 28 + // Multi-recipient SMTP DATA outcomes (#226). When a single DATA fans out 29 + // to N recipients and a subset fail to enqueue, we accept the DATA (250) 30 + // to avoid duplicating the successful recipients on client retry, and 31 + // instead surface the failures here. 32 + PartialDeliveries prometheus.Counter // DATA accepted with at least one recipient failed 33 + PartialDeliveryRecipients *prometheus.CounterVec // outcome: succeeded, failed — per-recipient counts inside a partial-delivery DATA 34 + 35 + // Member-hash cache (#218). Negative cache + rebuild rate-limit defend 36 + // against random-VERP DoS at port 25. 37 + MemberHashLookups *prometheus.CounterVec // outcome: hit, neg_hit, miss 38 + MemberHashRebuilds *prometheus.CounterVec // outcome: ran, skipped 39 + MemberHashCacheSize *prometheus.GaugeVec // kind: positive, negative 24 40 25 41 // HTTP request tracking 26 42 HTTPRequestsTotal *prometheus.CounterVec // host, method, path, status ··· 35 51 LabelerReachable prometheus.Gauge 36 52 OspreyReachable prometheus.Gauge 37 53 54 + // SQLite connection-pool observability (#210). Gauges sampled 55 + // from sql.DB.Stats() periodically; counters incremented when a 56 + // returned error matches the SQLITE_BUSY/locked signature. 
57 + SQLiteOpenConnections prometheus.Gauge 58 + SQLiteInUse prometheus.Gauge 59 + SQLiteIdle prometheus.Gauge 60 + SQLiteWaitCount prometheus.Gauge // cumulative since process start 61 + SQLiteWaitDurationSec prometheus.Gauge // cumulative seconds since process start 62 + SQLiteBusyErrors *prometheus.CounterVec // op: insert, update, query, exec — best-effort classification at hot writers 63 + 38 64 // Osprey enforcement counters 39 65 OspreyChecksTotal *prometheus.CounterVec // result: allowed, blocked 40 66 41 67 // Osprey event emission counters 42 - OspreyEventsEmitted *prometheus.CounterVec // event_type 43 - OspreyEventsFailed *prometheus.CounterVec // event_type 68 + OspreyEventsEmitted *prometheus.CounterVec // event_type 69 + OspreyEventsFailed *prometheus.CounterVec // event_type 70 + OspreyEventsSpooled *prometheus.CounterVec // event_type — events landed in the on-disk DLQ (#214) 71 + OspreyEventsReplayed *prometheus.CounterVec // event_type — DLQ entries that finally reached the broker (#214) 72 + OspreyEventsDropped *prometheus.CounterVec // reason — permanent loss (overflow, corrupt) (#214) 73 + OspreyDisabled prometheus.Gauge // 1 when the emitter is Noop (Kafka misconfigured), 0 when active (#214) 74 + OspreySpoolDepth prometheus.Gauge // current DLQ size (#214) 75 + OspreyColdCacheDecisions *prometheus.CounterVec // decision: allowed, denied — fires when Osprey is unreachable AND no cache entry (#215) 44 76 45 77 // FBL/ARF complaint tracking 46 78 ComplaintsTotal *prometheus.CounterVec // feedback_type, provider ··· 55 87 // Inbound mail classification + forwarding (Phase 1b) 56 88 InboundMessages *prometheus.CounterVec // classification: verp_bounce, srs_bounce, reply, postmaster 57 89 RepliesForwarded *prometheus.CounterVec // status: sent, failed 90 + InboundRejected *prometheus.CounterVec // reason: rate_limit 58 91 59 92 // Osprey events consumer health 60 93 EventsConsumerLastIngestTimestamp prometheus.Gauge // Unix timestamp of last successful consume ··· 80 113 Name: "atmosphere_relay_delivery_attempts_total", 81 114 Help: "Total delivery attempts, by outcome.", 82 115 }, []string{"status"}), 116 + OrphanDeliveries: prometheus.NewCounterVec(prometheus.CounterOpts{ 117 + Name: "atmosphere_relay_orphan_deliveries_total", 118 + Help: "Delivery callbacks for spool entries with no backing messages row (#208).", 119 + }, []string{"status"}), 120 + OrphanReconciled: prometheus.NewCounter(prometheus.CounterOpts{ 121 + Name: "atmosphere_relay_orphan_reconciled_total", 122 + Help: "Queued message rows closed by the orphan-reconciliation janitor because no spool file exists (#208).", 123 + }), 124 + GoroutineCrashes: prometheus.NewCounterVec(prometheus.CounterOpts{ 125 + Name: "atmosphere_relay_goroutine_crashes_total", 126 + Help: "Background goroutine panics recovered by GoSafe, by goroutine name (#209).", 127 + }, []string{"name"}), 128 + PartialDeliveries: prometheus.NewCounter(prometheus.CounterOpts{ 129 + Name: "atmosphere_relay_partial_deliveries_total", 130 + Help: "Multi-RCPT DATA messages accepted with at least one recipient failing to enqueue (#226).", 131 + }), 132 + PartialDeliveryRecipients: prometheus.NewCounterVec(prometheus.CounterOpts{ 133 + Name: "atmosphere_relay_partial_delivery_recipients_total", 134 + Help: "Per-recipient outcomes inside multi-RCPT DATA messages, by outcome (#226).", 135 + }, []string{"outcome"}), 136 + MemberHashLookups: prometheus.NewCounterVec(prometheus.CounterOpts{ 137 + Name: "atmosphere_relay_member_hash_lookups_total", 138 + 
Help: "Inbound VERP member-hash lookups, by outcome (#218).", 139 + }, []string{"outcome"}), 140 + MemberHashRebuilds: prometheus.NewCounterVec(prometheus.CounterOpts{ 141 + Name: "atmosphere_relay_member_hash_rebuilds_total", 142 + Help: "Member-hash cache rebuilds, by outcome (#218).", 143 + }, []string{"outcome"}), 144 + MemberHashCacheSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 145 + Name: "atmosphere_relay_member_hash_cache_size", 146 + Help: "Member-hash cache size, by kind (positive=enrolled members, negative=cached misses) (#218).", 147 + }, []string{"kind"}), 148 + SQLiteOpenConnections: prometheus.NewGauge(prometheus.GaugeOpts{ 149 + Name: "atmosphere_relay_sqlite_open_connections", 150 + Help: "sql.DB.Stats().OpenConnections — total connections open to SQLite (#210).", 151 + }), 152 + SQLiteInUse: prometheus.NewGauge(prometheus.GaugeOpts{ 153 + Name: "atmosphere_relay_sqlite_in_use", 154 + Help: "sql.DB.Stats().InUse — connections currently checked out and busy executing a query (#210).", 155 + }), 156 + SQLiteIdle: prometheus.NewGauge(prometheus.GaugeOpts{ 157 + Name: "atmosphere_relay_sqlite_idle", 158 + Help: "sql.DB.Stats().Idle — connections currently idle in the pool (#210).", 159 + }), 160 + SQLiteWaitCount: prometheus.NewGauge(prometheus.GaugeOpts{ 161 + Name: "atmosphere_relay_sqlite_wait_count", 162 + Help: "sql.DB.Stats().WaitCount — cumulative number of connections that had to wait for a free slot (#210).", 163 + }), 164 + SQLiteWaitDurationSec: prometheus.NewGauge(prometheus.GaugeOpts{ 165 + Name: "atmosphere_relay_sqlite_wait_duration_seconds", 166 + Help: "sql.DB.Stats().WaitDuration — cumulative seconds waited for a free connection (#210).", 167 + }), 168 + SQLiteBusyErrors: prometheus.NewCounterVec(prometheus.CounterOpts{ 169 + Name: "atmosphere_relay_sqlite_busy_errors_total", 170 + Help: "SQLite errors classified as SQLITE_BUSY/locked at hot-path writers (#210).", 171 + }, []string{"op"}), 83 172 BouncesTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ 84 173 Name: "atmosphere_relay_bounces_total", 85 174 Help: "Total bounces received, by type.", ··· 125 214 Name: "atmosphere_relay_osprey_checks_total", 126 215 Help: "Osprey enforcement checks, by result.", 127 216 }, []string{"result"}), 217 + OspreyEventsSpooled: prometheus.NewCounterVec(prometheus.CounterOpts{ 218 + Name: "atmosphere_relay_osprey_events_spooled_total", 219 + Help: "Osprey events that failed to reach Kafka and were spooled to disk for replay (#214).", 220 + }, []string{"event_type"}), 221 + OspreyEventsReplayed: prometheus.NewCounterVec(prometheus.CounterOpts{ 222 + Name: "atmosphere_relay_osprey_events_replayed_total", 223 + Help: "Osprey events drained from the on-disk DLQ back to Kafka (#214).", 224 + }, []string{"event_type"}), 225 + OspreyEventsDropped: prometheus.NewCounterVec(prometheus.CounterOpts{ 226 + Name: "atmosphere_relay_osprey_events_dropped_total", 227 + Help: "Osprey events permanently lost (DLQ overflow, corrupt entries) (#214).", 228 + }, []string{"reason"}), 229 + OspreyDisabled: prometheus.NewGauge(prometheus.GaugeOpts{ 230 + Name: "atmosphere_relay_osprey_disabled", 231 + Help: "1 if the Osprey emitter is configured as Noop (Kafka broker missing); 0 if active (#214).", 232 + }), 233 + OspreySpoolDepth: prometheus.NewGauge(prometheus.GaugeOpts{ 234 + Name: "atmosphere_relay_osprey_spool_depth", 235 + Help: "Number of events currently sitting in the Osprey on-disk DLQ awaiting replay (#214).", 236 + }), 237 + OspreyColdCacheDecisions: 
prometheus.NewCounterVec(prometheus.CounterOpts{ 238 + Name: "atmosphere_relay_osprey_cold_cache_decisions_total", 239 + Help: "Cold-cache+unreachable enforcer decisions, by outcome (denied=fail-closed, allowed=fail-open) (#215).", 240 + }, []string{"decision"}), 128 241 OspreyEventsEmitted: prometheus.NewCounterVec(prometheus.CounterOpts{ 129 242 Name: "atmosphere_relay_osprey_events_emitted_total", 130 243 Help: "Osprey events confirmed by Kafka broker (post-Completion), by event type.", ··· 153 266 Name: "atmosphere_relay_replies_forwarded_total", 154 267 Help: "Outcome of reply-forwarding attempts, by status.", 155 268 }, []string{"status"}), 269 + InboundRejected: prometheus.NewCounterVec(prometheus.CounterOpts{ 270 + Name: "atmosphere_relay_inbound_rejected_total", 271 + Help: "Inbound SMTP sessions rejected before classification, by reason.", 272 + }, []string{"reason"}), 156 273 EventsConsumerLastIngestTimestamp: prometheus.NewGauge(prometheus.GaugeOpts{ 157 274 Name: "atmosphere_relay_events_consumer_last_ingest_timestamp_seconds", 158 275 Help: "Unix timestamp of the last successfully consumed Osprey event.", ··· 173 290 m.MessagesSent, 174 291 m.DeliveryAttempts, 175 292 m.BouncesTotal, 293 + m.OrphanDeliveries, 294 + m.OrphanReconciled, 295 + m.GoroutineCrashes, 296 + m.PartialDeliveries, 297 + m.PartialDeliveryRecipients, 298 + m.MemberHashLookups, 299 + m.MemberHashRebuilds, 300 + m.MemberHashCacheSize, 301 + m.SQLiteOpenConnections, 302 + m.SQLiteInUse, 303 + m.SQLiteIdle, 304 + m.SQLiteWaitCount, 305 + m.SQLiteWaitDurationSec, 306 + m.SQLiteBusyErrors, 176 307 m.AuthAttempts, 177 308 m.RateLimitHits, 178 309 m.DeliveryQueueDepth, ··· 182 313 m.OspreyChecksTotal, 183 314 m.OspreyEventsEmitted, 184 315 m.OspreyEventsFailed, 316 + m.OspreyEventsSpooled, 317 + m.OspreyEventsReplayed, 318 + m.OspreyEventsDropped, 319 + m.OspreyDisabled, 320 + m.OspreySpoolDepth, 321 + m.OspreyColdCacheDecisions, 185 322 m.ComplaintsTotal, 186 323 m.InboundMessages, 187 324 m.RepliesForwarded, 325 + m.InboundRejected, 188 326 m.HTTPRequestsTotal, 189 327 m.HTTPRequestDuration, 190 328 m.EnrollFunnel, ··· 204 342 m.MessagesRejected.WithLabelValues("osprey_suspended") 205 343 m.MessagesRejected.WithLabelValues("suppressed") 206 344 m.MessagesRejected.WithLabelValues("smuggling_guard") 345 + m.MessagesRejected.WithLabelValues("delivery_failed") 346 + m.PartialDeliveryRecipients.WithLabelValues("succeeded") 347 + m.PartialDeliveryRecipients.WithLabelValues("failed") 348 + m.MemberHashLookups.WithLabelValues("hit") 349 + m.MemberHashLookups.WithLabelValues("neg_hit") 350 + m.MemberHashLookups.WithLabelValues("miss") 351 + m.MemberHashRebuilds.WithLabelValues("ran") 352 + m.MemberHashRebuilds.WithLabelValues("skipped") 353 + m.MemberHashCacheSize.WithLabelValues("positive") 354 + m.MemberHashCacheSize.WithLabelValues("negative") 207 355 m.DeliveryAttempts.WithLabelValues("sent") 208 356 m.DeliveryAttempts.WithLabelValues("bounced") 209 357 m.DeliveryAttempts.WithLabelValues("deferred") ··· 290 438 m.RepliesForwarded.WithLabelValues(status).Inc() 291 439 } 292 440 441 + // IncGoroutineCrash implements relay.PanicRecorder. Used by GoSafe 442 + // to count recovered panics by named goroutine (#209). 443 + func (m *Metrics) IncGoroutineCrash(name string) { 444 + m.GoroutineCrashes.WithLabelValues(name).Inc() 445 + } 446 + 447 + // IncBusyError implements relaystore.BusyRecorder. Counts SQLITE_BUSY 448 + // errors that escape the busy_timeout PRAGMA at hot-path writers (#210). 
449 + func (m *Metrics) IncBusyError(op string) { 450 + m.SQLiteBusyErrors.WithLabelValues(op).Inc() 451 + } 452 + 453 + // IncColdCacheDecision implements relay.ColdCacheRecorder. Counts 454 + // fail-open vs fail-closed enforcer decisions when Osprey is 455 + // unreachable AND the labelcheck cache has no entry for the DID (#215). 456 + func (m *Metrics) IncColdCacheDecision(decision string) { 457 + m.OspreyColdCacheDecisions.WithLabelValues(decision).Inc() 458 + } 459 + 460 + // IncMemberHashHit / IncMemberHashNegHit / IncMemberHashMiss / 461 + // IncMemberHashRebuild / IncMemberHashRebuildSkip / SetMemberHashSize 462 + // implement relay.MemberHashMetrics on *Metrics so the inbound member-hash 463 + // cache (#218) can record without needing a separate adapter type. 464 + func (m *Metrics) IncMemberHashHit() { m.MemberHashLookups.WithLabelValues("hit").Inc() } 465 + func (m *Metrics) IncMemberHashNegHit() { m.MemberHashLookups.WithLabelValues("neg_hit").Inc() } 466 + func (m *Metrics) IncMemberHashMiss() { m.MemberHashLookups.WithLabelValues("miss").Inc() } 467 + func (m *Metrics) IncMemberHashRebuild() { m.MemberHashRebuilds.WithLabelValues("ran").Inc() } 468 + func (m *Metrics) IncMemberHashRebuildSkip(){ m.MemberHashRebuilds.WithLabelValues("skipped").Inc() } 469 + func (m *Metrics) SetMemberHashSize(positive, negative int) { 470 + m.MemberHashCacheSize.WithLabelValues("positive").Set(float64(positive)) 471 + m.MemberHashCacheSize.WithLabelValues("negative").Set(float64(negative)) 472 + } 473 + 474 + // SetSQLiteStats updates the SQLite pool gauges from a snapshot 475 + // taken via relaystore.Store.SampleStats(). Decoupled from 476 + // *sql.DB so the metrics package doesn't take a database/sql 477 + // dependency. 478 + func (m *Metrics) SetSQLiteStats(open, inUse, idle int, waitCount int64, waitDurationSec float64) { 479 + m.SQLiteOpenConnections.Set(float64(open)) 480 + m.SQLiteInUse.Set(float64(inUse)) 481 + m.SQLiteIdle.Set(float64(idle)) 482 + m.SQLiteWaitCount.Set(float64(waitCount)) 483 + m.SQLiteWaitDurationSec.Set(waitDurationSec) 484 + } 485 + 486 + // RecordRejected implements relay.InboundMetrics. 487 + func (m *Metrics) RecordRejected(reason string) { 488 + m.InboundRejected.WithLabelValues(reason).Inc() 489 + } 490 + 293 491 // EmitterMetricsAdapter bridges relay.Metrics to the osprey.EmitterMetrics interface. 
294 492 type EmitterMetricsAdapter struct { 295 - Emitted *prometheus.CounterVec 296 - Failed *prometheus.CounterVec 493 + Emitted *prometheus.CounterVec // event_type 494 + Failed *prometheus.CounterVec // event_type 495 + Spooled *prometheus.CounterVec // event_type — fired when an event lands in the on-disk DLQ (#214) 496 + Replayed *prometheus.CounterVec // event_type — fired when a spooled event finally reaches the broker (#214) 497 + Dropped *prometheus.CounterVec // reason — fired on permanent loss (overflow, corrupt) (#214) 498 + SpoolDepth prometheus.Gauge // current spool size (#214) 297 499 } 298 500 299 501 func (a *EmitterMetricsAdapter) IncEmitted(eventType string) { ··· 302 504 303 505 func (a *EmitterMetricsAdapter) IncFailed(eventType string) { 304 506 a.Failed.WithLabelValues(eventType).Inc() 507 + } 508 + 509 + func (a *EmitterMetricsAdapter) IncSpooled(eventType string) { 510 + if a.Spooled != nil { 511 + a.Spooled.WithLabelValues(eventType).Inc() 512 + } 513 + } 514 + 515 + func (a *EmitterMetricsAdapter) IncReplayed(eventType string) { 516 + if a.Replayed != nil { 517 + a.Replayed.WithLabelValues(eventType).Inc() 518 + } 519 + } 520 + 521 + func (a *EmitterMetricsAdapter) IncDropped(reason string) { 522 + if a.Dropped != nil { 523 + a.Dropped.WithLabelValues(reason).Inc() 524 + } 525 + } 526 + 527 + func (a *EmitterMetricsAdapter) SetSpoolDepth(n int) { 528 + if a.SpoolDepth != nil { 529 + a.SpoolDepth.Set(float64(n)) 530 + } 305 531 } 306 532 307 533 // HTTPMiddleware wraps an http.Handler to record request count and duration.
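Note: SetSQLiteStats only closes the loop once something samples the pool on a timer. A minimal sketch of that sampler, assuming it sits in internal/relay and reads *sql.DB directly; the repo's comment points at relaystore.Store.SampleStats(), whose signature is not shown in this hunk, so the direct Stats() call and the 15-second interval below are illustrative assumptions, not the repo's wiring.

import (
	"context"
	"database/sql"
	"time"
)

// Sketch only: push sql.DB pool stats into the gauges on a fixed interval.
// In the repo this presumably runs as one of the GoSafe background loops.
func sampleSQLitePool(ctx context.Context, db *sql.DB, m *Metrics) {
	t := time.NewTicker(15 * time.Second)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			s := db.Stats()
			m.SetSQLiteStats(s.OpenConnections, s.InUse, s.Idle,
				s.WaitCount, s.WaitDuration.Seconds())
		}
	}
}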
+176 -9
internal/relay/ospreyenforce.go
··· 5 5 import ( 6 6 "context" 7 7 "encoding/json" 8 + "errors" 8 9 "fmt" 9 10 "io" 10 11 "log" 11 12 "net/http" 12 13 "net/url" 14 + "os" 13 15 "strings" 14 16 "sync" 15 17 "time" ··· 44 46 cache map[string]*ospreyEntry 45 47 46 48 flight singleflight.Group 49 + 50 + // failClosedOnColdCache, when true (default), rejects sends with 51 + // an error when Osprey is unreachable AND no cached entry exists. 52 + // Without this, a relay restart followed by an Osprey outage 53 + // allows every new DID to send unsuspended for the duration of 54 + // the outage — even DIDs Osprey would have flagged on a healthy 55 + // query. Closes #215. 56 + failClosedOnColdCache bool 57 + 58 + // coldCacheRecorder counts fail-open vs fail-closed decisions on 59 + // cold cache + Osprey unreachable so operators can graph how 60 + // often the dangerous branch fires. Optional. 61 + coldCacheRecorder ColdCacheRecorder 62 + 63 + // snapshotPath, when non-empty, names a JSON file used to 64 + // persist the cache across restarts. The most-common cause of 65 + // a cold cache (relay restart with Osprey still healthy) is 66 + // addressed by reading this file on startup; the fail-closed 67 + // path above is the safety net for the rarer case. 68 + snapshotPath string 47 69 } 48 70 71 + // ColdCacheRecorder is the narrow interface used to count fail-open 72 + // vs fail-closed decisions. nil-safe. 73 + type ColdCacheRecorder interface { 74 + IncColdCacheDecision(decision string) 75 + } 76 + 77 + // ErrOspreyColdCache is returned by GetPolicy when the cache is empty 78 + // for a DID, Osprey is unreachable, and failClosedOnColdCache is true. 79 + // Callers translate this into a 451 SMTP deferral. 80 + var ErrOspreyColdCache = errors.New("osprey: cold cache and broker unreachable") 81 + 49 82 type ospreyEntry struct { 50 83 // activeLabels captures which labels Osprey currently has in status=1 51 84 // (active) for the DID. Lookup is O(1) per label name. We store the ··· 83 116 84 117 // NewOspreyEnforcer creates an enforcer that queries the Osprey UI API. 85 118 // apiURL is the base URL, e.g. "https://osprey-api.example.com". 119 + // 120 + // Defaults to fail-CLOSED on cold cache (no entry + Osprey unreachable) 121 + // — a regression from the legacy fail-open behavior, deliberately 122 + // chosen because the cold-cache+outage window is exactly when an 123 + // attacker can register a new DID and burn reputation before Osprey 124 + // labels arrive (#215). Operators can opt back into fail-open via 125 + // SetFailClosedOnColdCache(false) if the security tradeoff doesn't 126 + // match their environment. 86 127 func NewOspreyEnforcer(apiURL string, client *http.Client) *OspreyEnforcer { 87 128 if client == nil { 88 129 client = &http.Client{Timeout: 5 * time.Second} 89 130 } 90 131 return &OspreyEnforcer{ 91 - apiURL: apiURL, 92 - client: client, 93 - ttl: defaultOspreyEnforcerTTL, 94 - cache: make(map[string]*ospreyEntry), 132 + apiURL: apiURL, 133 + client: client, 134 + ttl: defaultOspreyEnforcerTTL, 135 + cache: make(map[string]*ospreyEntry), 136 + failClosedOnColdCache: true, 137 + } 138 + } 139 + 140 + // SetFailClosedOnColdCache controls the cold-cache fallback. true = 141 + // reject sends with ErrOspreyColdCache when no entry exists and the 142 + // broker is unreachable; false = legacy fail-open behavior. 143 + func (e *OspreyEnforcer) SetFailClosedOnColdCache(v bool) { 144 + e.failClosedOnColdCache = v 145 + } 146 + 147 + // SetColdCacheRecorder wires a metric recorder for cold-cache decisions. 
148 + func (e *OspreyEnforcer) SetColdCacheRecorder(r ColdCacheRecorder) {
149 + e.coldCacheRecorder = r
150 + }
151 +
152 + // SetSnapshotPath enables on-disk cache persistence. Snapshots are
153 + // written periodically by Snapshot() and read by LoadSnapshot() on
154 + // startup so a relay restart doesn't reset the cache to empty —
155 + // which is the load-bearing concern for #215. Pass an empty string
156 + // to disable.
157 + func (e *OspreyEnforcer) SetSnapshotPath(path string) {
158 + e.snapshotPath = path
159 + }
160 +
161 + // snapshotEntry is the on-disk representation. Keeps fetchedAt as
162 + // RFC3339 so an operator can read the file by hand.
163 + type snapshotEntry struct {
164 + Labels []string `json:"labels"`
165 + FetchedAt string `json:"fetched_at"`
166 + }
167 +
168 + // Snapshot writes the in-memory cache to snapshotPath atomically.
169 + // Safe to call concurrently with reads; takes only a brief read lock
170 + // while copying the cache. No-op when snapshotPath is empty.
171 + func (e *OspreyEnforcer) Snapshot() error {
172 + if e.snapshotPath == "" {
173 + return nil
174 + }
175 + e.mu.RLock()
176 + out := make(map[string]snapshotEntry, len(e.cache))
177 + for did, entry := range e.cache {
178 + labels := make([]string, 0, len(entry.activeLabels))
179 + for l := range entry.activeLabels {
180 + labels = append(labels, l)
181 + }
182 + out[did] = snapshotEntry{Labels: labels, FetchedAt: entry.fetchedAt.UTC().Format(time.RFC3339Nano)}
183 + }
184 + e.mu.RUnlock()
185 +
186 + data, err := json.MarshalIndent(out, "", " ")
187 + if err != nil {
188 + return fmt.Errorf("marshal: %w", err)
189 + }
190 + tmp := e.snapshotPath + ".tmp"
191 + if err := os.WriteFile(tmp, data, 0o600); err != nil {
192 + return fmt.Errorf("write tmp: %w", err)
193 + }
194 + if err := os.Rename(tmp, e.snapshotPath); err != nil {
195 + os.Remove(tmp)
196 + return fmt.Errorf("rename: %w", err)
95 197 }
198 + return nil
199 + }
200 +
201 + // LoadSnapshot populates the cache from snapshotPath. Entries whose
202 + // fetchedAt is older than 2*ttl are discarded — they would be served
203 + // stale and we'd rather force a fresh query than serve a week-old
204 + // label set. Missing file is not an error (first start). Returns
205 + // the number of entries loaded.
206 + func (e *OspreyEnforcer) LoadSnapshot() (int, error) {
207 + if e.snapshotPath == "" {
208 + return 0, nil
209 + }
210 + data, err := os.ReadFile(e.snapshotPath)
211 + if err != nil {
212 + if os.IsNotExist(err) {
213 + return 0, nil
214 + }
215 + return 0, fmt.Errorf("read snapshot: %w", err)
216 + }
217 + var raw map[string]snapshotEntry
218 + if err := json.Unmarshal(data, &raw); err != nil {
219 + return 0, fmt.Errorf("unmarshal: %w", err)
220 + }
221 + cutoff := time.Now().Add(-2 * e.ttl)
222 + loaded := 0
223 + e.mu.Lock()
224 + for did, se := range raw {
225 + fetched, err := time.Parse(time.RFC3339Nano, se.FetchedAt)
226 + if err != nil || fetched.Before(cutoff) {
227 + continue
228 + }
229 + set := make(map[string]struct{}, len(se.Labels))
230 + for _, l := range se.Labels {
231 + set[l] = struct{}{}
232 + }
233 + e.cache[did] = &ospreyEntry{activeLabels: set, fetchedAt: fetched}
234 + loaded++
235 + }
236 + e.mu.Unlock()
237 + return loaded, nil
96 238 }
97 239
98 240 // ospreyLabelsResponse is the shape returned by
··· 106 248 }
107 249
108 250 // GetPolicy returns the effective sending policy for a DID derived from its
109 - // current Osprey labels.
Fail-stale: if Osprey is unreachable and a previous 110 - // result is cached, that cached label set is used. If there is no cache at 111 - // all, returns defaultPolicy (fail-open so new DIDs are not blocked by 112 - // observability issues). 251 + // current Osprey labels. 252 + // 253 + // Fail-stale: if Osprey is unreachable and a previous result is 254 + // cached, that cached label set is used. 255 + // 256 + // Cold cache + Osprey unreachable: returns ErrOspreyColdCache when 257 + // failClosedOnColdCache is true (default — closes #215). Operators 258 + // who prefer the legacy fail-open behavior can call 259 + // SetFailClosedOnColdCache(false), which restores the pre-#215 path 260 + // of returning defaultPolicy with no error. 113 261 func (e *OspreyEnforcer) GetPolicy(ctx context.Context, did string) (*LabelPolicy, error) { 114 262 labels, _, err := e.activeLabelsFor(ctx, did) 115 263 if err != nil { 264 + // activeLabelsFor only returns errors for the cold-cache 265 + // fail-closed path; transient lookup failures already fall 266 + // back to stale cache silently. Surface the typed error so 267 + // the SMTP layer can return 451 to the client. 116 268 return defaultPolicy(), err 117 269 } 118 270 return policyFromLabels(labels), nil ··· 157 309 log.Printf("osprey.enforce: did=%s serving stale cache (labels=%v)", did, labelNames(entry.activeLabels)) 158 310 return entry.activeLabels, true, nil 159 311 } 312 + // Cold cache + Osprey unreachable. Default behavior is now 313 + // fail-closed (#215): without this branch, a relay restart 314 + // during an Osprey outage would let attackers send unsuspended 315 + // for the duration of the outage. Operators who need the 316 + // legacy fail-open semantics opt in via SetFailClosedOnColdCache. 317 + if e.failClosedOnColdCache { 318 + log.Printf("osprey.enforce: did=%s action=fail_closed reason=no_cache_and_unreachable", did) 319 + if e.coldCacheRecorder != nil { 320 + e.coldCacheRecorder.IncColdCacheDecision("denied") 321 + } 322 + return nil, false, ErrOspreyColdCache 323 + } 160 324 log.Printf("osprey.enforce: did=%s action=fail_open reason=no_cache_and_unreachable", did) 325 + if e.coldCacheRecorder != nil { 326 + e.coldCacheRecorder.IncColdCacheDecision("allowed") 327 + } 161 328 return nil, false, nil 162 329 } 163 330 ··· 294 461 // Malformed response — return error so GetPolicy falls through to 295 462 // the fail-stale path (preserving any cached labels) instead of 296 463 // overwriting a known-label-bearing entry with an empty set. 297 - return nil, fmt.Errorf("malformed osprey response: %v", err) 464 + return nil, fmt.Errorf("malformed osprey response: %w", err) 298 465 } 299 466 300 467 out := make(map[string]struct{}, len(result.Labels))
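The typed ErrOspreyColdCache only closes #215 if the SMTP layer maps it to a transient reply. A minimal sketch of that mapping, assuming placement in internal/relay; smtpTempError and policyForSend are hypothetical names standing in for the session layer's real error type and call site, which are not part of this hunk.

import (
	"context"
	"errors"
	"fmt"
)

// Sketch only. smtpTempError is a hypothetical stand-in for however the
// session layer carries a 4xx reply; the real mapping lives in cmd/relay.
type smtpTempError struct {
	code int
	msg  string
}

func (e *smtpTempError) Error() string { return fmt.Sprintf("%d %s", e.code, e.msg) }

func policyForSend(ctx context.Context, enf *OspreyEnforcer, did string) (*LabelPolicy, error) {
	policy, err := enf.GetPolicy(ctx, did)
	if errors.Is(err, ErrOspreyColdCache) {
		// Cold cache + Osprey down: defer rather than bounce; the client
		// retries once the cache warms or Osprey comes back.
		return nil, &smtpTempError{code: 451, msg: "4.7.0 policy service unavailable, try again later"}
	}
	if err != nil {
		return nil, err
	}
	return policy, nil
}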
+194
internal/relay/ospreyenforce_coldcache_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "context" 7 + "errors" 8 + "net/http" 9 + "net/http/httptest" 10 + "path/filepath" 11 + "sync" 12 + "testing" 13 + "time" 14 + ) 15 + 16 + // brokenServer always returns 500 so the enforcer treats Osprey as 17 + // unreachable. Used to drive the fail-closed branch. 18 + func brokenServer(t *testing.T) *httptest.Server { 19 + t.Helper() 20 + s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 21 + w.WriteHeader(http.StatusInternalServerError) 22 + })) 23 + t.Cleanup(s.Close) 24 + return s 25 + } 26 + 27 + type stubColdRecorder struct { 28 + mu sync.Mutex 29 + calls map[string]int 30 + } 31 + 32 + func newStubColdRecorder() *stubColdRecorder { 33 + return &stubColdRecorder{calls: map[string]int{}} 34 + } 35 + func (s *stubColdRecorder) IncColdCacheDecision(d string) { 36 + s.mu.Lock() 37 + s.calls[d]++ 38 + s.mu.Unlock() 39 + } 40 + func (s *stubColdRecorder) count(d string) int { 41 + s.mu.Lock() 42 + defer s.mu.Unlock() 43 + return s.calls[d] 44 + } 45 + 46 + // TestEnforcer_ColdCacheFailClosedByDefault pins the central #215 47 + // invariant: a cold cache + Osprey unreachable rejects with 48 + // ErrOspreyColdCache by default, NOT silently allows. 49 + func TestEnforcer_ColdCacheFailClosedByDefault(t *testing.T) { 50 + srv := brokenServer(t) 51 + e := NewOspreyEnforcer(srv.URL, &http.Client{Timeout: 200 * time.Millisecond}) 52 + rec := newStubColdRecorder() 53 + e.SetColdCacheRecorder(rec) 54 + 55 + policy, err := e.GetPolicy(context.Background(), "did:plc:cold") 56 + if !errors.Is(err, ErrOspreyColdCache) { 57 + t.Fatalf("expected ErrOspreyColdCache, got err=%v", err) 58 + } 59 + if policy == nil { 60 + t.Errorf("policy should be non-nil even on error (defaultPolicy)") 61 + } 62 + if rec.count("denied") != 1 { 63 + t.Errorf("denied count = %d, want 1", rec.count("denied")) 64 + } 65 + if rec.count("allowed") != 0 { 66 + t.Errorf("allowed count = %d, want 0 (default is fail-closed)", rec.count("allowed")) 67 + } 68 + } 69 + 70 + // TestEnforcer_ColdCacheFailOpenOptIn confirms the legacy fail-open 71 + // path can be restored via SetFailClosedOnColdCache(false). 72 + func TestEnforcer_ColdCacheFailOpenOptIn(t *testing.T) { 73 + srv := brokenServer(t) 74 + e := NewOspreyEnforcer(srv.URL, &http.Client{Timeout: 200 * time.Millisecond}) 75 + e.SetFailClosedOnColdCache(false) 76 + rec := newStubColdRecorder() 77 + e.SetColdCacheRecorder(rec) 78 + 79 + _, err := e.GetPolicy(context.Background(), "did:plc:cold") 80 + if err != nil { 81 + t.Fatalf("opt-in fail-open should not return err, got %v", err) 82 + } 83 + if rec.count("allowed") != 1 { 84 + t.Errorf("allowed count = %d, want 1", rec.count("allowed")) 85 + } 86 + } 87 + 88 + // TestEnforcer_SnapshotRoundTrip confirms persistence: write entries, 89 + // snapshot, build a fresh enforcer pointed at the same path, load, 90 + // and verify the entries replay AND the cold-cache branch does NOT 91 + // fire (because the cache is no longer cold). 92 + func TestEnforcer_SnapshotRoundTrip(t *testing.T) { 93 + dir := t.TempDir() 94 + snap := filepath.Join(dir, "cache.json") 95 + 96 + // Original enforcer: stuff a cache entry in. 
97 + e1 := NewOspreyEnforcer("http://127.0.0.1:1", nil) 98 + e1.SetSnapshotPath(snap) 99 + e1.cache["did:plc:warm"] = &ospreyEntry{ 100 + activeLabels: map[string]struct{}{"highly_trusted": {}}, 101 + fetchedAt: time.Now(), 102 + } 103 + if err := e1.Snapshot(); err != nil { 104 + t.Fatalf("Snapshot: %v", err) 105 + } 106 + 107 + // Fresh enforcer + broken Osprey: would normally fail-closed 108 + // on cold cache. Loading the snapshot first means the cache is 109 + // warm for did:plc:warm, so the fail-closed branch never fires 110 + // for that DID. 111 + srv := brokenServer(t) 112 + e2 := NewOspreyEnforcer(srv.URL, &http.Client{Timeout: 200 * time.Millisecond}) 113 + e2.SetSnapshotPath(snap) 114 + rec := newStubColdRecorder() 115 + e2.SetColdCacheRecorder(rec) 116 + n, err := e2.LoadSnapshot() 117 + if err != nil { 118 + t.Fatalf("LoadSnapshot: %v", err) 119 + } 120 + if n != 1 { 121 + t.Errorf("loaded entries = %d, want 1", n) 122 + } 123 + 124 + // Now policy lookup uses the cached entry — no broker call, 125 + // no cold-cache decision recorded. 126 + policy, err := e2.GetPolicy(context.Background(), "did:plc:warm") 127 + if err != nil { 128 + t.Errorf("warm-cache lookup returned err: %v", err) 129 + } 130 + if !policy.SkipWarming { 131 + t.Errorf("policy should reflect highly_trusted (SkipWarming=true): %+v", policy) 132 + } 133 + if rec.count("denied") != 0 || rec.count("allowed") != 0 { 134 + t.Errorf("cold-cache recorder fired for warm entry: denied=%d allowed=%d", 135 + rec.count("denied"), rec.count("allowed")) 136 + } 137 + 138 + // A DIFFERENT DID still cold-cache fails closed. 139 + if _, err := e2.GetPolicy(context.Background(), "did:plc:cold"); !errors.Is(err, ErrOspreyColdCache) { 140 + t.Errorf("unknown DID should fail-closed; err=%v", err) 141 + } 142 + } 143 + 144 + // TestEnforcer_LoadSnapshot_DropsExpired ensures stale entries don't 145 + // outlive the 2*ttl freshness window. A snapshot from 1 month ago 146 + // shouldn't keep serving labels indefinitely. 147 + func TestEnforcer_LoadSnapshot_DropsExpired(t *testing.T) { 148 + dir := t.TempDir() 149 + snap := filepath.Join(dir, "cache.json") 150 + 151 + e1 := NewOspreyEnforcer("http://127.0.0.1:1", nil) 152 + e1.SetSnapshotPath(snap) 153 + e1.cache["did:plc:fresh"] = &ospreyEntry{ 154 + activeLabels: map[string]struct{}{}, 155 + fetchedAt: time.Now(), 156 + } 157 + e1.cache["did:plc:stale"] = &ospreyEntry{ 158 + activeLabels: map[string]struct{}{}, 159 + fetchedAt: time.Now().Add(-30 * 24 * time.Hour), 160 + } 161 + if err := e1.Snapshot(); err != nil { 162 + t.Fatal(err) 163 + } 164 + 165 + e2 := NewOspreyEnforcer("http://127.0.0.1:1", nil) 166 + e2.SetSnapshotPath(snap) 167 + n, err := e2.LoadSnapshot() 168 + if err != nil { 169 + t.Fatal(err) 170 + } 171 + if n != 1 { 172 + t.Errorf("loaded entries = %d, want 1 (stale dropped)", n) 173 + } 174 + if _, ok := e2.cache["did:plc:fresh"]; !ok { 175 + t.Error("fresh entry missing from loaded cache") 176 + } 177 + if _, ok := e2.cache["did:plc:stale"]; ok { 178 + t.Error("stale entry survived load") 179 + } 180 + } 181 + 182 + // TestEnforcer_LoadSnapshot_MissingFileNoError covers first-boot: 183 + // no snapshot exists yet, Load should be a clean no-op. 
184 + func TestEnforcer_LoadSnapshot_MissingFileNoError(t *testing.T) { 185 + e := NewOspreyEnforcer("http://127.0.0.1:1", nil) 186 + e.SetSnapshotPath(filepath.Join(t.TempDir(), "does-not-exist.json")) 187 + n, err := e.LoadSnapshot() 188 + if err != nil { 189 + t.Errorf("missing file should not error, got %v", err) 190 + } 191 + if n != 0 { 192 + t.Errorf("loaded = %d, want 0", n) 193 + } 194 + }
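SetSnapshotPath's doc says snapshots are written periodically, but no ticker appears in this change. A minimal sketch of that wiring, assuming a plain goroutine and a five-minute interval; in the repo this would presumably run under relay.GoSafe so a panic lands in the #209 counter, but that call's signature isn't shown here, so a bare loop is used.

import (
	"context"
	"log"
	"time"
)

// Sketch only: flush the enforcer cache to disk on an interval and once
// more on shutdown, so LoadSnapshot has something recent after a restart.
func runOspreySnapshotter(ctx context.Context, e *OspreyEnforcer) {
	t := time.NewTicker(5 * time.Minute)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			if err := e.Snapshot(); err != nil {
				log.Printf("osprey.cache.snapshot_error: %v", err)
			}
			return
		case <-t.C:
			if err := e.Snapshot(); err != nil {
				log.Printf("osprey.cache.snapshot_error: %v", err)
			}
		}
	}
}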
+12 -2
internal/relay/ospreyenforce_test.go
··· 120 120 } 121 121 122 122 func TestOspreyEnforcerUnreachableNoCache(t *testing.T) { 123 - // No server — enforcer should fail-open when no cache entry exists. 123 + // Opt into the legacy fail-open behavior. Default is fail-closed 124 + // (#215); see TestEnforcer_ColdCacheFailClosedByDefault for that 125 + // path. This test pins the opt-in escape hatch. 124 126 e := NewOspreyEnforcer("http://127.0.0.1:1", &http.Client{Timeout: 50 * time.Millisecond}) 127 + e.SetFailClosedOnColdCache(false) 125 128 suspended, err := e.CheckSuspended(context.Background(), "did:plc:new") 126 129 if err != nil { 127 130 t.Fatalf("unexpected error: %v", err) ··· 187 190 } 188 191 189 192 func TestOspreyEnforcerMalformedResponseNoCacheFailsOpen(t *testing.T) { 190 - // No prior cache + malformed response = fail-open (allow send). 193 + // Opt-in fail-open path. Default is fail-closed (#215); this 194 + // test pins the legacy behavior available via opt-in only. 191 195 srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 192 196 w.Write([]byte("not json {{{")) 193 197 })) 194 198 defer srv.Close() 195 199 196 200 e := NewOspreyEnforcer(srv.URL, srv.Client()) 201 + e.SetFailClosedOnColdCache(false) 197 202 suspended, err := e.CheckSuspended(context.Background(), "did:plc:test") 198 203 if err != nil { 199 204 t.Fatalf("unexpected error: %v", err) ··· 290 295 } 291 296 } 292 297 298 + // TestOspreyEnforcerServerErrorNoCacheFailsOpen is the opt-in fail- 299 + // open variant — default is fail-closed (#215). The opt-in pin is 300 + // here so a future contributor can find it next to the security 301 + // behavior it legacy-overrides. 293 302 func TestOspreyEnforcerServerErrorNoCacheFailsOpen(t *testing.T) { 294 303 // 500 with no prior cache should fail-open (allow). 295 304 srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { ··· 298 307 defer srv.Close() 299 308 300 309 e := NewOspreyEnforcer(srv.URL, srv.Client()) 310 + e.SetFailClosedOnColdCache(false) 301 311 suspended, err := e.CheckSuspended(context.Background(), "did:plc:new") 302 312 if err != nil { 303 313 t.Fatalf("unexpected error: %v", err)
+9 -9
internal/relay/queue.go
··· 122 122 // If spool write fails, we return an error so the SMTP session can 4xx. 123 123 if q.spool != nil { 124 124 if err := q.spool.Write(entry); err != nil { 125 - return fmt.Errorf("spool write: %v", err) 125 + return fmt.Errorf("spool write: %w", err) 126 126 } 127 127 } 128 128 ··· 394 394 dialer := net.Dialer{Timeout: 30 * time.Second} 395 395 conn, err := dialer.DialContext(ctx, "tcp", mxHost+":25") 396 396 if err != nil { 397 - return 0, fmt.Errorf("connect to %s: %v", mxHost, err) 397 + return 0, fmt.Errorf("connect to %s: %w", mxHost, err) 398 398 } 399 399 400 400 // NewClient uses mxHost for TLS ServerName verification. ··· 402 402 client, err := smtp.NewClient(conn, mxHost) 403 403 if err != nil { 404 404 conn.Close() 405 - return 0, fmt.Errorf("smtp client %s: %v", mxHost, err) 405 + return 0, fmt.Errorf("smtp client %s: %w", mxHost, err) 406 406 } 407 407 defer client.Close() 408 408 ··· 413 413 ehlo = "localhost" // fallback for tests 414 414 } 415 415 if err := client.Hello(ehlo); err != nil { 416 - return 0, fmt.Errorf("EHLO %s to %s: %v", ehlo, mxHost, err) 416 + return 0, fmt.Errorf("EHLO %s to %s: %w", ehlo, mxHost, err) 417 417 } 418 418 419 419 // Opportunistic STARTTLS ··· 434 434 435 435 // MAIL FROM 436 436 if err := client.Mail(from); err != nil { 437 - return smtpCode(err), fmt.Errorf("MAIL FROM: %v", err) 437 + return smtpCode(err), fmt.Errorf("MAIL FROM: %w", err) 438 438 } 439 439 440 440 // RCPT TO 441 441 if err := client.Rcpt(to); err != nil { 442 - return smtpCode(err), fmt.Errorf("RCPT TO: %v", err) 442 + return smtpCode(err), fmt.Errorf("RCPT TO: %w", err) 443 443 } 444 444 445 445 // DATA 446 446 w, err := client.Data() 447 447 if err != nil { 448 - return smtpCode(err), fmt.Errorf("DATA: %v", err) 448 + return smtpCode(err), fmt.Errorf("DATA: %w", err) 449 449 } 450 450 if _, err := w.Write(data); err != nil { 451 - return 0, fmt.Errorf("write data: %v", err) 451 + return 0, fmt.Errorf("write data: %w", err) 452 452 } 453 453 if err := w.Close(); err != nil { 454 - return smtpCode(err), fmt.Errorf("close data: %v", err) 454 + return smtpCode(err), fmt.Errorf("close data: %w", err) 455 455 } 456 456 457 457 client.Quit()
+6 -6
internal/relay/ratelimit.go
··· 83 83 hourlyWindow := now.Truncate(time.Hour) 84 84 hourlyCount, err := rl.store.GetRateCount(ctx, did, relaystore.WindowHourly, hourlyWindow) 85 85 if err != nil { 86 - return fmt.Errorf("check hourly rate: %v", err) 86 + return fmt.Errorf("check hourly rate: %w", err) 87 87 } 88 88 if hourlyCount >= hourlyLimit { 89 89 return &RateLimitError{ ··· 98 98 dailyWindow := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) 99 99 dailyCount, err := rl.store.GetRateCount(ctx, did, relaystore.WindowDaily, dailyWindow) 100 100 if err != nil { 101 - return fmt.Errorf("check daily rate: %v", err) 101 + return fmt.Errorf("check daily rate: %w", err) 102 102 } 103 103 if dailyCount >= dailyLimit { 104 104 return &RateLimitError{ ··· 157 157 RetryAfter: hourlyWindow.Add(time.Hour), 158 158 } 159 159 } 160 - return fmt.Errorf("check hourly rate: %v", err) 160 + return fmt.Errorf("check hourly rate: %w", err) 161 161 } 162 162 163 163 // Atomically check+increment daily counter in SQLite ··· 178 178 RetryAfter: dailyWindow.Add(24 * time.Hour), 179 179 } 180 180 } 181 - return fmt.Errorf("check daily rate: %v", err) 181 + return fmt.Errorf("check daily rate: %w", err) 182 182 } 183 183 184 184 return nil ··· 201 201 // Increment per-member hourly 202 202 hourlyWindow := now.Truncate(time.Hour) 203 203 if err := rl.store.IncrementRateCounter(ctx, did, relaystore.WindowHourly, hourlyWindow); err != nil { 204 - return fmt.Errorf("increment hourly counter: %v", err) 204 + return fmt.Errorf("increment hourly counter: %w", err) 205 205 } 206 206 207 207 // Increment per-member daily 208 208 dailyWindow := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) 209 209 if err := rl.store.IncrementRateCounter(ctx, did, relaystore.WindowDaily, dailyWindow); err != nil { 210 - return fmt.Errorf("increment daily counter: %v", err) 210 + return fmt.Errorf("increment daily counter: %w", err) 211 211 } 212 212 213 213 return nil
+42
internal/relay/recipient_outcome.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + // RecipientOutcome records the result of delivering a single recipient inside 6 + // a multi-RCPT SMTP DATA. Emitted by the per-recipient loop in cmd/relay so 7 + // the caller can decide whether the whole DATA should be accepted, rejected, 8 + // or partially-failed. 9 + type RecipientOutcome struct { 10 + // Recipient is the RCPT TO address. 11 + Recipient string 12 + // MsgID is the relaystore.messages row inserted for this recipient. 13 + // Zero when the failure happened before InsertMessage (e.g. DKIM sign). 14 + MsgID int64 15 + // Err is non-nil when the recipient could not be enqueued for any reason. 16 + Err error 17 + } 18 + 19 + // AggregateRecipientOutcomes summarizes a per-recipient delivery loop. 20 + // 21 + // Returns: 22 + // - succeeded, failed: per-recipient counts 23 + // - retryAll: true only when zero recipients succeeded; the caller should 24 + // return a transient SMTP error so the client retries the whole DATA. 25 + // - lastErr: a representative error from the failures (for logging) 26 + // 27 + // When at least one recipient succeeded, the caller MUST accept the DATA 28 + // (return nil to the SMTP server). Returning a transient error in that case 29 + // would cause the client to retry the entire DATA, duplicating the 30 + // successfully-enqueued recipients — the bug fixed by this aggregator. 31 + func AggregateRecipientOutcomes(outcomes []RecipientOutcome) (succeeded, failed int, retryAll bool, lastErr error) { 32 + for _, o := range outcomes { 33 + if o.Err == nil { 34 + succeeded++ 35 + continue 36 + } 37 + failed++ 38 + lastErr = o.Err 39 + } 40 + retryAll = succeeded == 0 && failed > 0 41 + return 42 + }
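The aggregator reads best next to the loop that feeds it. A hedged sketch of the per-recipient DATA stage, assuming placement in internal/relay; the enqueue callback stands in for whatever per-recipient enqueue the real handler in cmd/relay performs, and how the returned error becomes an SMTP 451 depends on the server library.

import (
	"context"
	"fmt"
)

// Sketch only. Enqueue each RCPT independently, then let the aggregator
// pick a single reply: transient failure only when nothing was enqueued,
// accept otherwise.
func deliverData(ctx context.Context, rcpts []string, raw []byte, m *Metrics,
	enqueue func(ctx context.Context, rcpt string, raw []byte) (int64, error)) error {
	outcomes := make([]RecipientOutcome, 0, len(rcpts))
	for _, rcpt := range rcpts {
		id, err := enqueue(ctx, rcpt, raw)
		outcomes = append(outcomes, RecipientOutcome{Recipient: rcpt, MsgID: id, Err: err})
	}
	succeeded, failed, retryAll, lastErr := AggregateRecipientOutcomes(outcomes)
	if retryAll {
		// Zero successes: nothing was enqueued, so a client retry cannot duplicate.
		return fmt.Errorf("transient enqueue failure, try again later: %w", lastErr)
	}
	if failed > 0 {
		// Partial success: accept the DATA anyway; a 4xx here would make the
		// client resend and duplicate the recipients that did enqueue.
		m.PartialDeliveries.Inc()
		m.PartialDeliveryRecipients.WithLabelValues("succeeded").Add(float64(succeeded))
		m.PartialDeliveryRecipients.WithLabelValues("failed").Add(float64(failed))
	}
	return nil
}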
+134
internal/relay/recipient_outcome_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "errors" 7 + "testing" 8 + ) 9 + 10 + func TestAggregateRecipientOutcomes_AllSucceeded(t *testing.T) { 11 + outcomes := []RecipientOutcome{ 12 + {Recipient: "a@x.com", MsgID: 1}, 13 + {Recipient: "b@x.com", MsgID: 2}, 14 + {Recipient: "c@x.com", MsgID: 3}, 15 + } 16 + 17 + succeeded, failed, retryAll, lastErr := AggregateRecipientOutcomes(outcomes) 18 + 19 + if succeeded != 3 { 20 + t.Errorf("succeeded = %d, want 3", succeeded) 21 + } 22 + if failed != 0 { 23 + t.Errorf("failed = %d, want 0", failed) 24 + } 25 + if retryAll { 26 + t.Error("retryAll = true, want false (any success means accept)") 27 + } 28 + if lastErr != nil { 29 + t.Errorf("lastErr = %v, want nil", lastErr) 30 + } 31 + } 32 + 33 + func TestAggregateRecipientOutcomes_AllFailed(t *testing.T) { 34 + bad := errors.New("queue full") 35 + outcomes := []RecipientOutcome{ 36 + {Recipient: "a@x.com", Err: bad}, 37 + {Recipient: "b@x.com", Err: bad}, 38 + } 39 + 40 + succeeded, failed, retryAll, lastErr := AggregateRecipientOutcomes(outcomes) 41 + 42 + if succeeded != 0 { 43 + t.Errorf("succeeded = %d, want 0", succeeded) 44 + } 45 + if failed != 2 { 46 + t.Errorf("failed = %d, want 2", failed) 47 + } 48 + if !retryAll { 49 + t.Error("retryAll = false, want true (zero successes → caller must reject DATA)") 50 + } 51 + if lastErr != bad { 52 + t.Errorf("lastErr = %v, want %v", lastErr, bad) 53 + } 54 + } 55 + 56 + func TestAggregateRecipientOutcomes_PartialFailure_AcceptsAnyway(t *testing.T) { 57 + // This is the regression fix for #226: when 1..N-1 enqueued and N failed, 58 + // we MUST NOT signal retry — that would duplicate 1..N-1. 59 + bad := errors.New("spool I/O error") 60 + outcomes := []RecipientOutcome{ 61 + {Recipient: "a@x.com", MsgID: 1}, 62 + {Recipient: "b@x.com", MsgID: 2}, 63 + {Recipient: "c@x.com", MsgID: 3, Err: bad}, 64 + } 65 + 66 + succeeded, failed, retryAll, lastErr := AggregateRecipientOutcomes(outcomes) 67 + 68 + if succeeded != 2 { 69 + t.Errorf("succeeded = %d, want 2", succeeded) 70 + } 71 + if failed != 1 { 72 + t.Errorf("failed = %d, want 1", failed) 73 + } 74 + if retryAll { 75 + t.Fatal("retryAll = true would cause client to retry DATA, duplicating recipients a@x.com and b@x.com — this is the bug #226 fixes") 76 + } 77 + if lastErr != bad { 78 + t.Errorf("lastErr = %v, want %v", lastErr, bad) 79 + } 80 + } 81 + 82 + func TestAggregateRecipientOutcomes_FailureFirst_StillAcceptsIfAnySucceed(t *testing.T) { 83 + // Order shouldn't matter — even if the first recipient failed, as long 84 + // as at least one later recipient succeeded we still accept the DATA 85 + // because retrying would duplicate the later success. 
86 + bad := errors.New("DKIM sign error") 87 + outcomes := []RecipientOutcome{ 88 + {Recipient: "a@x.com", Err: bad}, 89 + {Recipient: "b@x.com", MsgID: 2}, 90 + } 91 + 92 + succeeded, failed, retryAll, _ := AggregateRecipientOutcomes(outcomes) 93 + 94 + if succeeded != 1 || failed != 1 { 95 + t.Errorf("succeeded=%d failed=%d, want 1/1", succeeded, failed) 96 + } 97 + if retryAll { 98 + t.Error("retryAll = true on partial success — would duplicate b@x.com on retry") 99 + } 100 + } 101 + 102 + func TestAggregateRecipientOutcomes_Empty(t *testing.T) { 103 + succeeded, failed, retryAll, lastErr := AggregateRecipientOutcomes(nil) 104 + 105 + if succeeded != 0 || failed != 0 { 106 + t.Errorf("counts non-zero on empty input: succeeded=%d failed=%d", succeeded, failed) 107 + } 108 + if retryAll { 109 + // Empty input means "no recipients to deliver" — the caller should 110 + // not have invoked the loop at all. We choose retryAll=false here 111 + // because there's nothing to retry. 112 + t.Error("retryAll = true on empty outcomes — should be false (nothing to retry)") 113 + } 114 + if lastErr != nil { 115 + t.Errorf("lastErr = %v on empty, want nil", lastErr) 116 + } 117 + } 118 + 119 + func TestAggregateRecipientOutcomes_LastErrIsTheLastFailure(t *testing.T) { 120 + // When multiple failures occur, lastErr should be deterministic — the 121 + // last one in iteration order, so logs are reproducible. 122 + first := errors.New("first failure") 123 + second := errors.New("second failure") 124 + outcomes := []RecipientOutcome{ 125 + {Recipient: "a@x.com", Err: first}, 126 + {Recipient: "b@x.com", Err: second}, 127 + } 128 + 129 + _, _, _, lastErr := AggregateRecipientOutcomes(outcomes) 130 + 131 + if lastErr != second { 132 + t.Errorf("lastErr = %v, want %v (iteration order should pick the last)", lastErr, second) 133 + } 134 + }
+60 -17
internal/relay/smtp.go
··· 476 476 return nil 477 477 } 478 478 479 - // validateFromHeader parses the From header from message data and verifies 480 - // the domain matches the member's registered domain. This prevents a member 481 - // registered for example.com from sending with From: ceo@bigbank.com, which 482 - // would be DKIM-signed and could enable phishing. 479 + // validateFromHeader parses the From header (and the related Sender, 480 + // Resent-From, Resent-Sender headers) from message data and verifies all 481 + // of them carry the member's registered domain. 482 + // 483 + // Why all four: per RFC 5322, 484 + // - From identifies the author. DMARC alignment is on the From domain, 485 + // so spoofing it enables phishing under the member's DKIM signature. 486 + // - Sender identifies the agent that actually injected the message 487 + // (used when From contains multiple authors). Receivers — Gmail in 488 + // particular — fall back to the Sender domain for DMARC alignment in 489 + // that case, so a member registered for example.com sending with 490 + // "Sender: agent@bigbank.com" would still spoof bigbank.com from 491 + // Gmail's perspective even though our From check passed. 492 + // - Resent-From / Resent-Sender carry the same risks for forwarded 493 + // messages. The relay isn't a re-mailer; messages should originate 494 + // from the member's domain regardless of which header conveys that. 495 + // 496 + // All four single-address headers are validated identically. Resent-* 497 + // headers may appear multiple times (RFC 5322 §3.6.6 forwarding trace); 498 + // every occurrence must pass. 483 499 func validateFromHeader(data []byte, memberDomain string) error { 484 500 r := textproto.NewReader(bufio.NewReader(strings.NewReader(string(data)))) 485 501 header, err := r.ReadMIMEHeader() ··· 487 503 return fmt.Errorf("From header domain must match %s", memberDomain) 488 504 } 489 505 490 - fromHeader := header.Get("From") 491 - if fromHeader == "" { 506 + if header.Get("From") == "" { 492 507 return fmt.Errorf("missing From header") 493 508 } 494 509 495 - // Parse From address using stdlib — rejects multi-address headers and 496 - // malformed addresses that hand-rolled parsers might accept. 497 - addr, err := mail.ParseAddress(fromHeader) 498 - if err != nil { 499 - return fmt.Errorf("could not parse From header: %v", err) 510 + // Single-occurrence headers: From and Sender. Each must, when present, 511 + // carry exactly one address aligned with the member's domain. 512 + for _, name := range []string{"From", "Sender"} { 513 + v := header.Get(name) 514 + if v == "" { 515 + continue 516 + } 517 + if err := requireAlignedSingleAddress(name, v, memberDomain); err != nil { 518 + return err 519 + } 520 + } 521 + 522 + // Multi-occurrence Resent-* headers. RFC 5322 allows each forward hop 523 + // to add its own Resent-From/Resent-Sender block; net/textproto returns 524 + // every value via header.Values(). We require *every* hop to align. 
525 + for _, name := range []string{"Resent-From", "Resent-Sender"} { 526 + for _, v := range header.Values(name) { 527 + if v == "" { 528 + continue 529 + } 530 + if err := requireAlignedSingleAddress(name, v, memberDomain); err != nil { 531 + return err 532 + } 533 + } 500 534 } 501 535 536 + return nil 537 + } 538 + 539 + // requireAlignedSingleAddress parses a single-address header value and 540 + // returns an error unless it contains exactly one address whose domain 541 + // matches memberDomain (case-insensitive, exact match — no subdomain 542 + // alignment, mirroring the rest of the relay's policy). 543 + func requireAlignedSingleAddress(headerName, headerValue, memberDomain string) error { 544 + addr, err := mail.ParseAddress(headerValue) 545 + if err != nil { 546 + return fmt.Errorf("could not parse %s header: %w", headerName, err) 547 + } 502 548 parts := strings.SplitN(addr.Address, "@", 2) 503 549 if len(parts) != 2 { 504 - return fmt.Errorf("could not parse domain from From header") 550 + return fmt.Errorf("could not parse domain from %s header", headerName) 505 551 } 506 - fromDomain := parts[1] 507 - 508 - if strings.ToLower(fromDomain) != strings.ToLower(memberDomain) { 509 - return fmt.Errorf("From header domain %q does not match registered domain %q", fromDomain, memberDomain) 552 + if !strings.EqualFold(parts[1], memberDomain) { 553 + return fmt.Errorf("%s header domain %q does not match registered domain %q", headerName, parts[1], memberDomain) 510 554 } 511 - 512 555 return nil 513 556 }
+95
internal/relay/smtp_test.go
··· 507 507 } 508 508 } 509 509 510 + // --- Sender / Resent-* header validation (#225) --- 511 + // 512 + // Gmail and other large receivers fall back to Sender for DMARC alignment 513 + // when From contains multiple authors. A member that passes the From check 514 + // can still spoof a third party via a forged Sender — so we extend the 515 + // alignment check to every author/agent header RFC 5322 defines. 516 + 517 + func TestValidateFromHeader_SenderAligned(t *testing.T) { 518 + msg := "From: noreply@example.com\r\nSender: ops@example.com\r\nTo: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 519 + if err := validateFromHeader([]byte(msg), "example.com"); err != nil { 520 + t.Fatalf("aligned Sender rejected: %v", err) 521 + } 522 + } 523 + 524 + func TestValidateFromHeader_SenderSpoofed(t *testing.T) { 525 + // Author looks legit but Sender impersonates a bank — Gmail uses Sender 526 + // for DMARC alignment when present, so this must be blocked. 527 + msg := "From: noreply@example.com\r\nSender: ceo@bigbank.com\r\nTo: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 528 + err := validateFromHeader([]byte(msg), "example.com") 529 + if err == nil { 530 + t.Fatal("spoofed Sender header should be rejected") 531 + } 532 + if !strings.Contains(err.Error(), "Sender") { 533 + t.Errorf("error should mention Sender header: %v", err) 534 + } 535 + if !strings.Contains(err.Error(), "bigbank.com") { 536 + t.Errorf("error should name the spoofed domain: %v", err) 537 + } 538 + } 539 + 540 + func TestValidateFromHeader_SenderEmptyOk(t *testing.T) { 541 + // Empty/absent Sender is fine — it's only meaningful when present. 542 + msg := "From: noreply@example.com\r\nTo: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 543 + if err := validateFromHeader([]byte(msg), "example.com"); err != nil { 544 + t.Fatalf("absent Sender should not be required: %v", err) 545 + } 546 + } 547 + 548 + func TestValidateFromHeader_ResentFromSpoofed(t *testing.T) { 549 + msg := "From: noreply@example.com\r\nResent-From: ceo@bigbank.com\r\nTo: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 550 + err := validateFromHeader([]byte(msg), "example.com") 551 + if err == nil { 552 + t.Fatal("spoofed Resent-From header should be rejected") 553 + } 554 + if !strings.Contains(err.Error(), "Resent-From") { 555 + t.Errorf("error should mention Resent-From: %v", err) 556 + } 557 + } 558 + 559 + func TestValidateFromHeader_ResentSenderSpoofed(t *testing.T) { 560 + msg := "From: noreply@example.com\r\nResent-Sender: ceo@bigbank.com\r\nTo: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 561 + err := validateFromHeader([]byte(msg), "example.com") 562 + if err == nil { 563 + t.Fatal("spoofed Resent-Sender header should be rejected") 564 + } 565 + } 566 + 567 + func TestValidateFromHeader_ResentFromMultipleHopsAllAligned(t *testing.T) { 568 + // RFC 5322 §3.6.6: each forward hop prepends its own Resent-* block. 569 + // When the relay is the (only) re-mailer, all hops are us, and all 570 + // should be aligned with the member's domain. 571 + msg := "From: noreply@example.com\r\n" + 572 + "Resent-From: ops@example.com\r\n" + 573 + "Resent-From: ops2@example.com\r\n" + 574 + "To: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 575 + if err := validateFromHeader([]byte(msg), "example.com"); err != nil { 576 + t.Fatalf("multiple aligned Resent-From hops rejected: %v", err) 577 + } 578 + } 579 + 580 + func TestValidateFromHeader_ResentFromMultipleHopsOneSpoofed(t *testing.T) { 581 + // One hop is forged → reject. 
Otherwise an attacker could chain a 582 + // legitimate hop after a spoofed one to slip past a "first-only" check. 583 + msg := "From: noreply@example.com\r\n" + 584 + "Resent-From: ops@example.com\r\n" + 585 + "Resent-From: ceo@bigbank.com\r\n" + 586 + "To: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 587 + err := validateFromHeader([]byte(msg), "example.com") 588 + if err == nil { 589 + t.Fatal("forged hop in Resent-From chain should be rejected") 590 + } 591 + if !strings.Contains(err.Error(), "bigbank.com") { 592 + t.Errorf("error should identify spoofed hop: %v", err) 593 + } 594 + } 595 + 596 + func TestValidateFromHeader_SenderMultiAddressRejected(t *testing.T) { 597 + // Same multi-address attack as From, but on Sender. 598 + msg := "From: noreply@example.com\r\nSender: attacker@evil.com, \"Friendly\" <legit@example.com>\r\nTo: user@gmail.com\r\nSubject: Test\r\n\r\nBody\r\n" 599 + err := validateFromHeader([]byte(msg), "example.com") 600 + if err == nil { 601 + t.Fatal("multi-address Sender header should be rejected") 602 + } 603 + } 604 + 510 605 func TestSMTPFromHeaderPhishingBlocked(t *testing.T) { 511 606 // End-to-end test: member for example.com tries to send with From: ceo@bigbank.com 512 607 apiKey := "atmos_testkey123"
+85 -10
internal/relay/spool.go
··· 34 34 }
35 35
36 36 // Write persists a queue entry to the spool directory.
37 + //
38 + // Durability contract: when Write returns nil, the message body has
39 + // been fsynced to the underlying device AND the rename has been
40 + // fsynced to the directory entry. A subsequent power loss cannot lose
41 + // a message that Write claimed to persist. Without these fsyncs the
42 + // rename can appear to succeed but be reordered behind a crash,
43 + // leaving either a zero-length file or no file at all when the kernel
44 + // replays the journal — exactly the orphan case (#208) that produces
45 + // duplicate-delivery on SMTP retry.
37 46 func (s *Spool) Write(entry *QueueEntry) error {
38 47 se := spoolEntry{
39 48 ID: entry.ID,
··· 46 55
47 56 data, err := json.Marshal(se)
48 57 if err != nil {
49 - return fmt.Errorf("marshal spool entry: %v", err)
58 + return fmt.Errorf("marshal spool entry: %w", err)
50 59 }
51 60
52 61 path := filepath.Join(s.dir, fmt.Sprintf("%d.msg", entry.ID))
53 -
54 - // Write atomically: temp file + rename to avoid partial writes on crash
55 62 tmp := path + ".tmp"
56 - if err := os.WriteFile(tmp, data, 0600); err != nil {
57 - return fmt.Errorf("write spool file: %v", err)
63 +
64 + // Step 1: write + fsync the temp file. fsync MUST happen before
65 + // rename or the rename can land in the journal ahead of the data
66 + // blocks, leaving a zero-byte file after a crash.
67 + f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
68 + if err != nil {
69 + return fmt.Errorf("open spool tmp: %w", err)
58 70 }
71 + if _, err := f.Write(data); err != nil {
72 + f.Close()
73 + os.Remove(tmp)
74 + return fmt.Errorf("write spool tmp: %w", err)
75 + }
76 + if err := f.Sync(); err != nil {
77 + f.Close()
78 + os.Remove(tmp)
79 + return fmt.Errorf("fsync spool tmp: %w", err)
80 + }
81 + if err := f.Close(); err != nil {
82 + os.Remove(tmp)
83 + return fmt.Errorf("close spool tmp: %w", err)
84 + }
85 +
86 + // Step 2: rename. With the data fsynced above, the rename's
87 + // directory-entry change is the only remaining hop, and step 3
88 + // fsyncs the directory so even that hop can't be lost.
59 89 if err := os.Rename(tmp, path); err != nil {
60 90 os.Remove(tmp)
61 - return fmt.Errorf("rename spool file: %v", err)
91 + return fmt.Errorf("rename spool file: %w", err)
92 + }
93 +
94 + // Step 3: fsync the directory so the rename is durable. Some
95 + // filesystems (ext4 default, btrfs, zfs) make this implicit when
96 + // data was fsynced first, but Linux does not guarantee it across
97 + // all configurations and macOS APFS makes no guarantees either.
98 + // Failing here means we cannot claim the rename is durable, so
99 + // return the error rather than log-and-continue. No rollback:
100 + // the file IS in place from this process's view, and undoing the
101 + // rename would itself need another sync to be durable.
102 + dir, err := os.Open(s.dir)
103 + if err != nil {
104 + return fmt.Errorf("open spool dir for fsync: %w", err)
105 + }
106 + if err := dir.Sync(); err != nil {
107 + dir.Close()
108 + return fmt.Errorf("fsync spool dir: %w", err)
109 + }
110 + if err := dir.Close(); err != nil {
111 + return fmt.Errorf("close spool dir: %w", err)
62 112 }
63 113
64 114 return nil
··· 66 116
67 117 // Remove deletes a spool file for the given message ID.
68 118 // Returns nil if the file doesn't exist.
119 + // 120 + // fsync of the directory after the unlink is intentional: without 121 + // it, a crash between the unlink and a subsequent operation can 122 + // leave the file ghost-present after journal replay, and LoadAll 123 + // would then re-deliver an already-delivered message. Cost is one 124 + // directory fsync per terminal-state message, which is small 125 + // compared to the cost of an unintended duplicate send. 69 126 func (s *Spool) Remove(id int64) error { 70 127 path := filepath.Join(s.dir, fmt.Sprintf("%d.msg", id)) 71 128 err := os.Remove(path) 72 - if os.IsNotExist(err) { 73 - return nil 129 + if err != nil && !os.IsNotExist(err) { 130 + return err 131 + } 132 + dir, derr := os.Open(s.dir) 133 + if derr != nil { 134 + return fmt.Errorf("open spool dir for fsync: %w", derr) 74 135 } 75 - return err 136 + defer dir.Close() 137 + if err := dir.Sync(); err != nil { 138 + return fmt.Errorf("fsync spool dir after remove: %w", err) 139 + } 140 + return nil 141 + } 142 + 143 + // Exists reports whether a spool file for the given message ID is 144 + // currently present. Used by the orphan-reconciliation janitor in 145 + // cmd/relay (a status=queued DB row with no spool file is the 146 + // signature of a dropped Enqueue). 147 + func (s *Spool) Exists(id int64) bool { 148 + path := filepath.Join(s.dir, fmt.Sprintf("%d.msg", id)) 149 + _, err := os.Stat(path) 150 + return err == nil 76 151 } 77 152 78 153 // LoadAll reads all spool files and returns queue entries. ··· 80 155 func (s *Spool) LoadAll() ([]*QueueEntry, error) { 81 156 entries, err := os.ReadDir(s.dir) 82 157 if err != nil { 83 - return nil, fmt.Errorf("read spool dir: %v", err) 158 + return nil, fmt.Errorf("read spool dir: %w", err) 84 159 } 85 160 86 161 var result []*QueueEntry
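Exists only earns its keep inside the janitor the comment references. A hedged sketch of one janitor pass, with queuedLister standing in for the store queries it needs; the real method names on relaystore.Store may differ, and the placement and wiring are assumptions for illustration.

import (
	"context"
	"fmt"
	"log"
)

// queuedLister is a hypothetical stand-in for the two store queries the
// janitor needs; it is not the repo's actual store API.
type queuedLister interface {
	ListQueuedMessageIDs(ctx context.Context) ([]int64, error)
	MarkMessageOrphaned(ctx context.Context, id int64) error
}

// Sketch only: a queued DB row with no spool file can never be delivered,
// so close it out and count the reconciliation.
func reconcileOrphansOnce(ctx context.Context, st queuedLister, sp *Spool, m *Metrics) error {
	ids, err := st.ListQueuedMessageIDs(ctx)
	if err != nil {
		return fmt.Errorf("list queued messages: %w", err)
	}
	for _, id := range ids {
		if sp.Exists(id) {
			continue // spool file present; the normal delivery loop owns it
		}
		if err := st.MarkMessageOrphaned(ctx, id); err != nil {
			return fmt.Errorf("close orphan %d: %w", id, err)
		}
		m.OrphanReconciled.Inc()
		log.Printf("spool.janitor: reconciled orphan msg_id=%d", id)
	}
	return nil
}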
+112
internal/relay/spool_durability_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later
2 +
3 + package relay
4 +
5 + import (
6 + "os"
7 + "path/filepath"
8 + "testing"
9 + )
10 +
11 + // TestSpoolWrite_NoTmpResidue pins the observable contract of Write:
12 + // the .msg file is present and readable, and no .tmp residue is left
13 + // behind. fsync itself cannot be observed from userspace, so a
14 + // regression to plain WriteFile + Rename would still pass this test;
15 + // the durability argument lives in the Write comments, while the
16 + // invariants checked below (clean directory state, intact message
17 + // file) are the parts of the contract a reader of the spool can
18 + // actually verify.
19 + func TestSpoolWrite_NoTmpResidue(t *testing.T) {
20 + dir := t.TempDir()
21 + s := NewSpool(dir)
22 + if err := s.Write(&QueueEntry{ID: 42, From: "a@b", To: "c@d", Data: []byte("hi")}); err != nil {
23 + t.Fatalf("Write: %v", err)
24 + }
25 +
26 + // .msg file present
27 + if _, err := os.Stat(filepath.Join(dir, "42.msg")); err != nil {
28 + t.Errorf(".msg file missing: %v", err)
29 + }
30 + // no .tmp residue
31 + matches, _ := filepath.Glob(filepath.Join(dir, "*.tmp"))
32 + if len(matches) != 0 {
33 + t.Errorf("leftover .tmp files: %v", matches)
34 + }
35 + }
36 +
37 + // TestSpoolWrite_TmpRemovedOnRenameFailure verifies that if the
38 + // rename fails (because the target path is a directory), the .tmp
39 + // file is cleaned up rather than left behind to confuse future
40 + // LoadAll scans.
41 + func TestSpoolWrite_TmpRemovedOnRenameFailure(t *testing.T) {
42 + dir := t.TempDir()
43 + // Pre-create a *directory* at the target path so rename will fail.
44 + if err := os.Mkdir(filepath.Join(dir, "1.msg"), 0755); err != nil {
45 + t.Fatalf("mkdir: %v", err)
46 + }
47 + s := NewSpool(dir)
48 + err := s.Write(&QueueEntry{ID: 1, From: "a@b", To: "c@d", Data: []byte("hi")})
49 + if err == nil {
50 + t.Fatal("expected error from rename onto directory")
51 + }
52 + // .tmp must not survive the failure.
53 + if _, err := os.Stat(filepath.Join(dir, "1.msg.tmp")); !os.IsNotExist(err) {
54 + t.Errorf("leftover .tmp after failed rename: stat err=%v", err)
55 + }
56 + }
57 +
58 + // TestSpoolExists reflects the contract used by the orphan
59 + // reconciliation janitor: only present files report true.
60 + func TestSpoolExists(t *testing.T) {
61 + dir := t.TempDir()
62 + s := NewSpool(dir)
63 + if s.Exists(99) {
64 + t.Error("Exists(99) returned true on empty spool")
65 + }
66 + if err := s.Write(&QueueEntry{ID: 99, From: "a@b", To: "c@d", Data: []byte("x")}); err != nil {
67 + t.Fatalf("Write: %v", err)
68 + }
69 + if !s.Exists(99) {
70 + t.Error("Exists(99) returned false after Write")
71 + }
72 + if err := s.Remove(99); err != nil {
73 + t.Fatalf("Remove: %v", err)
74 + }
75 + if s.Exists(99) {
76 + t.Error("Exists(99) returned true after Remove")
77 + }
78 + }
79 +
80 + // TestSpoolRemove_IdempotentOnMissing — the janitor calls Remove on
81 + // completion paths; missing-file is not an error.
82 + func TestSpoolRemove_IdempotentOnMissing(t *testing.T) {
83 + dir := t.TempDir()
84 + s := NewSpool(dir)
85 + if err := s.Remove(123); err != nil {
86 + t.Errorf("Remove on missing returned error: %v", err)
87 + }
88 + }
89 +
90 + // TestSpoolWriteRoundTrip confirms the data we wrote is what LoadAll
91 + // returns. Existing tests do this for the legacy code path; the
92 + // fsync rewrite must preserve byte-for-byte fidelity.
93 + func TestSpoolWriteRoundTrip(t *testing.T) { 94 + dir := t.TempDir() 95 + s := NewSpool(dir) 96 + want := &QueueEntry{ID: 7, From: "from@x", To: "to@y", Data: []byte("hello world"), MemberDID: "did:plc:test", Attempts: 2} 97 + if err := s.Write(want); err != nil { 98 + t.Fatalf("Write: %v", err) 99 + } 100 + loaded, err := s.LoadAll() 101 + if err != nil { 102 + t.Fatalf("LoadAll: %v", err) 103 + } 104 + if len(loaded) != 1 { 105 + t.Fatalf("LoadAll returned %d entries, want 1", len(loaded)) 106 + } 107 + got := loaded[0] 108 + if got.ID != want.ID || got.From != want.From || got.To != want.To || 109 + string(got.Data) != string(want.Data) || got.MemberDID != want.MemberDID || got.Attempts != want.Attempts { 110 + t.Errorf("round-trip mismatch:\n got= %+v\n want= %+v", got, want) 111 + } 112 + }
+4 -4
internal/relay/warming.go
··· 41 41 func DefaultWarmingConfig() WarmingConfig { 42 42 return WarmingConfig{ 43 43 WarmingPeriod: 7 * 24 * time.Hour, 44 - WarmingHourly: 5, 45 - WarmingDaily: 20, 44 + WarmingHourly: 2, 45 + WarmingDaily: 10, 46 46 47 47 RampingPeriod: 14 * 24 * time.Hour, 48 - RampingHourly: 20, 49 - RampingDaily: 100, 48 + RampingHourly: 10, 49 + RampingDaily: 50, 50 50 } 51 51 } 52 52
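Read against a member's own limits, the tightened defaults play out like this — a short sketch mirroring the updated tests, with the member's configured limits passed as 100 hourly / 1000 daily:

cfg := DefaultWarmingConfig()

// 3 days after enrollment: warming tier caps apply.
h, d := WarmingLimits(cfg, time.Now().Add(-3*24*time.Hour), 100, 1000)
fmt.Println(h, d) // 2 10

// 10 days after enrollment: ramping tier caps apply.
h, d = WarmingLimits(cfg, time.Now().Add(-10*24*time.Hour), 100, 1000)
fmt.Println(h, d) // 10 50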
+16 -16
internal/relay/warming_test.go
··· 12 12 if cfg.WarmingPeriod != 7*24*time.Hour { 13 13 t.Errorf("WarmingPeriod = %v, want 7 days", cfg.WarmingPeriod) 14 14 } 15 - if cfg.WarmingHourly != 5 { 16 - t.Errorf("WarmingHourly = %d, want 5", cfg.WarmingHourly) 15 + if cfg.WarmingHourly != 2 { 16 + t.Errorf("WarmingHourly = %d, want 2", cfg.WarmingHourly) 17 17 } 18 - if cfg.WarmingDaily != 20 { 19 - t.Errorf("WarmingDaily = %d, want 20", cfg.WarmingDaily) 18 + if cfg.WarmingDaily != 10 { 19 + t.Errorf("WarmingDaily = %d, want 10", cfg.WarmingDaily) 20 20 } 21 21 if cfg.RampingPeriod != 14*24*time.Hour { 22 22 t.Errorf("RampingPeriod = %v, want 14 days", cfg.RampingPeriod) 23 23 } 24 - if cfg.RampingHourly != 20 { 25 - t.Errorf("RampingHourly = %d, want 20", cfg.RampingHourly) 24 + if cfg.RampingHourly != 10 { 25 + t.Errorf("RampingHourly = %d, want 10", cfg.RampingHourly) 26 26 } 27 - if cfg.RampingDaily != 100 { 28 - t.Errorf("RampingDaily = %d, want 100", cfg.RampingDaily) 27 + if cfg.RampingDaily != 50 { 28 + t.Errorf("RampingDaily = %d, want 50", cfg.RampingDaily) 29 29 } 30 30 } 31 31 ··· 91 91 func TestTierCaps_Warming(t *testing.T) { 92 92 cfg := DefaultWarmingConfig() 93 93 h, d := TierCaps(cfg, TierWarming) 94 - if h != 5 || d != 20 { 95 - t.Errorf("TierCaps(warming) = (%d,%d), want (5,20)", h, d) 94 + if h != 2 || d != 10 { 95 + t.Errorf("TierCaps(warming) = (%d,%d), want (2,10)", h, d) 96 96 } 97 97 } 98 98 99 99 func TestTierCaps_Ramping(t *testing.T) { 100 100 cfg := DefaultWarmingConfig() 101 101 h, d := TierCaps(cfg, TierRamping) 102 - if h != 20 || d != 100 { 103 - t.Errorf("TierCaps(ramping) = (%d,%d), want (20,100)", h, d) 102 + if h != 10 || d != 50 { 103 + t.Errorf("TierCaps(ramping) = (%d,%d), want (10,50)", h, d) 104 104 } 105 105 } 106 106 ··· 119 119 cfg := DefaultWarmingConfig() 120 120 created := time.Now().Add(-1 * time.Hour) 121 121 hourly, daily := WarmingLimits(cfg, created, 100, 1000) 122 - if hourly != 5 || daily != 20 { 123 - t.Errorf("new-member limits = (%d,%d), want (5,20)", hourly, daily) 122 + if hourly != 2 || daily != 10 { 123 + t.Errorf("new-member limits = (%d,%d), want (2,10)", hourly, daily) 124 124 } 125 125 } 126 126 ··· 128 128 cfg := DefaultWarmingConfig() 129 129 created := time.Now().Add(-10 * 24 * time.Hour) // mid-ramping 130 130 hourly, daily := WarmingLimits(cfg, created, 100, 1000) 131 - if hourly != 20 || daily != 100 { 132 - t.Errorf("ramping-member limits = (%d,%d), want (20,100)", hourly, daily) 131 + if hourly != 10 || daily != 50 { 132 + t.Errorf("ramping-member limits = (%d,%d), want (10,50)", hourly, daily) 133 133 } 134 134 } 135 135
+155 -75
internal/relay/warmup.go
··· 14 14 // Bypasses rate limiting and suppression since these are operator-initiated 15 15 // sends to known seed addresses. 16 16 type WarmupSender struct { 17 - seedAddresses []string 18 - memberLookup func(ctx context.Context, did string) (*MemberWithDomains, error) 19 - queue *Queue 20 - operatorKeys *DKIMKeys 17 + seedAddresses []string 18 + fromLocalParts []string 19 + memberLookup func(ctx context.Context, did string) (*MemberWithDomains, error) 20 + queue *Queue 21 + operatorKeys *DKIMKeys 21 22 operatorDKIMDomain string 22 - relayDomain string 23 + relayDomain string 23 24 24 25 insertMessage func(ctx context.Context, did, from, to, msgID string) (int64, error) 25 26 incrSendCount func(ctx context.Context, did string) ··· 28 29 // WarmupConfig configures the warmup sender. 29 30 type WarmupConfig struct { 30 31 SeedAddresses []string 32 + FromLocalParts []string // local parts to rotate (default ["scott"]) 31 33 MemberLookup func(ctx context.Context, did string) (*MemberWithDomains, error) 32 34 Queue *Queue 33 35 OperatorKeys *DKIMKeys ··· 38 40 } 39 41 40 42 func NewWarmupSender(cfg WarmupConfig) *WarmupSender { 43 + fromParts := cfg.FromLocalParts 44 + if len(fromParts) == 0 { 45 + fromParts = []string{"scott"} 46 + } 41 47 return &WarmupSender{ 42 48 seedAddresses: cfg.SeedAddresses, 49 + fromLocalParts: fromParts, 43 50 memberLookup: cfg.MemberLookup, 44 51 queue: cfg.Queue, 45 52 operatorKeys: cfg.OperatorKeys, ··· 59 66 Errors []string `json:"errors,omitempty"` 60 67 } 61 68 69 + // SendOne sends a single warmup email to the given seed address on behalf of 70 + // the member DID. Template and From address are selected by recipientIdx to 71 + // ensure variety across recipients within a batch. 72 + func (w *WarmupSender) SendOne(ctx context.Context, did string, recipientIdx int) (*WarmupResult, error) { 73 + if recipientIdx < 0 || recipientIdx >= len(w.seedAddresses) { 74 + return nil, fmt.Errorf("recipient index %d out of range [0, %d)", recipientIdx, len(w.seedAddresses)) 75 + } 76 + 77 + member, err := w.memberLookup(ctx, did) 78 + if err != nil { 79 + return nil, fmt.Errorf("member lookup: %w", err) 80 + } 81 + if member == nil || len(member.Domains) == 0 { 82 + return nil, fmt.Errorf("member %s not found or has no domains", did) 83 + } 84 + 85 + domain := member.Domains[0] 86 + to := w.seedAddresses[recipientIdx] 87 + fromLocal := w.fromLocalParts[recipientIdx%len(w.fromLocalParts)] 88 + from := fromLocal + "@" + domain.Domain 89 + 90 + templates := warmupTemplates() 91 + tmpl := templates[recipientIdx%len(templates)] 92 + 93 + msgID := fmt.Sprintf("<%d.warmup@%s>", time.Now().UnixNano(), w.relayDomain) 94 + msg := buildWarmupMessage(from, to, msgID, tmpl) 95 + 96 + result := &WarmupResult{} 97 + if err := w.sendMessage(ctx, did, from, to, msgID, msg, domain); err != nil { 98 + result.Failed = 1 99 + result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", to, err)) 100 + } else { 101 + result.Sent = 1 102 + } 103 + return result, nil 104 + } 105 + 62 106 // SendBatch sends one warmup email to each seed address on behalf of the 63 - // given member DID. Returns the number sent and any per-recipient errors. 107 + // given member DID. Template and From address vary per recipient. 108 + // Returns the number sent and any per-recipient errors. 
64 109 func (w *WarmupSender) SendBatch(ctx context.Context, did string) (*WarmupResult, error) { 65 110 if len(w.seedAddresses) == 0 { 66 111 return nil, fmt.Errorf("no warmup seed addresses configured") ··· 75 120 } 76 121 77 122 domain := member.Domains[0] 78 - from := "postmaster@" + domain.Domain 79 - 123 + templates := warmupTemplates() 80 124 result := &WarmupResult{} 81 - for _, to := range w.seedAddresses { 82 - msgID := fmt.Sprintf("<%d.warmup@%s>", time.Now().UnixNano(), w.relayDomain) 83 - msg := buildWarmupMessage(from, to, msgID, domain.Domain) 84 125 85 - verpFrom := VERPReturnPath(did, to, w.relayDomain) 126 + for i, to := range w.seedAddresses { 127 + fromLocal := w.fromLocalParts[i%len(w.fromLocalParts)] 128 + from := fromLocal + "@" + domain.Domain 129 + tmpl := templates[i%len(templates)] 86 130 87 - raw := []byte(msg) 88 - stamped := append([]byte("X-Atmos-Member-Did: "+did+"\r\n"), raw...) 89 - stamped = PrependFeedbackID(stamped, "transactional", did, domain.Domain) 131 + msgID := fmt.Sprintf("<%d.warmup@%s>", time.Now().UnixNano(), w.relayDomain) 132 + msg := buildWarmupMessage(from, to, msgID, tmpl) 90 133 91 - signer := NewDualDomainSigner(domain.DKIMKeys, w.operatorKeys, domain.Domain, w.operatorDKIMDomain) 92 - signed, err := signer.Sign(strings.NewReader(string(stamped))) 93 - if err != nil { 134 + if err := w.sendMessage(ctx, did, from, to, msgID, msg, domain); err != nil { 94 135 result.Failed++ 95 - result.Errors = append(result.Errors, fmt.Sprintf("%s: DKIM sign: %v", to, err)) 96 - continue 136 + result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", to, err)) 137 + } else { 138 + result.Sent++ 97 139 } 140 + } 98 141 99 - entryID := int64(0) 100 - if w.insertMessage != nil { 101 - id, err := w.insertMessage(ctx, did, from, to, msgID) 102 - if err != nil { 103 - log.Printf("warmup.insert_message: did=%s to=%s error=%v", did, to, err) 104 - } else { 105 - entryID = id 106 - } 107 - } 108 - if w.incrSendCount != nil { 109 - w.incrSendCount(ctx, did) 110 - } 142 + return result, nil 143 + } 144 + 145 + func (w *WarmupSender) sendMessage(ctx context.Context, did, from, to, msgID, msg string, domain DomainInfo) error { 146 + verpFrom := VERPReturnPath(did, to, w.relayDomain) 111 147 112 - if err := w.queue.Enqueue(&QueueEntry{ 113 - ID: entryID, 114 - From: verpFrom, 115 - To: to, 116 - Data: signed, 117 - MemberDID: did, 118 - }); err != nil { 119 - result.Failed++ 120 - result.Errors = append(result.Errors, fmt.Sprintf("%s: enqueue: %v", to, err)) 121 - continue 148 + raw := []byte(msg) 149 + stamped := append([]byte("X-Atmos-Member-Did: "+did+"\r\n"), raw...) 
150 + stamped = PrependFeedbackID(stamped, "transactional", did, domain.Domain) 151 + 152 + signer := NewDualDomainSigner(domain.DKIMKeys, w.operatorKeys, domain.Domain, w.operatorDKIMDomain) 153 + signed, err := signer.Sign(strings.NewReader(string(stamped))) 154 + if err != nil { 155 + return fmt.Errorf("DKIM sign: %w", err) 156 + } 157 + 158 + entryID := int64(0) 159 + if w.insertMessage != nil { 160 + id, err := w.insertMessage(ctx, did, from, to, msgID) 161 + if err != nil { 162 + log.Printf("warmup.insert_message: did=%s to=%s error=%v", did, to, err) 163 + } else { 164 + entryID = id 122 165 } 166 + } 167 + if w.incrSendCount != nil { 168 + w.incrSendCount(ctx, did) 169 + } 123 170 124 - result.Sent++ 125 - log.Printf("warmup.queued: did=%s to=%s msg_id=%s", did, to, msgID) 171 + if err := w.queue.Enqueue(&QueueEntry{ 172 + ID: entryID, 173 + From: verpFrom, 174 + To: to, 175 + Data: signed, 176 + MemberDID: did, 177 + }); err != nil { 178 + return fmt.Errorf("enqueue: %w", err) 126 179 } 127 180 128 - return result, nil 181 + log.Printf("warmup.queued: did=%s from=%s to=%s msg_id=%s", did, from, to, msgID) 182 + return nil 129 183 } 130 184 131 185 type warmupTemplate struct { ··· 133 187 body string 134 188 } 135 189 136 - func warmupTemplates(domain string) []warmupTemplate { 190 + func warmupTemplates() []warmupTemplate { 137 191 return []warmupTemplate{ 138 192 { 139 - subject: "Re: setting up email for " + domain, 140 - body: "Hi,\r\n\r\n" + 141 - "Just following up — the email configuration for " + domain + " is all set. DKIM signatures are being applied correctly and everything looks good on our end.\r\n\r\n" + 142 - "Let me know if you run into any issues or have questions about the setup.\r\n\r\n" + 143 - "Best,\r\n" + 193 + subject: "Thursday lunch spot", 194 + body: "Hey,\r\n\r\n" + 195 + "Are you free Thursday? I was thinking we could try that new place on 4th. I heard they do a good cubano.\r\n\r\n" + 196 + "Let me know — I can reserve a table if we go around noon.\r\n\r\n" + 197 + "Scott", 198 + }, 199 + { 200 + subject: "Re: that article you sent", 201 + body: "Just read through it — really interesting point about how federated systems handle trust differently than centralized ones. " + 202 + "The section on cooperative infrastructure reminded me of some things we've been thinking about.\r\n\r\n" + 203 + "Have you seen the follow-up post the author did? I'll dig up the link.\r\n\r\n" + 144 204 "Scott", 145 205 }, 146 206 { 147 - subject: "Quick note about " + domain, 207 + subject: "Weekend plans?", 148 208 body: "Hey,\r\n\r\n" + 149 - "Wanted to let you know that " + domain + " is fully configured and sending through the relay. The DKIM and SPF records are aligned, so messages should be landing in inboxes without any trouble.\r\n\r\n" + 150 - "The cooperative relay model means your domain benefits from shared reputation across all members, which is especially helpful for newer domains that haven't built up their own sending history yet.\r\n\r\n" + 151 - "Thanks,\r\n" + 209 + "Any plans this weekend? I was going to do a hike if the weather holds up. The forecast looks decent but you never know around here.\r\n\r\n" + 210 + "Also — I finally finished that book you recommended. The ending was not what I expected. We should talk about it.\r\n\r\n" + 211 + "Scott", 212 + }, 213 + { 214 + subject: "quick favor", 215 + body: "Hey, can you send me that recipe you mentioned last time? " + 216 + "The one with the roasted peppers. 
I want to try making it this week.\r\n\r\n" + 217 + "Thanks!\r\n" + 218 + "Scott", 219 + }, 220 + { 221 + subject: "Re: meeting notes", 222 + body: "Thanks for sending these over. I think the timeline in section 3 is a bit aggressive but everything else looks right to me.\r\n\r\n" + 223 + "One thought — should we loop in the design team before we commit to the API contract? " + 224 + "Might save us a round of changes later.\r\n\r\n" + 225 + "Let me know what you think.\r\n\r\n" + 226 + "Scott", 227 + }, 228 + { 229 + subject: "coffee machine recs", 230 + body: "I'm finally replacing my old drip machine. Do you still like your Breville? " + 231 + "I've been going back and forth between that and just getting a simple pour-over setup.\r\n\r\n" + 232 + "Budget is flexible but I don't want something that takes 20 minutes to clean.\r\n\r\n" + 152 233 "Scott", 153 234 }, 154 235 { 155 - subject: domain + " is looking good", 156 - body: "Hi,\r\n\r\n" + 157 - "Everything is running well for " + domain + ". Wanted to drop a quick note to confirm that outbound messages are being signed and delivered as expected.\r\n\r\n" + 158 - "One thing worth mentioning — each message gets two DKIM signatures: one for your domain and one for the relay pool. This gives receiving mail servers two independent ways to verify authenticity, which generally helps with inbox placement.\r\n\r\n" + 159 - "Cheers,\r\n" + 236 + subject: "Saw this and thought of you", 237 + body: "There's a talk at the library next Tuesday about local history — the speaker is that author who wrote the book about the old rail lines. " + 238 + "Starts at 7pm. Free admission.\r\n\r\n" + 239 + "Want to go? I can drive.\r\n\r\n" + 240 + "Scott", 241 + }, 242 + { 243 + subject: "Re: printer issue", 244 + body: "Try power cycling it — unplug for 30 seconds, then plug back in. " + 245 + "If that doesn't work, check if there's a firmware update. Mine had the same problem and updating fixed it.\r\n\r\n" + 246 + "If it's still stuck after that let me know and I'll come take a look.\r\n\r\n" + 160 247 "Scott", 161 248 }, 162 249 { 163 - subject: "Checking in — " + domain, 164 - body: "Hey,\r\n\r\n" + 165 - "Just checking in on " + domain + ". The mail pipeline is healthy and I don't see any issues on our side.\r\n\r\n" + 166 - "If you've been seeing good deliverability, that's great — the shared IP reputation pool is working as intended. If anything looks off, just let me know and I can take a closer look at the logs.\r\n\r\n" + 167 - "Best,\r\n" + 250 + subject: "Happy birthday!", 251 + body: "Hope you have a great one today! Any big plans?\r\n\r\n" + 252 + "We should get dinner sometime this week to celebrate. My treat.\r\n\r\n" + 168 253 "Scott", 169 254 }, 170 255 { 171 - subject: "All good with " + domain, 172 - body: "Hi,\r\n\r\n" + 173 - "Touching base to confirm " + domain + " is in good shape. The relay is processing your outbound mail normally, and authentication records are passing validation.\r\n\r\n" + 174 - "For context, Atmosphere Mail is a cooperative relay built for the AT Protocol ecosystem. The idea is that smaller self-hosted services can share IP reputation instead of each one starting from scratch with a cold IP address. Happy to answer any questions about how it works.\r\n\r\n" + 175 - "Thanks,\r\n" + 256 + subject: "parking situation tomorrow", 257 + body: "Heads up — they're doing construction on the south lot tomorrow so we'll need to use the garage on 2nd. 
" + 258 + "I'd get there a bit early, it fills up fast.\r\n\r\n" + 259 + "See you there.\r\n\r\n" + 176 260 "Scott", 177 261 }, 178 262 } 179 263 } 180 264 181 - func buildWarmupMessage(from, to, msgID, domain string) string { 182 - templates := warmupTemplates(domain) 183 - idx := int(time.Now().Unix()/60) % len(templates) 184 - t := templates[idx] 185 - 265 + func buildWarmupMessage(from, to, msgID string, t warmupTemplate) string { 186 266 return strings.Join([]string{ 187 267 "From: " + from, 188 268 "To: " + to,
+240
internal/relay/warmup_scheduler.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "context" 7 + "log" 8 + "math/rand/v2" 9 + "sync" 10 + "time" 11 + ) 12 + 13 + // MemberWarmupCandidate carries the per-member info the scheduler needs 14 + // to make a fair selection (#219). DID is required; CreatedAt is used to 15 + // boost newly-enrolled members so they reach mailbox-provider visibility 16 + // faster than long-tenured ones who already have a sending history. 17 + type MemberWarmupCandidate struct { 18 + DID string 19 + CreatedAt time.Time 20 + } 21 + 22 + // WarmupScheduler drips warmup sends across the day instead of firing 23 + // them all at once. Each tick sends one email to one seed address for 24 + // one member, then waits before the next. This produces the organic 25 + // send pattern that mailbox providers expect from real human senders. 26 + // 27 + // Selection is rotation-fair (every eligible member gets warmed up 28 + // before any one repeats) with a tiebreaker that prefers newly-enrolled 29 + // members so a long-tenured member can't crowd out new enrollees on 30 + // the first iteration through the pool. See #219. 31 + type WarmupScheduler struct { 32 + sender *WarmupSender 33 + listCandidates func(ctx context.Context) ([]MemberWarmupCandidate, error) 34 + interval time.Duration 35 + jitter time.Duration 36 + 37 + mu sync.Mutex 38 + running bool 39 + cancelFunc context.CancelFunc 40 + 41 + // lastWarmedUp tracks per-DID lastWarmupAt for fairness. Process-local; 42 + // a restart resets the rotation, which at worst over-warms one member 43 + // before the wheel re-balances. Persistence is a future enhancement 44 + // covered by the spec note "Track 'last_warmup_at' per member" but is 45 + // not required for the fairness invariant within a process lifetime. 46 + lastMu sync.Mutex 47 + lastWarmedUp map[string]time.Time 48 + now func() time.Time 49 + } 50 + 51 + // WarmupSchedulerConfig configures the background warmup scheduler. 52 + type WarmupSchedulerConfig struct { 53 + Sender *WarmupSender 54 + 55 + // ListCandidates returns active warmup-eligible members with their 56 + // enrollment timestamps. Preferred over ListDIDs because it lets the 57 + // fairness algorithm boost newly-enrolled members. 58 + ListCandidates func(ctx context.Context) ([]MemberWarmupCandidate, error) 59 + 60 + // ListDIDs is the legacy callback returning DIDs without timestamps. 61 + // When ListCandidates is nil the scheduler falls back to ListDIDs; 62 + // the resulting candidates have CreatedAt = zero, which makes them 63 + // equivalent for the boost tiebreaker. Members still rotate via 64 + // lastWarmupAt tracking. 65 + ListDIDs func(ctx context.Context) ([]string, error) 66 + 67 + Interval time.Duration // base time between sends (default 20min) 68 + Jitter time.Duration // max random jitter (default 10min) 69 + 70 + // Now overrides time.Now for tests; defaults to time.Now. 
71 + Now func() time.Time 72 + } 73 + 74 + func NewWarmupScheduler(cfg WarmupSchedulerConfig) *WarmupScheduler { 75 + interval := cfg.Interval 76 + if interval == 0 { 77 + interval = 20 * time.Minute 78 + } 79 + jitter := cfg.Jitter 80 + if jitter == 0 { 81 + jitter = 10 * time.Minute 82 + } 83 + now := cfg.Now 84 + if now == nil { 85 + now = time.Now 86 + } 87 + 88 + listCandidates := cfg.ListCandidates 89 + if listCandidates == nil && cfg.ListDIDs != nil { 90 + legacy := cfg.ListDIDs 91 + listCandidates = func(ctx context.Context) ([]MemberWarmupCandidate, error) { 92 + dids, err := legacy(ctx) 93 + if err != nil { 94 + return nil, err 95 + } 96 + out := make([]MemberWarmupCandidate, len(dids)) 97 + for i, d := range dids { 98 + out[i] = MemberWarmupCandidate{DID: d} 99 + } 100 + return out, nil 101 + } 102 + } 103 + 104 + return &WarmupScheduler{ 105 + sender: cfg.Sender, 106 + listCandidates: listCandidates, 107 + interval: interval, 108 + jitter: jitter, 109 + lastWarmedUp: map[string]time.Time{}, 110 + now: now, 111 + } 112 + } 113 + 114 + // Start begins the background warmup loop. Safe to call multiple times; 115 + // subsequent calls are no-ops if already running. 116 + func (s *WarmupScheduler) Start(ctx context.Context) { 117 + s.mu.Lock() 118 + defer s.mu.Unlock() 119 + if s.running { 120 + return 121 + } 122 + s.running = true 123 + ctx, s.cancelFunc = context.WithCancel(ctx) 124 + go s.loop(ctx) 125 + log.Printf("warmup.scheduler: started interval=%s jitter=%s seeds=%d", 126 + s.interval, s.jitter, s.sender.SeedCount()) 127 + } 128 + 129 + // Stop halts the background warmup loop. 130 + func (s *WarmupScheduler) Stop() { 131 + s.mu.Lock() 132 + defer s.mu.Unlock() 133 + if !s.running { 134 + return 135 + } 136 + s.cancelFunc() 137 + s.running = false 138 + log.Printf("warmup.scheduler: stopped") 139 + } 140 + 141 + func (s *WarmupScheduler) loop(ctx context.Context) { 142 + defer func() { 143 + s.mu.Lock() 144 + s.running = false 145 + s.mu.Unlock() 146 + }() 147 + 148 + for { 149 + wait := s.interval + time.Duration(rand.Int64N(int64(s.jitter))) 150 + select { 151 + case <-ctx.Done(): 152 + return 153 + case <-time.After(wait): 154 + s.tick(ctx) 155 + } 156 + } 157 + } 158 + 159 + func (s *WarmupScheduler) tick(ctx context.Context) { 160 + candidates, err := s.listCandidates(ctx) 161 + if err != nil { 162 + log.Printf("warmup.scheduler: list members: %v", err) 163 + return 164 + } 165 + if len(candidates) == 0 { 166 + return 167 + } 168 + 169 + seedCount := s.sender.SeedCount() 170 + if seedCount == 0 { 171 + return 172 + } 173 + 174 + picked := s.SelectMember(candidates) 175 + recipientIdx := rand.IntN(seedCount) 176 + 177 + result, err := s.sender.SendOne(ctx, picked.DID, recipientIdx) 178 + if err != nil { 179 + log.Printf("warmup.scheduler: did=%s error=%v", picked.DID, err) 180 + return 181 + } 182 + 183 + if result.Sent > 0 { 184 + s.recordWarmup(picked.DID) 185 + log.Printf("warmup.scheduler: did=%s seed=%d sent=1", picked.DID, recipientIdx) 186 + } 187 + if result.Failed > 0 { 188 + log.Printf("warmup.scheduler: did=%s seed=%d failed=1 errors=%v", picked.DID, recipientIdx, result.Errors) 189 + } 190 + } 191 + 192 + // SelectMember picks the candidate most due for a warmup send. Exported 193 + // so tests can pin the fairness invariant directly. 194 + // 195 + // Algorithm: oldest lastWarmupAt wins (rotation fairness — every member 196 + // gets warmed up before any single one repeats). 
Tiebreaker: newest 197 + // CreatedAt wins (boost recent enrollees so a flood of pre-existing 198 + // members can't starve a new one). Members never warmed up have 199 + // lastWarmupAt = zero, which always sorts before any non-zero time, so 200 + // they're always picked before re-warming an already-warmed member. 201 + func (s *WarmupScheduler) SelectMember(candidates []MemberWarmupCandidate) MemberWarmupCandidate { 202 + if len(candidates) == 0 { 203 + return MemberWarmupCandidate{} 204 + } 205 + 206 + s.lastMu.Lock() 207 + defer s.lastMu.Unlock() 208 + 209 + best := candidates[0] 210 + bestLast := s.lastWarmedUp[best.DID] 211 + 212 + for _, c := range candidates[1:] { 213 + last := s.lastWarmedUp[c.DID] 214 + switch { 215 + case last.Before(bestLast): 216 + best, bestLast = c, last 217 + case last.Equal(bestLast): 218 + if c.CreatedAt.After(best.CreatedAt) { 219 + best, bestLast = c, last 220 + } 221 + } 222 + } 223 + return best 224 + } 225 + 226 + // recordWarmup stamps the lastWarmupAt for a DID after a successful send. 227 + func (s *WarmupScheduler) recordWarmup(did string) { 228 + s.lastMu.Lock() 229 + defer s.lastMu.Unlock() 230 + s.lastWarmedUp[did] = s.now() 231 + } 232 + 233 + // LastWarmedUp returns the last-warmup timestamp for a DID. Returns the 234 + // zero time if the DID has never been warmed up by this scheduler. 235 + // Test/diagnostic helper. 236 + func (s *WarmupScheduler) LastWarmedUp(did string) time.Time { 237 + s.lastMu.Lock() 238 + defer s.lastMu.Unlock() 239 + return s.lastWarmedUp[did] 240 + }
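A sketch of how cmd/relay might wire the scheduler, assuming a store helper that lists active members with their enrollment timestamps — ListActiveMembers is an illustrative name, not an existing store method; the config fields are the ones defined above:

warmupSched := relay.NewWarmupScheduler(relay.WarmupSchedulerConfig{
	Sender: warmupSender,
	ListCandidates: func(ctx context.Context) ([]relay.MemberWarmupCandidate, error) {
		members, err := store.ListActiveMembers(ctx) // assumed helper
		if err != nil {
			return nil, err
		}
		out := make([]relay.MemberWarmupCandidate, 0, len(members))
		for _, m := range members {
			out = append(out, relay.MemberWarmupCandidate{DID: m.DID, CreatedAt: m.CreatedAt})
		}
		return out, nil
	},
	Interval: 20 * time.Minute,
	Jitter:   10 * time.Minute,
})
warmupSched.Start(ctx)
defer warmupSched.Stop()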
+331
internal/relay/warmup_scheduler_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relay 4 + 5 + import ( 6 + "context" 7 + "sync/atomic" 8 + "testing" 9 + "time" 10 + ) 11 + 12 + func TestWarmupScheduler_TickSendsOneEmail(t *testing.T) { 13 + var sent atomic.Int32 14 + 15 + opKeys := testDKIMKeys(t) 16 + memberKeys := testDKIMKeys(t) 17 + ws := &WarmupSender{ 18 + seedAddresses: []string{"a@test.com", "b@test.com"}, 19 + fromLocalParts: []string{"scott", "hello"}, 20 + operatorDKIMDomain: "atmos.email", 21 + operatorKeys: opKeys, 22 + relayDomain: "smtp.atmos.email", 23 + memberLookup: func(ctx context.Context, did string) (*MemberWithDomains, error) { 24 + return &MemberWithDomains{ 25 + DID: did, 26 + Status: "active", 27 + Domains: []DomainInfo{ 28 + {Domain: "example.com", DKIMKeys: memberKeys}, 29 + }, 30 + }, nil 31 + }, 32 + queue: newTestQueue(), 33 + insertMessage: func(ctx context.Context, did, from, to, msgID string) (int64, error) { 34 + return 1, nil 35 + }, 36 + incrSendCount: func(ctx context.Context, did string) { 37 + sent.Add(1) 38 + }, 39 + } 40 + 41 + sched := NewWarmupScheduler(WarmupSchedulerConfig{ 42 + Sender: ws, 43 + ListDIDs: func(ctx context.Context) ([]string, error) { 44 + return []string{"did:plc:test"}, nil 45 + }, 46 + Interval: 100 * time.Millisecond, 47 + Jitter: 1 * time.Millisecond, 48 + }) 49 + 50 + ctx, cancel := context.WithCancel(context.Background()) 51 + sched.Start(ctx) 52 + t.Cleanup(func() { 53 + sched.Stop() 54 + cancel() 55 + }) 56 + 57 + time.Sleep(1 * time.Second) 58 + 59 + count := sent.Load() 60 + if count < 2 { 61 + t.Errorf("expected at least 2 sends, got %d", count) 62 + } 63 + } 64 + 65 + func TestWarmupScheduler_NoMembersNoSends(t *testing.T) { 66 + var sent atomic.Int32 67 + 68 + ws := &WarmupSender{ 69 + seedAddresses: []string{"a@test.com"}, 70 + fromLocalParts: []string{"scott"}, 71 + relayDomain: "smtp.atmos.email", 72 + memberLookup: func(ctx context.Context, did string) (*MemberWithDomains, error) { 73 + return nil, nil 74 + }, 75 + queue: newTestQueue(), 76 + incrSendCount: func(ctx context.Context, did string) { 77 + sent.Add(1) 78 + }, 79 + } 80 + 81 + sched := NewWarmupScheduler(WarmupSchedulerConfig{ 82 + Sender: ws, 83 + ListDIDs: func(ctx context.Context) ([]string, error) { 84 + return nil, nil 85 + }, 86 + Interval: 10 * time.Millisecond, 87 + Jitter: 1 * time.Millisecond, 88 + }) 89 + 90 + ctx, cancel := context.WithCancel(context.Background()) 91 + sched.Start(ctx) 92 + t.Cleanup(func() { 93 + sched.Stop() 94 + cancel() 95 + }) 96 + 97 + time.Sleep(100 * time.Millisecond) 98 + 99 + if sent.Load() != 0 { 100 + t.Errorf("expected 0 sends with no members, got %d", sent.Load()) 101 + } 102 + } 103 + 104 + func TestWarmupScheduler_StartStopIdempotent(t *testing.T) { 105 + ws := &WarmupSender{ 106 + seedAddresses: []string{"a@test.com"}, 107 + fromLocalParts: []string{"scott"}, 108 + } 109 + 110 + sched := NewWarmupScheduler(WarmupSchedulerConfig{ 111 + Sender: ws, 112 + ListDIDs: func(ctx context.Context) ([]string, error) { 113 + return nil, nil 114 + }, 115 + }) 116 + 117 + ctx := context.Background() 118 + 119 + sched.Start(ctx) 120 + sched.Start(ctx) 121 + 122 + sched.Stop() 123 + sched.Stop() 124 + } 125 + 126 + func TestSendOne_VariesTemplateAndFrom(t *testing.T) { 127 + type sendRecord struct { 128 + from string 129 + to string 130 + } 131 + var sends []sendRecord 132 + 133 + opKeys := testDKIMKeys(t) 134 + memberKeys := testDKIMKeys(t) 135 + ws := &WarmupSender{ 136 + seedAddresses: []string{"a@test.com", "b@test.com", 
"c@test.com"}, 137 + fromLocalParts: []string{"scott", "hello"}, 138 + operatorDKIMDomain: "atmos.email", 139 + operatorKeys: opKeys, 140 + relayDomain: "smtp.atmos.email", 141 + memberLookup: func(ctx context.Context, did string) (*MemberWithDomains, error) { 142 + return &MemberWithDomains{ 143 + DID: did, 144 + Status: "active", 145 + Domains: []DomainInfo{ 146 + {Domain: "example.com", DKIMKeys: memberKeys}, 147 + }, 148 + }, nil 149 + }, 150 + queue: newTestQueue(), 151 + insertMessage: func(ctx context.Context, did, from, to, msgID string) (int64, error) { 152 + sends = append(sends, sendRecord{from: from, to: to}) 153 + return int64(len(sends)), nil 154 + }, 155 + incrSendCount: func(ctx context.Context, did string) {}, 156 + } 157 + 158 + ctx := context.Background() 159 + for i := 0; i < 3; i++ { 160 + _, err := ws.SendOne(ctx, "did:plc:test", i) 161 + if err != nil { 162 + t.Fatalf("SendOne(%d): %v", i, err) 163 + } 164 + } 165 + 166 + if len(sends) != 3 { 167 + t.Fatalf("expected 3 sends, got %d", len(sends)) 168 + } 169 + 170 + if sends[0].from != "scott@example.com" { 171 + t.Errorf("send 0 from = %s, want scott@example.com", sends[0].from) 172 + } 173 + if sends[1].from != "hello@example.com" { 174 + t.Errorf("send 1 from = %s, want hello@example.com", sends[1].from) 175 + } 176 + if sends[2].from != "scott@example.com" { 177 + t.Errorf("send 2 from = %s, want scott@example.com", sends[2].from) 178 + } 179 + } 180 + 181 + func newTestQueue() *Queue { 182 + return NewQueue(func(r DeliveryResult) {}, DefaultQueueConfig()) 183 + } 184 + 185 + func testDKIMKeys(t *testing.T) *DKIMKeys { 186 + t.Helper() 187 + keys, err := GenerateDKIMKeys("test20260101") 188 + if err != nil { 189 + t.Fatalf("GenerateDKIMKeys: %v", err) 190 + } 191 + return keys 192 + } 193 + 194 + // --- Fairness selection (#219) --- 195 + 196 + func TestSelectMember_PreferUnwarmedOverWarmed(t *testing.T) { 197 + s := NewWarmupScheduler(WarmupSchedulerConfig{ 198 + Sender: &WarmupSender{}, 199 + ListDIDs: func(ctx context.Context) ([]string, error) { return nil, nil }, 200 + }) 201 + now := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC) 202 + s.now = func() time.Time { return now } 203 + 204 + candidates := []MemberWarmupCandidate{ 205 + {DID: "did:warmed", CreatedAt: now.Add(-30 * 24 * time.Hour)}, 206 + {DID: "did:fresh"}, 207 + } 208 + s.recordWarmup("did:warmed") 209 + 210 + picked := s.SelectMember(candidates) 211 + if picked.DID != "did:fresh" { 212 + t.Errorf("picked %q, want did:fresh — never-warmed member must outrank already-warmed", picked.DID) 213 + } 214 + } 215 + 216 + func TestSelectMember_RotatesThroughEveryone(t *testing.T) { 217 + // 5 candidates → first 5 ticks must touch all 5 distinct DIDs before 218 + // any one repeats. This is the rotation-fairness invariant. 
219 + s := NewWarmupScheduler(WarmupSchedulerConfig{ 220 + Sender: &WarmupSender{}, 221 + ListDIDs: func(ctx context.Context) ([]string, error) { return nil, nil }, 222 + }) 223 + tick := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC) 224 + s.now = func() time.Time { return tick } 225 + 226 + candidates := []MemberWarmupCandidate{ 227 + {DID: "a", CreatedAt: tick.Add(-60 * 24 * time.Hour)}, 228 + {DID: "b", CreatedAt: tick.Add(-50 * 24 * time.Hour)}, 229 + {DID: "c", CreatedAt: tick.Add(-40 * 24 * time.Hour)}, 230 + {DID: "d", CreatedAt: tick.Add(-30 * 24 * time.Hour)}, 231 + {DID: "e", CreatedAt: tick.Add(-20 * 24 * time.Hour)}, 232 + } 233 + 234 + seen := map[string]bool{} 235 + for i := 0; i < 5; i++ { 236 + picked := s.SelectMember(candidates) 237 + if seen[picked.DID] { 238 + t.Errorf("tick %d picked %q again before rotation completed: seen=%v", i, picked.DID, seen) 239 + } 240 + seen[picked.DID] = true 241 + // Advance simulated time so each warmup gets a distinct timestamp, 242 + // otherwise everyone ties at the same "lastWarmedUp" and the 243 + // CreatedAt tiebreaker dominates instead of rotation order. 244 + tick = tick.Add(1 * time.Minute) 245 + s.recordWarmup(picked.DID) 246 + } 247 + if len(seen) != 5 { 248 + t.Errorf("after 5 ticks saw %d distinct members, want 5", len(seen)) 249 + } 250 + } 251 + 252 + func TestSelectMember_BoostsNewerEnrolleeOnTie(t *testing.T) { 253 + // Two never-warmed candidates → both have lastWarmupAt = zero so the 254 + // tiebreaker fires. The newer enrollee must win. 255 + s := NewWarmupScheduler(WarmupSchedulerConfig{ 256 + Sender: &WarmupSender{}, 257 + ListDIDs: func(ctx context.Context) ([]string, error) { return nil, nil }, 258 + }) 259 + now := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC) 260 + s.now = func() time.Time { return now } 261 + 262 + candidates := []MemberWarmupCandidate{ 263 + {DID: "tenured", CreatedAt: now.Add(-365 * 24 * time.Hour)}, 264 + {DID: "newbie", CreatedAt: now.Add(-2 * 24 * time.Hour)}, 265 + } 266 + picked := s.SelectMember(candidates) 267 + if picked.DID != "newbie" { 268 + t.Errorf("picked %q, want newbie — boost tiebreaker should prefer recent enrollee", picked.DID) 269 + } 270 + } 271 + 272 + func TestSelectMember_NoCandidatesReturnsZero(t *testing.T) { 273 + s := NewWarmupScheduler(WarmupSchedulerConfig{ 274 + Sender: &WarmupSender{}, 275 + ListDIDs: func(ctx context.Context) ([]string, error) { return nil, nil }, 276 + }) 277 + picked := s.SelectMember(nil) 278 + if picked.DID != "" { 279 + t.Errorf("empty input picked %q, want empty", picked.DID) 280 + } 281 + } 282 + 283 + func TestSelectMember_SecondRoundRotatesAgain(t *testing.T) { 284 + // After a full rotation, the second round must again touch every 285 + // member before any single one repeats — proving fairness across 286 + // rounds, not just within one. 
287 + s := NewWarmupScheduler(WarmupSchedulerConfig{ 288 + Sender: &WarmupSender{}, 289 + ListDIDs: func(ctx context.Context) ([]string, error) { return nil, nil }, 290 + }) 291 + tick := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC) 292 + s.now = func() time.Time { return tick } 293 + 294 + candidates := []MemberWarmupCandidate{ 295 + {DID: "a", CreatedAt: tick.Add(-30 * 24 * time.Hour)}, 296 + {DID: "b", CreatedAt: tick.Add(-20 * 24 * time.Hour)}, 297 + {DID: "c", CreatedAt: tick.Add(-10 * 24 * time.Hour)}, 298 + } 299 + 300 + picks := []string{} 301 + for i := 0; i < 6; i++ { 302 + picked := s.SelectMember(candidates) 303 + picks = append(picks, picked.DID) 304 + tick = tick.Add(1 * time.Minute) 305 + s.recordWarmup(picked.DID) 306 + } 307 + 308 + // First 3 must be a permutation of {a,b,c}; same for next 3. 309 + first := map[string]bool{picks[0]: true, picks[1]: true, picks[2]: true} 310 + second := map[string]bool{picks[3]: true, picks[4]: true, picks[5]: true} 311 + if len(first) != 3 || len(second) != 3 { 312 + t.Errorf("rounds not full rotations: picks=%v", picks) 313 + } 314 + } 315 + 316 + func TestSelectMember_LastWarmedUpRecorded(t *testing.T) { 317 + s := NewWarmupScheduler(WarmupSchedulerConfig{ 318 + Sender: &WarmupSender{}, 319 + ListDIDs: func(ctx context.Context) ([]string, error) { return nil, nil }, 320 + }) 321 + now := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC) 322 + s.now = func() time.Time { return now } 323 + 324 + if !s.LastWarmedUp("did:never").IsZero() { 325 + t.Error("never-warmed DID should return zero time") 326 + } 327 + s.recordWarmup("did:warmed") 328 + if got := s.LastWarmedUp("did:warmed"); !got.Equal(now) { 329 + t.Errorf("LastWarmedUp = %v, want %v", got, now) 330 + } 331 + }
+162
internal/relaystore/bypass_audit_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relaystore 4 + 5 + import ( 6 + "context" 7 + "testing" 8 + "time" 9 + ) 10 + 11 + // TestInsertBypassDID_WritesAuditRow ensures every add/remove leaves a 12 + // reconstructable trail in bypass_audit even when the bypass_dids row 13 + // is later overwritten or deleted (#213). 14 + func TestInsertBypassDID_WritesAuditRow(t *testing.T) { 15 + s := testStore(t) 16 + ctx := context.Background() 17 + did := "did:plc:auditme" 18 + expiry := time.Now().UTC().Add(2 * time.Hour) 19 + 20 + if err := s.InsertBypassDID(ctx, did, expiry, "investigating sender flood"); err != nil { 21 + t.Fatalf("InsertBypassDID: %v", err) 22 + } 23 + 24 + var n int 25 + if err := s.db.QueryRowContext(ctx, 26 + `SELECT COUNT(*) FROM bypass_audit WHERE did = ? AND action = 'add'`, did, 27 + ).Scan(&n); err != nil { 28 + t.Fatalf("count audit: %v", err) 29 + } 30 + if n != 1 { 31 + t.Errorf("audit add count = %d, want 1", n) 32 + } 33 + 34 + if err := s.DeleteBypassDID(ctx, did, "manual"); err != nil { 35 + t.Fatalf("DeleteBypassDID: %v", err) 36 + } 37 + if err := s.db.QueryRowContext(ctx, 38 + `SELECT COUNT(*) FROM bypass_audit WHERE did = ? AND action = 'remove'`, did, 39 + ).Scan(&n); err != nil { 40 + t.Fatalf("count audit: %v", err) 41 + } 42 + if n != 1 { 43 + t.Errorf("audit remove count = %d, want 1", n) 44 + } 45 + } 46 + 47 + // TestListBypassDIDs_ExcludesExpired pins the central security 48 + // invariant: a bypass entry whose expires_at has passed must NOT 49 + // appear in the active list, even if PurgeExpiredBypassDIDs hasn't 50 + // run yet. This way the label checker reload path is also safe. 51 + func TestListBypassDIDs_ExcludesExpired(t *testing.T) { 52 + s := testStore(t) 53 + ctx := context.Background() 54 + 55 + past := time.Now().UTC().Add(-time.Hour) 56 + future := time.Now().UTC().Add(time.Hour) 57 + 58 + if err := s.InsertBypassDID(ctx, "did:plc:expired", past, ""); err != nil { 59 + t.Fatalf("insert expired: %v", err) 60 + } 61 + if err := s.InsertBypassDID(ctx, "did:plc:active", future, ""); err != nil { 62 + t.Fatalf("insert active: %v", err) 63 + } 64 + 65 + got, err := s.ListBypassDIDs(ctx) 66 + if err != nil { 67 + t.Fatalf("ListBypassDIDs: %v", err) 68 + } 69 + if len(got) != 1 || got[0] != "did:plc:active" { 70 + t.Errorf("ListBypassDIDs = %v, want [did:plc:active] only", got) 71 + } 72 + } 73 + 74 + // TestListBypassDIDs_KeepsLegacyPermanent — entries migrated from the 75 + // pre-#213 schema have expires_at='' and represent already-deployed 76 + // permanent bypasses. We must not retroactively evict them on the 77 + // migration; an operator has to convert them by re-adding with expiry. 78 + func TestListBypassDIDs_KeepsLegacyPermanent(t *testing.T) { 79 + s := testStore(t) 80 + ctx := context.Background() 81 + 82 + // Simulate a migrated row with empty expires_at. 
83 + if _, err := s.db.ExecContext(ctx, 84 + `INSERT INTO bypass_dids (did, expires_at, reason, created_at) VALUES (?, '', '', '')`, 85 + "did:plc:legacy", 86 + ); err != nil { 87 + t.Fatalf("seed legacy row: %v", err) 88 + } 89 + 90 + got, err := s.ListBypassDIDs(ctx) 91 + if err != nil { 92 + t.Fatalf("ListBypassDIDs: %v", err) 93 + } 94 + if len(got) != 1 || got[0] != "did:plc:legacy" { 95 + t.Errorf("legacy permanent entry dropped from list: got %v", got) 96 + } 97 + } 98 + 99 + // TestPurgeExpiredBypassDIDs_RemovesExpiredOnly confirms the janitor 100 + // path: returns the count of evicted DIDs and writes 'expired' audit 101 + // rows so post-hoc analysis can distinguish janitor from manual. 102 + func TestPurgeExpiredBypassDIDs_RemovesExpiredOnly(t *testing.T) { 103 + s := testStore(t) 104 + ctx := context.Background() 105 + 106 + past := time.Now().UTC().Add(-time.Hour) 107 + future := time.Now().UTC().Add(time.Hour) 108 + if err := s.InsertBypassDID(ctx, "did:plc:e1", past, ""); err != nil { 109 + t.Fatalf("seed: %v", err) 110 + } 111 + if err := s.InsertBypassDID(ctx, "did:plc:e2", past, ""); err != nil { 112 + t.Fatalf("seed: %v", err) 113 + } 114 + if err := s.InsertBypassDID(ctx, "did:plc:keep", future, ""); err != nil { 115 + t.Fatalf("seed: %v", err) 116 + } 117 + 118 + n, err := s.PurgeExpiredBypassDIDs(ctx) 119 + if err != nil { 120 + t.Fatalf("PurgeExpiredBypassDIDs: %v", err) 121 + } 122 + if n != 2 { 123 + t.Errorf("evicted = %d, want 2", n) 124 + } 125 + 126 + got, _ := s.ListBypassDIDs(ctx) 127 + if len(got) != 1 || got[0] != "did:plc:keep" { 128 + t.Errorf("post-purge list = %v, want [did:plc:keep]", got) 129 + } 130 + 131 + var expiredAudit int 132 + if err := s.db.QueryRowContext(ctx, 133 + `SELECT COUNT(*) FROM bypass_audit WHERE action = 'remove' AND reason = 'expired'`, 134 + ).Scan(&expiredAudit); err != nil { 135 + t.Fatalf("count expired audit: %v", err) 136 + } 137 + if expiredAudit != 2 { 138 + t.Errorf("expired-audit rows = %d, want 2", expiredAudit) 139 + } 140 + } 141 + 142 + // TestPurgeExpiredBypassDIDs_LeavesLegacyAlone confirms the 143 + // grandfather invariant — even mid-purge, legacy permanent entries 144 + // (expires_at='') remain. 145 + func TestPurgeExpiredBypassDIDs_LeavesLegacyAlone(t *testing.T) { 146 + s := testStore(t) 147 + ctx := context.Background() 148 + 149 + if _, err := s.db.ExecContext(ctx, 150 + `INSERT INTO bypass_dids (did, expires_at, reason, created_at) VALUES (?, '', '', '')`, 151 + "did:plc:legacy", 152 + ); err != nil { 153 + t.Fatalf("seed legacy: %v", err) 154 + } 155 + if _, err := s.PurgeExpiredBypassDIDs(ctx); err != nil { 156 + t.Fatalf("Purge: %v", err) 157 + } 158 + got, _ := s.ListBypassDIDs(ctx) 159 + if len(got) != 1 || got[0] != "did:plc:legacy" { 160 + t.Errorf("legacy permanent entry was purged: got %v", got) 161 + } 162 + }
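For orientation, the schema these tests exercise — reconstructed from the queries above, so treat it as a sketch rather than the shipped migration; column types follow the store's convention of RFC3339 strings in TEXT columns:

// Illustrative DDL only; the real migration lives in the store package.
const bypassSchemaSketch = `
CREATE TABLE IF NOT EXISTS bypass_dids (
    did        TEXT PRIMARY KEY,
    expires_at TEXT NOT NULL DEFAULT '',  -- RFC3339; '' marks a legacy permanent entry
    reason     TEXT NOT NULL DEFAULT '',
    created_at TEXT NOT NULL DEFAULT ''
);
CREATE TABLE IF NOT EXISTS bypass_audit (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    did        TEXT NOT NULL,
    action     TEXT NOT NULL,             -- 'add' or 'remove'
    reason     TEXT NOT NULL DEFAULT '',  -- 'expired' when the purge janitor evicts
    expires_at TEXT NOT NULL DEFAULT '',
    created_at TEXT NOT NULL DEFAULT ''
);
`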
+6 -6
internal/relaystore/inbound_messages.go
··· 92 92 ON inbound_messages(classification, received_at DESC); 93 93 `) 94 94 if err != nil { 95 - return fmt.Errorf("create inbound_messages: %v", err) 95 + return fmt.Errorf("create inbound_messages: %w", err) 96 96 } 97 97 return nil 98 98 } ··· 110 110 m.RawHeaders, m.PayloadSummary, m.Disposition, 111 111 ) 112 112 if err != nil { 113 - return 0, fmt.Errorf("insert inbound message: %v", err) 113 + return 0, fmt.Errorf("insert inbound message: %w", err) 114 114 } 115 115 return res.LastInsertId() 116 116 } ··· 167 167 168 168 rows, err := s.db.QueryContext(ctx, q, args...) 169 169 if err != nil { 170 - return nil, fmt.Errorf("list inbound messages: %v", err) 170 + return nil, fmt.Errorf("list inbound messages: %w", err) 171 171 } 172 172 defer rows.Close() 173 173 ··· 204 204 ) 205 205 } 206 206 if err != nil { 207 - return nil, fmt.Errorf("count inbound by member: %v", err) 207 + return nil, fmt.Errorf("count inbound by member: %w", err) 208 208 } 209 209 defer rows.Close() 210 210 ··· 213 213 var class string 214 214 var count int64 215 215 if err := rows.Scan(&class, &count); err != nil { 216 - return nil, fmt.Errorf("scan inbound count: %v", err) 216 + return nil, fmt.Errorf("scan inbound count: %w", err) 217 217 } 218 218 out[class] = count 219 219 } ··· 232 232 return nil, nil 233 233 } 234 234 if err != nil { 235 - return nil, fmt.Errorf("scan inbound message: %v", err) 235 + return nil, fmt.Errorf("scan inbound message: %w", err) 236 236 } 237 237 m.ReceivedAt = parseTime(receivedAt) 238 238 return &m, nil
+140
internal/relaystore/observability.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relaystore 4 + 5 + import ( 6 + "context" 7 + "strings" 8 + "time" 9 + ) 10 + 11 + // ListBypassAuditForTest exposes the bypass_audit table to tests 12 + // outside the relaystore package (the admin package's bypass tests 13 + // need to assert audit rows). Production code should not call this; 14 + // keep the API uncommitted by suffixing ForTest. 15 + func (s *Store) ListBypassAuditForTest(ctx context.Context, did string) ([]BypassAuditEntry, error) { 16 + rows, err := s.db.QueryContext(ctx, 17 + `SELECT id, did, action, reason, expires_at, created_at 18 + FROM bypass_audit WHERE did = ? ORDER BY id ASC`, did, 19 + ) 20 + if err != nil { 21 + return nil, err 22 + } 23 + defer rows.Close() 24 + var out []BypassAuditEntry 25 + for rows.Next() { 26 + var e BypassAuditEntry 27 + var expStr, createdStr string 28 + if err := rows.Scan(&e.ID, &e.DID, &e.Action, &e.Reason, &expStr, &createdStr); err != nil { 29 + return nil, err 30 + } 31 + if expStr != "" { 32 + if t, err := time.Parse(time.RFC3339Nano, expStr); err == nil { 33 + e.ExpiresAt = t 34 + } 35 + } 36 + if t, err := time.Parse(time.RFC3339Nano, createdStr); err == nil { 37 + e.CreatedAt = t 38 + } 39 + out = append(out, e) 40 + } 41 + return out, rows.Err() 42 + } 43 + 44 + // BusyRecorder is the narrow interface the Store needs to count 45 + // SQLITE_BUSY errors at hot-path writers, without taking a hard 46 + // dependency on Prometheus types in the relaystore package. 47 + // *relay.Metrics implements this; cmd/relay wires it via 48 + // SetBusyRecorder during construction. 49 + type BusyRecorder interface { 50 + IncBusyError(op string) 51 + } 52 + 53 + // SetBusyRecorder installs the busy-error recorder. Calling more 54 + // than once replaces the previous recorder; safe to call once 55 + // during wiring before the Store sees any traffic. 56 + func (s *Store) SetBusyRecorder(r BusyRecorder) { 57 + s.busyRecorder = r 58 + } 59 + 60 + // recordIfBusy is a small helper that callers use to classify a 61 + // freshly-returned error from a sql.DB call. Returns the error 62 + // unchanged so call sites can chain it inline. 63 + func (s *Store) recordIfBusy(op string, err error) error { 64 + if err != nil && s.busyRecorder != nil && IsSQLiteBusy(err) { 65 + s.busyRecorder.IncBusyError(op) 66 + } 67 + return err 68 + } 69 + 70 + // PoolStats is a Store-level snapshot of *sql.DB pool counters 71 + // suitable for emitting as Prometheus gauges. Returned each time 72 + // SampleStats is called so a caller in cmd/relay can drive a 73 + // periodic update loop without exposing *sql.DB outside the 74 + // package. 75 + type PoolStats struct { 76 + OpenConnections int 77 + InUse int 78 + Idle int 79 + WaitCount int64 80 + WaitDurationSecond float64 81 + } 82 + 83 + // SampleStats reads sql.DB.Stats() and converts it into a 84 + // transport-friendly snapshot. Cheap to call (atomic loads under 85 + // the hood); cmd/relay should poll this every ~10s and forward 86 + // the values into the Prometheus gauges defined in 87 + // internal/relay/metrics.go. 88 + // 89 + // Why we expose sql.DB.Stats() rather than wrapping every Exec / 90 + // Query call site: the relay has 90+ DB call sites across the 91 + // store package, and SQLITE_BUSY errors that escape the 5s 92 + // busy_timeout PRAGMA are rare. 
The pool stats are a near-perfect 93 + // proxy: WaitCount climbing means contention is brewing, even 94 + // before any error escapes; InUse near MaxOpenConns means the 95 + // next caller will wait. Combined with BusyErrorClassify on the 96 + // hot writers, this gives operators a complete picture without 97 + // touching every callsite. Closes #210. 98 + func (s *Store) SampleStats() PoolStats { 99 + st := s.db.Stats() 100 + return PoolStats{ 101 + OpenConnections: st.OpenConnections, 102 + InUse: st.InUse, 103 + Idle: st.Idle, 104 + WaitCount: st.WaitCount, 105 + WaitDurationSecond: st.WaitDuration.Seconds(), 106 + } 107 + } 108 + 109 + // IsSQLiteBusy reports whether an error returned from modernc/sqlite 110 + // is a SQLITE_BUSY or locked condition. modernc does NOT export a 111 + // typed sentinel for these (the official driver wraps them as 112 + // *sqlite.Error but the value is unexported), so we fall back to 113 + // a robust substring match on the well-known reason strings. 114 + // 115 + // Used by store-level helpers to increment metrics.SQLiteBusyErrors 116 + // at the hot-path writers (InsertMessage, UpdateMessageStatus, 117 + // IncrementSendCount, RecordRateCount). False positives are 118 + // effectively impossible: these phrases are reserved by SQLite for 119 + // busy/locked conditions and don't appear in unrelated driver errors. 120 + func IsSQLiteBusy(err error) bool { 121 + if err == nil { 122 + return false 123 + } 124 + s := strings.ToLower(err.Error()) 125 + switch { 126 + case strings.Contains(s, "database is locked"): 127 + return true 128 + case strings.Contains(s, "database table is locked"): 129 + return true 130 + case strings.Contains(s, "sqlite_busy"): 131 + return true 132 + case strings.Contains(s, "(5)"): 133 + // modernc surfaces SQLITE_BUSY = 5 with a "(5)" suffix. 134 + // Bare "(5)" matches too eagerly on its own; require the 135 + // "locked"/"busy" keyword adjacent to avoid false positives 136 + // against e.g. constraint codes. 137 + return strings.Contains(s, "busy") || strings.Contains(s, "locked") 138 + } 139 + return false 140 + }
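The sampler loop the SampleStats comment calls for, sketched as it might appear in cmd/relay — the gauge names are illustrative; only SampleStats and its fields come from the change above:

go func() {
	t := time.NewTicker(10 * time.Second)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			ps := store.SampleStats()
			metrics.SQLitePoolOpen.Set(float64(ps.OpenConnections)) // assumed gauge names
			metrics.SQLitePoolInUse.Set(float64(ps.InUse))
			metrics.SQLitePoolIdle.Set(float64(ps.Idle))
			metrics.SQLitePoolWaitCount.Set(float64(ps.WaitCount))
			metrics.SQLitePoolWaitSeconds.Set(ps.WaitDurationSecond)
		}
	}
}()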
+151
internal/relaystore/observability_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relaystore 4 + 5 + import ( 6 + "context" 7 + "errors" 8 + "sync" 9 + "sync/atomic" 10 + "testing" 11 + "time" 12 + ) 13 + 14 + func TestIsSQLiteBusy(t *testing.T) { 15 + cases := []struct { 16 + name string 17 + err error 18 + want bool 19 + }{ 20 + {"nil", nil, false}, 21 + {"unrelated", errors.New("foreign key constraint failed"), false}, 22 + {"locked plain", errors.New("database is locked"), true}, 23 + {"locked uppercase", errors.New("Database Is Locked"), true}, 24 + {"table locked", errors.New("database table is locked"), true}, 25 + {"sqlite_busy", errors.New("SQLITE_BUSY: cannot start a transaction"), true}, 26 + {"modernc paren-5 + busy", errors.New("database busy (5)"), true}, 27 + {"modernc paren-5 + locked", errors.New("locked (5)"), true}, 28 + {"paren-5 alone is NOT busy", errors.New("constraint code (5)"), false}, 29 + {"wrapped", errors.New("insert message: database is locked"), true}, 30 + } 31 + for _, tc := range cases { 32 + t.Run(tc.name, func(t *testing.T) { 33 + if got := IsSQLiteBusy(tc.err); got != tc.want { 34 + t.Errorf("IsSQLiteBusy(%q) = %v, want %v", tc.err, got, tc.want) 35 + } 36 + }) 37 + } 38 + } 39 + 40 + // stubBusyRecorder counts IncBusyError calls so tests can verify 41 + // the store wiring without dragging Prometheus types into the test. 42 + type stubBusyRecorder struct { 43 + mu sync.Mutex 44 + calls map[string]int 45 + } 46 + 47 + func newStubBusyRecorder() *stubBusyRecorder { 48 + return &stubBusyRecorder{calls: map[string]int{}} 49 + } 50 + 51 + func (s *stubBusyRecorder) IncBusyError(op string) { 52 + s.mu.Lock() 53 + s.calls[op]++ 54 + s.mu.Unlock() 55 + } 56 + 57 + func (s *stubBusyRecorder) count(op string) int { 58 + s.mu.Lock() 59 + defer s.mu.Unlock() 60 + return s.calls[op] 61 + } 62 + 63 + // TestSampleStats_ReturnsZeroOnFreshStore confirms the cheap-path 64 + // invariant: SampleStats is safe to call before any traffic and 65 + // returns sane zero-ish values rather than panicking on an 66 + // uninitialized pool. 67 + func TestSampleStats_ReturnsZeroOnFreshStore(t *testing.T) { 68 + s := testStore(t) 69 + ps := s.SampleStats() 70 + // OpenConnections may be 0 or 1 depending on whether testStore 71 + // pre-pinged. Just assert the shape didn't panic + returns 72 + // non-negative values. 73 + if ps.OpenConnections < 0 || ps.InUse < 0 || ps.Idle < 0 || ps.WaitCount < 0 { 74 + t.Fatalf("negative stats: %+v", ps) 75 + } 76 + } 77 + 78 + // TestSampleStats_TracksInUse drives a transaction that holds a 79 + // connection and verifies SampleStats observes InUse > 0 while it 80 + // runs. Verifies the gauge has any signal at all (not just hard- 81 + // zero) when contention is occurring. 82 + func TestSampleStats_TracksInUse(t *testing.T) { 83 + s := testStore(t) 84 + // Start a long query in a goroutine; hold it until the test 85 + // has had a chance to sample. 86 + release := make(chan struct{}) 87 + started := make(chan struct{}) 88 + var counted atomic.Int32 89 + go func() { 90 + // modernc/sqlite supports BEGIN IMMEDIATE; hold it. 91 + conn, err := s.db.Conn(context.Background()) 92 + if err != nil { 93 + t.Errorf("get conn: %v", err) 94 + return 95 + } 96 + defer conn.Close() 97 + close(started) 98 + <-release 99 + counted.Store(1) 100 + }() 101 + <-started 102 + defer close(release) 103 + 104 + // Allow the sql.DB pool to register the open connection. 
105 + deadline := time.Now().Add(time.Second) 106 + for time.Now().Before(deadline) { 107 + if s.SampleStats().InUse >= 1 { 108 + return 109 + } 110 + time.Sleep(5 * time.Millisecond) 111 + } 112 + t.Errorf("SampleStats().InUse never reached >= 1; final: %+v", s.SampleStats()) 113 + } 114 + 115 + // TestStore_BusyRecorder_OptionalNilSafe confirms callers can use 116 + // the store without ever wiring a recorder. The recordIfBusy path 117 + // must short-circuit on nil rather than panicking. 118 + func TestStore_BusyRecorder_OptionalNilSafe(t *testing.T) { 119 + s := testStore(t) 120 + // No SetBusyRecorder call. 121 + out := s.recordIfBusy("any", errors.New("database is locked")) 122 + if out == nil || out.Error() != "database is locked" { 123 + t.Errorf("recordIfBusy nil-recorder returned %v, want pass-through", out) 124 + } 125 + } 126 + 127 + // TestStore_RecordIfBusy_WiresClassifier confirms recordIfBusy 128 + // forwards busy errors and ignores non-busy errors. 129 + func TestStore_RecordIfBusy_WiresClassifier(t *testing.T) { 130 + s := testStore(t) 131 + rec := newStubBusyRecorder() 132 + s.SetBusyRecorder(rec) 133 + 134 + // Busy: should count. 135 + s.recordIfBusy("op1", errors.New("database is locked")) 136 + if got := rec.count("op1"); got != 1 { 137 + t.Errorf("op1 count = %d, want 1", got) 138 + } 139 + 140 + // Non-busy: should NOT count. 141 + s.recordIfBusy("op2", errors.New("constraint failure")) 142 + if got := rec.count("op2"); got != 0 { 143 + t.Errorf("op2 count = %d, want 0 (non-busy err shouldn't increment)", got) 144 + } 145 + 146 + // Nil error: should NOT count. 147 + s.recordIfBusy("op3", nil) 148 + if got := rec.count("op3"); got != 0 { 149 + t.Errorf("op3 count = %d, want 0 (nil err shouldn't increment)", got) 150 + } 151 + }
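For completeness, the relay-side half of the BusyRecorder contract plausibly looks like the sketch below — it assumes metrics.SQLiteBusyErrors is a Prometheus CounterVec labeled by op, which is not shown in this change:

func (m *Metrics) IncBusyError(op string) {
	// Forward the store's classification into the counter; the op label
	// names the hot-path writer (InsertMessage, UpdateMessageStatus, …).
	m.SQLiteBusyErrors.WithLabelValues(op).Inc()
}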
+126
internal/relaystore/orphan_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relaystore 4 + 5 + import ( 6 + "context" 7 + "errors" 8 + "testing" 9 + "time" 10 + ) 11 + 12 + // TestUpdateMessageStatus_ReturnsErrMessageNotFound pins the orphan 13 + // detection contract: updating a non-existent ID surfaces a typed 14 + // error so the delivery callback can increment the orphan metric 15 + // instead of silently dropping the update (#208). 16 + func TestUpdateMessageStatus_ReturnsErrMessageNotFound(t *testing.T) { 17 + s := testStore(t) 18 + err := s.UpdateMessageStatus(context.Background(), 999_999, MsgSent, 250) 19 + if !errors.Is(err, ErrMessageNotFound) { 20 + t.Fatalf("expected ErrMessageNotFound, got %v", err) 21 + } 22 + } 23 + 24 + // TestUpdateMessageStatus_ExistingRowReturnsNil — happy path, the 25 + // orphan-detecting code must not break the normal delivery callback. 26 + func TestUpdateMessageStatus_ExistingRowReturnsNil(t *testing.T) { 27 + s := testStore(t) 28 + ctx := context.Background() 29 + if err := s.InsertMember(ctx, &Member{ 30 + DID: "did:plc:orphan", 31 + Status: StatusActive, 32 + HourlyLimit: 100, DailyLimit: 1000, 33 + CreatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(), 34 + }); err != nil { 35 + t.Fatal(err) 36 + } 37 + id, err := s.InsertMessage(ctx, &Message{ 38 + MemberDID: "did:plc:orphan", FromAddr: "a@b", ToAddr: "c@d", 39 + MessageID: "<x@y>", Status: MsgQueued, CreatedAt: time.Now().UTC(), 40 + }) 41 + if err != nil { 42 + t.Fatalf("InsertMessage: %v", err) 43 + } 44 + if err := s.UpdateMessageStatus(ctx, id, MsgSent, 250); err != nil { 45 + t.Errorf("update existing row: %v", err) 46 + } 47 + } 48 + 49 + // TestListQueuedMessageIDsOlderThan_FiltersByAgeAndStatus is the 50 + // janitor's contract: returns only rows that are status=queued AND 51 + // older than the cutoff. Excludes recent rows (would race with 52 + // just-Enqueued messages whose spool file is still landing) and 53 + // excludes terminal-state rows. 
54 + func TestListQueuedMessageIDsOlderThan_FiltersByAgeAndStatus(t *testing.T) { 55 + s := testStore(t) 56 + ctx := context.Background() 57 + if err := s.InsertMember(ctx, &Member{ 58 + DID: "did:plc:janitortest", Status: StatusActive, HourlyLimit: 100, DailyLimit: 1000, 59 + CreatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(), 60 + }); err != nil { 61 + t.Fatal(err) 62 + } 63 + 64 + now := time.Now().UTC() 65 + mk := func(status string, age time.Duration) int64 { 66 + t.Helper() 67 + id, err := s.InsertMessage(ctx, &Message{ 68 + MemberDID: "did:plc:janitortest", FromAddr: "a@b", ToAddr: "c@d", 69 + MessageID: "<id@x>", Status: status, CreatedAt: now.Add(-age), 70 + }) 71 + if err != nil { 72 + t.Fatal(err) 73 + } 74 + return id 75 + } 76 + 77 + oldQueued := mk(MsgQueued, 10*time.Minute) // should appear 78 + mk(MsgQueued, 30*time.Second) // too recent — should NOT appear 79 + mk(MsgSent, 1*time.Hour) // wrong status — should NOT appear 80 + mk(MsgBounced, 1*time.Hour) // wrong status — should NOT appear 81 + oldQueued2 := mk(MsgQueued, 1*time.Hour) // should appear 82 + 83 + ids, err := s.ListQueuedMessageIDsOlderThan(ctx, 5*time.Minute, 100) 84 + if err != nil { 85 + t.Fatalf("ListQueuedMessageIDsOlderThan: %v", err) 86 + } 87 + got := map[int64]bool{} 88 + for _, id := range ids { 89 + got[id] = true 90 + } 91 + if !got[oldQueued] || !got[oldQueued2] { 92 + t.Errorf("missing expected ids; got=%v want both %d and %d", ids, oldQueued, oldQueued2) 93 + } 94 + if len(ids) != 2 { 95 + t.Errorf("returned %d ids, want exactly 2 — recent / non-queued rows leaked through", len(ids)) 96 + } 97 + } 98 + 99 + // TestListQueuedMessageIDsOlderThan_RespectsLimit confirms the limit 100 + // is honored so the janitor can bound its work per pass. 101 + func TestListQueuedMessageIDsOlderThan_RespectsLimit(t *testing.T) { 102 + s := testStore(t) 103 + ctx := context.Background() 104 + if err := s.InsertMember(ctx, &Member{ 105 + DID: "did:plc:janitorlimit", Status: StatusActive, HourlyLimit: 100, DailyLimit: 1000, 106 + CreatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(), 107 + }); err != nil { 108 + t.Fatal(err) 109 + } 110 + old := time.Now().UTC().Add(-1 * time.Hour) 111 + for i := 0; i < 7; i++ { 112 + if _, err := s.InsertMessage(ctx, &Message{ 113 + MemberDID: "did:plc:janitorlimit", FromAddr: "a@b", ToAddr: "c@d", 114 + MessageID: "<id@x>", Status: MsgQueued, CreatedAt: old, 115 + }); err != nil { 116 + t.Fatal(err) 117 + } 118 + } 119 + ids, err := s.ListQueuedMessageIDsOlderThan(ctx, 5*time.Minute, 3) 120 + if err != nil { 121 + t.Fatalf("ListQueuedMessageIDsOlderThan: %v", err) 122 + } 123 + if len(ids) != 3 { 124 + t.Errorf("returned %d ids, want 3 (limit)", len(ids)) 125 + } 126 + }
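Putting the spool and store halves together, the janitor pass described in the Spool.Exists comment might look like the sketch below. Only ListQueuedMessageIDsOlderThan, UpdateMessageStatus, and Spool.Exists come from this change; the MsgFailed status constant and the cutoff/limit values are illustrative:

func reconcileOrphans(ctx context.Context, store *relaystore.Store, spool *relay.Spool) {
	ids, err := store.ListQueuedMessageIDsOlderThan(ctx, 5*time.Minute, 100)
	if err != nil {
		log.Printf("orphan.janitor: list: %v", err)
		return
	}
	for _, id := range ids {
		if spool.Exists(id) {
			continue // spool file present: genuinely queued, leave it alone
		}
		// A queued row with no spool file is the dropped-Enqueue signature.
		if err := store.UpdateMessageStatus(ctx, id, relaystore.MsgFailed, 0); err != nil {
			log.Printf("orphan.janitor: mark %d failed: %v", id, err)
			continue
		}
		log.Printf("orphan.janitor: reconciled orphan message id=%d", id)
	}
}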
+11 -11
internal/relaystore/pending_notifications.go
··· 74 74 WHERE dead_letter = 0; 75 75 `) 76 76 if err != nil { 77 - return fmt.Errorf("create pending_notifications: %v", err) 77 + return fmt.Errorf("create pending_notifications: %w", err) 78 78 } 79 79 return nil 80 80 } ··· 102 102 } 103 103 payloadBytes, err := json.Marshal(payload) 104 104 if err != nil { 105 - return 0, fmt.Errorf("enqueue notification: marshal payload: %v", err) 105 + return 0, fmt.Errorf("enqueue notification: marshal payload: %w", err) 106 106 } 107 107 108 108 now := time.Now().UTC().Unix() ··· 112 112 kind, recipient, string(payloadBytes), now, now, 113 113 ) 114 114 if err != nil { 115 - return 0, fmt.Errorf("enqueue notification: insert: %v", err) 115 + return 0, fmt.Errorf("enqueue notification: insert: %w", err) 116 116 } 117 117 return res.LastInsertId() 118 118 } ··· 140 140 now, limit, 141 141 ) 142 142 if err != nil { 143 - return nil, fmt.Errorf("claim notifications: %v", err) 143 + return nil, fmt.Errorf("claim notifications: %w", err) 144 144 } 145 145 defer rows.Close() 146 146 ··· 150 150 var payloadStr string 151 151 var createdAt, nextAttemptAt int64 152 152 if err := rows.Scan(&n.ID, &n.Kind, &n.Recipient, &payloadStr, &n.Attempts, &n.LastError, &createdAt, &nextAttemptAt); err != nil { 153 - return nil, fmt.Errorf("scan notification: %v", err) 153 + return nil, fmt.Errorf("scan notification: %w", err) 154 154 } 155 155 n.CreatedAt = time.Unix(createdAt, 0).UTC() 156 156 n.NextAttemptAt = time.Unix(nextAttemptAt, 0).UTC() ··· 182 182 if success { 183 183 _, err := s.db.ExecContext(ctx, `DELETE FROM pending_notifications WHERE id = ?`, id) 184 184 if err != nil { 185 - return fmt.Errorf("delete delivered notification: %v", err) 185 + return fmt.Errorf("delete delivered notification: %w", err) 186 186 } 187 187 return nil 188 188 } ··· 202 202 lastError, next, MaxNotificationAttempts, id, 203 203 ) 204 204 if err != nil { 205 - return fmt.Errorf("mark notification attempted: %v", err) 205 + return fmt.Errorf("mark notification attempted: %w", err) 206 206 } 207 207 return nil 208 208 } ··· 220 220 ORDER BY next_attempt_at DESC`, 221 221 ) 222 222 if err != nil { 223 - return nil, fmt.Errorf("list dead letter: %v", err) 223 + return nil, fmt.Errorf("list dead letter: %w", err) 224 224 } 225 225 defer rows.Close() 226 226 ··· 230 230 var payloadStr string 231 231 var createdAt, nextAttemptAt int64 232 232 if err := rows.Scan(&n.ID, &n.Kind, &n.Recipient, &payloadStr, &n.Attempts, &n.LastError, &createdAt, &nextAttemptAt); err != nil { 233 - return nil, fmt.Errorf("scan dead letter: %v", err) 233 + return nil, fmt.Errorf("scan dead letter: %w", err) 234 234 } 235 235 n.CreatedAt = time.Unix(createdAt, 0).UTC() 236 236 n.NextAttemptAt = time.Unix(nextAttemptAt, 0).UTC() ··· 261 261 return nil, nil 262 262 } 263 263 if err != nil { 264 - return nil, fmt.Errorf("get notification: %v", err) 264 + return nil, fmt.Errorf("get notification: %w", err) 265 265 } 266 266 n.CreatedAt = time.Unix(createdAt, 0).UTC() 267 267 n.NextAttemptAt = time.Unix(nextAttemptAt, 0).UTC() ··· 286 286 return false, nil 287 287 } 288 288 if err != nil { 289 - return false, fmt.Errorf("is dead letter: %v", err) 289 + return false, fmt.Errorf("is dead letter: %w", err) 290 290 } 291 291 return flag == 1, nil 292 292 }
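
The only change in this file is the error wrapping (`%v` to `%w`). A minimal, self-contained illustration of what that buys callers; `sql.ErrNoRows` here is purely an example target for `errors.Is`, not a claim about which errors these functions return:

```go
package main

import (
	"database/sql"
	"errors"
	"fmt"
)

// claim stands in for any store method in the hunk above: the real
// error gets wrapped with context, exactly as the %w edits now do.
func claim() error {
	return fmt.Errorf("claim notifications: %w", sql.ErrNoRows)
}

func main() {
	err := claim()
	// With %w the caller still detects the underlying condition;
	// with the old %v this would print "matched: false".
	fmt.Println("matched:", errors.Is(err, sql.ErrNoRows))
}
```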
+13 -13
internal/relaystore/relay_events.go
··· 90 90 CREATE INDEX IF NOT EXISTS idx_relay_events_event_timestamp ON relay_events(event_timestamp DESC); 91 91 `) 92 92 if err != nil { 93 - return fmt.Errorf("create relay_events: %v", err) 93 + return fmt.Errorf("create relay_events: %w", err) 94 94 } 95 95 96 96 // content_fingerprint added after the table shipped — ADD COLUMN is ··· 101 101 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('relay_events') WHERE name = 'content_fingerprint'`).Scan(&hasFingerprint) 102 102 if hasFingerprint == 0 { 103 103 if _, err := s.db.Exec(`ALTER TABLE relay_events ADD COLUMN content_fingerprint TEXT NOT NULL DEFAULT ''`); err != nil { 104 - return fmt.Errorf("add content_fingerprint column: %v", err) 104 + return fmt.Errorf("add content_fingerprint column: %w", err) 105 105 } 106 106 } 107 107 // Index the fingerprint for the primary read pattern: "show me every ··· 110 110 // empty fingerprints (non-relay_attempt events) and we don't want them 111 111 // bloating the index. 112 112 if _, err := s.db.Exec(`CREATE INDEX IF NOT EXISTS idx_relay_events_fingerprint ON relay_events(content_fingerprint, event_timestamp DESC) WHERE content_fingerprint != ''`); err != nil { 113 - return fmt.Errorf("create fingerprint index: %v", err) 113 + return fmt.Errorf("create fingerprint index: %w", err) 114 114 } 115 115 return nil 116 116 } ··· 121 121 func (s *Store) InsertRelayEvent(ctx context.Context, e *RelayEvent) error { 122 122 verdictsJSON, err := json.Marshal(defaultStrings(e.Verdicts)) 123 123 if err != nil { 124 - return fmt.Errorf("marshal verdicts: %v", err) 124 + return fmt.Errorf("marshal verdicts: %w", err) 125 125 } 126 126 labelsJSON, err := json.Marshal(defaultStrings(e.LabelsApplied)) 127 127 if err != nil { 128 - return fmt.Errorf("marshal labels_applied: %v", err) 128 + return fmt.Errorf("marshal labels_applied: %w", err) 129 129 } 130 130 131 131 var smtpCode any ··· 146 146 string(verdictsJSON), string(labelsJSON), e.Raw, 147 147 ) 148 148 if err != nil { 149 - return fmt.Errorf("insert relay event: %v", err) 149 + return fmt.Errorf("insert relay event: %w", err) 150 150 } 151 151 return nil 152 152 } ··· 203 203 204 204 rows, err := s.db.QueryContext(ctx, q, args...) 
205 205 if err != nil { 206 - return nil, fmt.Errorf("list relay events: %v", err) 206 + return nil, fmt.Errorf("list relay events: %w", err) 207 207 } 208 208 defer rows.Close() 209 209 ··· 226 226 `SELECT MAX(kafka_offset) FROM relay_events`, 227 227 ).Scan(&offset) 228 228 if err != nil { 229 - return -1, fmt.Errorf("last kafka offset: %v", err) 229 + return -1, fmt.Errorf("last kafka offset: %w", err) 230 230 } 231 231 if !offset.Valid { 232 232 return -1, nil ··· 244 244 formatTime(since), 245 245 ) 246 246 if err != nil { 247 - return nil, fmt.Errorf("count events by action: %v", err) 247 + return nil, fmt.Errorf("count events by action: %w", err) 248 248 } 249 249 defer rows.Close() 250 250 ··· 253 253 var name string 254 254 var count int64 255 255 if err := rows.Scan(&name, &count); err != nil { 256 - return nil, fmt.Errorf("scan action count: %v", err) 256 + return nil, fmt.Errorf("scan action count: %w", err) 257 257 } 258 258 out[name] = count 259 259 } ··· 279 279 formatTime(since), 280 280 ) 281 281 if err != nil { 282 - return nil, fmt.Errorf("count labels applied: %v", err) 282 + return nil, fmt.Errorf("count labels applied: %w", err) 283 283 } 284 284 defer rows.Close() 285 285 ··· 287 287 for rows.Next() { 288 288 var raw string 289 289 if err := rows.Scan(&raw); err != nil { 290 - return nil, fmt.Errorf("scan labels_applied: %v", err) 290 + return nil, fmt.Errorf("scan labels_applied: %w", err) 291 291 } 292 292 var labels []string 293 293 if err := json.Unmarshal([]byte(raw), &labels); err != nil { ··· 331 331 return nil, nil 332 332 } 333 333 if err != nil { 334 - return nil, fmt.Errorf("scan relay event: %v", err) 334 + return nil, fmt.Errorf("scan relay event: %w", err) 335 335 } 336 336 e.IngestedAt = parseTime(ingestedAt) 337 337 e.EventTimestamp = parseTime(eventTimestamp)
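
The migration comment above names the read pattern the partial fingerprint index serves. Below is a hypothetical helper (not part of this diff) showing that query shape against the two indexed columns; it assumes it would live in package relaystore alongside the existing `parseTime` helper and `s.db` handle, and touches no other columns of `relay_events`.

```go
// relayEventTimesByFingerprint returns event timestamps for one
// fingerprint, newest first: equality on content_fingerprint plus
// ORDER BY event_timestamp DESC is exactly what the partial index
// (WHERE content_fingerprint != '') covers.
func (s *Store) relayEventTimesByFingerprint(ctx context.Context, fp string, limit int) ([]time.Time, error) {
	if fp == "" {
		return nil, nil // empty fingerprints are deliberately excluded from the index
	}
	rows, err := s.db.QueryContext(ctx,
		`SELECT event_timestamp FROM relay_events
		 WHERE content_fingerprint = ?
		 ORDER BY event_timestamp DESC
		 LIMIT ?`,
		fp, limit,
	)
	if err != nil {
		return nil, fmt.Errorf("events by fingerprint: %w", err)
	}
	defer rows.Close()
	var out []time.Time
	for rows.Next() {
		var ts string
		if err := rows.Scan(&ts); err != nil {
			return nil, fmt.Errorf("scan event_timestamp: %w", err)
		}
		out = append(out, parseTime(ts))
	}
	return out, rows.Err()
}
```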
+98
internal/relaystore/schema_version.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relaystore 4 + 5 + import ( 6 + "database/sql" 7 + "errors" 8 + "fmt" 9 + "time" 10 + ) 11 + 12 + // CurrentSchemaVersion is the schema version this binary expects. 13 + // 14 + // Bump this whenever migrate() learns to apply a new structural change 15 + // (CREATE TABLE / ALTER TABLE / new index / migration of existing data). 16 + // The store records the bumped version at the end of a successful 17 + // migrate() run, and refuses to start the next time around if a *newer* 18 + // version has since been recorded by some later binary. 19 + // 20 + // Version history (kept here so the diff that bumps the constant carries 21 + // the rationale; the schema_version table also records `description`): 22 + // 23 + // 1 — baseline: everything that migrate() builds today (multi-domain, 24 + // messages, rate counters, suppressions, member_domains, attestation 25 + // flags, content fingerprints, relay events, bypass audit, etc.). 26 + // Every existing deployment lands here on first start. 27 + const CurrentSchemaVersion = 1 28 + 29 + // ErrSchemaTooNew is returned by EnsureSchemaVersion when the database 30 + // has been written to by a newer binary than the one starting up. The 31 + // safe behavior is to refuse to start: an old binary missing knowledge 32 + // of newer columns or tables would otherwise silently INSERT defaults 33 + // and corrupt the data the newer binary persisted. 34 + var ErrSchemaTooNew = errors.New("database schema version is newer than this binary supports — refusing to start") 35 + 36 + // EnsureSchemaVersion creates the schema_version tracking table if 37 + // missing and verifies the database isn't ahead of the binary. 38 + // 39 + // Returns ErrSchemaTooNew (wrapped) when MAX(version) > current. The 40 + // returned dbVersion is the highest version recorded in the table 41 + // (zero on a fresh database). It's exposed so the caller can decide 42 + // whether to skip work that's already been applied. 43 + func (s *Store) EnsureSchemaVersion() (dbVersion int, err error) { 44 + if _, err := s.db.Exec(` 45 + CREATE TABLE IF NOT EXISTS schema_version ( 46 + version INTEGER PRIMARY KEY, 47 + description TEXT NOT NULL DEFAULT '', 48 + applied_at TEXT NOT NULL, 49 + binary_marker TEXT NOT NULL DEFAULT '' 50 + ) 51 + `); err != nil { 52 + return 0, fmt.Errorf("create schema_version: %w", err) 53 + } 54 + 55 + var maxVersion sql.NullInt64 56 + if err := s.db.QueryRow(`SELECT MAX(version) FROM schema_version`).Scan(&maxVersion); err != nil { 57 + return 0, fmt.Errorf("read schema_version: %w", err) 58 + } 59 + if !maxVersion.Valid { 60 + return 0, nil 61 + } 62 + dbVersion = int(maxVersion.Int64) 63 + if dbVersion > CurrentSchemaVersion { 64 + return dbVersion, fmt.Errorf("%w (db=%d, binary=%d)", ErrSchemaTooNew, dbVersion, CurrentSchemaVersion) 65 + } 66 + return dbVersion, nil 67 + } 68 + 69 + // RecordSchemaVersion writes a row marking the current binary's schema 70 + // version as applied. Idempotent — INSERT OR IGNORE keeps the original 71 + // applied_at for unchanged versions so deployment history is retained. 
72 + func (s *Store) RecordSchemaVersion(description, binaryMarker string) error { 73 + _, err := s.db.Exec( 74 + `INSERT OR IGNORE INTO schema_version (version, description, applied_at, binary_marker) 75 + VALUES (?, ?, ?, ?)`, 76 + CurrentSchemaVersion, 77 + description, 78 + time.Now().UTC().Format(time.RFC3339), 79 + binaryMarker, 80 + ) 81 + if err != nil { 82 + return fmt.Errorf("record schema_version: %w", err) 83 + } 84 + return nil 85 + } 86 + 87 + // SchemaVersion returns the highest version recorded in the database. 88 + // Zero on a fresh database. 89 + func (s *Store) SchemaVersion() (int, error) { 90 + var v sql.NullInt64 91 + if err := s.db.QueryRow(`SELECT MAX(version) FROM schema_version`).Scan(&v); err != nil { 92 + return 0, fmt.Errorf("query schema_version: %w", err) 93 + } 94 + if !v.Valid { 95 + return 0, nil 96 + } 97 + return int(v.Int64), nil 98 + }
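
A sketch of the operator-facing refusal at startup. The DSN and import path are placeholders; the point is that `relaystore.New` wraps the migrate error with `%w`, so the rollback case stays matchable with `errors.Is` and can get its own actionable message:

```go
package main

import (
	"errors"
	"log"

	"example.invalid/relay/internal/relaystore" // import path assumed
)

func main() {
	// The relay's real main wires config, metrics, and listeners; this
	// sketch only shows the refusal path for a rolled-back binary.
	store, err := relaystore.New("file:state/relay.sqlite?_journal=WAL")
	if err != nil {
		if errors.Is(err, relaystore.ErrSchemaTooNew) {
			log.Fatalf("refusing to start: %v; redeploy the newer relay binary or restore the pre-upgrade database backup", err)
		}
		log.Fatalf("open store: %v", err)
	}
	defer store.Close()
	// ... rest of startup ...
}
```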
+116
internal/relaystore/schema_version_test.go
··· 1 + // SPDX-License-Identifier: AGPL-3.0-or-later 2 + 3 + package relaystore 4 + 5 + import ( 6 + "errors" 7 + "path/filepath" 8 + "testing" 9 + ) 10 + 11 + func newTempStore(t *testing.T) *Store { 12 + t.Helper() 13 + dsn := "file:" + filepath.Join(t.TempDir(), "test.db") + "?_journal=WAL" 14 + s, err := New(dsn) 15 + if err != nil { 16 + t.Fatalf("New: %v", err) 17 + } 18 + t.Cleanup(func() { s.Close() }) 19 + return s 20 + } 21 + 22 + func TestSchemaVersion_FreshDBRecordsCurrent(t *testing.T) { 23 + s := newTempStore(t) 24 + v, err := s.SchemaVersion() 25 + if err != nil { 26 + t.Fatalf("SchemaVersion: %v", err) 27 + } 28 + if v != CurrentSchemaVersion { 29 + t.Errorf("fresh DB schema_version = %d, want %d", v, CurrentSchemaVersion) 30 + } 31 + } 32 + 33 + func TestSchemaVersion_ReopenIsIdempotent(t *testing.T) { 34 + // Reopening a DB that's already at the current version must NOT 35 + // produce a duplicate row or error. 36 + dir := t.TempDir() 37 + dsn := "file:" + filepath.Join(dir, "test.db") + "?_journal=WAL" 38 + 39 + s1, err := New(dsn) 40 + if err != nil { 41 + t.Fatalf("New (1): %v", err) 42 + } 43 + s1.Close() 44 + 45 + s2, err := New(dsn) 46 + if err != nil { 47 + t.Fatalf("New (2): %v", err) 48 + } 49 + defer s2.Close() 50 + 51 + var rows int 52 + if err := s2.db.QueryRow(`SELECT COUNT(*) FROM schema_version WHERE version = ?`, CurrentSchemaVersion).Scan(&rows); err != nil { 53 + t.Fatalf("count: %v", err) 54 + } 55 + if rows != 1 { 56 + t.Errorf("schema_version row count = %d, want 1 (INSERT OR IGNORE should dedupe)", rows) 57 + } 58 + } 59 + 60 + func TestSchemaVersion_RefusesNewerDB(t *testing.T) { 61 + // Simulate "rollback to old binary": pre-populate schema_version 62 + // with a version higher than CurrentSchemaVersion, then reopen. 63 + dir := t.TempDir() 64 + dsn := "file:" + filepath.Join(dir, "test.db") + "?_journal=WAL" 65 + 66 + s1, err := New(dsn) 67 + if err != nil { 68 + t.Fatalf("New (1): %v", err) 69 + } 70 + if _, err := s1.db.Exec( 71 + `INSERT INTO schema_version (version, description, applied_at, binary_marker) 72 + VALUES (?, 'future', '2099-01-01T00:00:00Z', 'future-binary')`, 73 + CurrentSchemaVersion+5, 74 + ); err != nil { 75 + t.Fatalf("inject future version: %v", err) 76 + } 77 + s1.Close() 78 + 79 + _, err = New(dsn) 80 + if err == nil { 81 + t.Fatal("expected New to refuse a newer DB, got nil error") 82 + } 83 + if !errors.Is(err, ErrSchemaTooNew) { 84 + t.Errorf("error chain missing ErrSchemaTooNew: %v", err) 85 + } 86 + } 87 + 88 + func TestSchemaVersion_TableExistsAfterMigrate(t *testing.T) { 89 + s := newTempStore(t) 90 + var count int 91 + if err := s.db.QueryRow( 92 + `SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='schema_version'`, 93 + ).Scan(&count); err != nil { 94 + t.Fatalf("query sqlite_master: %v", err) 95 + } 96 + if count != 1 { 97 + t.Errorf("schema_version table not created: count=%d", count) 98 + } 99 + } 100 + 101 + func TestSchemaVersion_RecordIsIdempotentExplicit(t *testing.T) { 102 + // Direct unit test on RecordSchemaVersion to pin INSERT OR IGNORE. 
103 + s := newTempStore(t) 104 + for i := 0; i < 5; i++ { 105 + if err := s.RecordSchemaVersion("retry", "marker-v2"); err != nil { 106 + t.Fatalf("RecordSchemaVersion (%d): %v", i, err) 107 + } 108 + } 109 + var rows int 110 + if err := s.db.QueryRow(`SELECT COUNT(*) FROM schema_version`).Scan(&rows); err != nil { 111 + t.Fatalf("count: %v", err) 112 + } 113 + if rows != 1 { 114 + t.Errorf("expected exactly 1 row after 6 calls, got %d", rows) 115 + } 116 + }
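
For context on the workflow the `CurrentSchemaVersion` doc comment describes, a hypothetical sketch of what a future bump to version 2 would look like; nothing below is in this change, and the added column and description string are invented purely for illustration:

```go
// Hypothetical future diff: bump the constant, gate the new DDL on the
// version EnsureSchemaVersion reports, and record the new version.
const CurrentSchemaVersion = 2 // was 1

func (s *Store) migrate() error {
	dbVersion, err := s.EnsureSchemaVersion()
	if err != nil {
		return err // includes the ErrSchemaTooNew refusal
	}

	// ... all of today's version-1 DDL stays as-is (IF NOT EXISTS) ...

	// New in v2; skipped on databases that already recorded version 2.
	if dbVersion < 2 {
		if _, err := s.db.Exec(`ALTER TABLE messages ADD COLUMN example_col TEXT NOT NULL DEFAULT ''`); err != nil {
			return fmt.Errorf("add messages.example_col: %w", err)
		}
	}

	return s.RecordSchemaVersion("v2: add messages.example_col", "")
}
```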
+433 -117
internal/relaystore/store.go
··· 5 5 import ( 6 6 "context" 7 7 "database/sql" 8 + "errors" 8 9 "fmt" 9 10 "log" 10 11 "strings" ··· 13 14 14 15 _ "modernc.org/sqlite" 15 16 ) 17 + 18 + // ErrMessageNotFound is returned by UpdateMessageStatus when the 19 + // targeted row does not exist. Caused by a delivery callback firing 20 + // for a spool-only entry whose DB row was never inserted (or was 21 + // purged early). Callers should log + increment a metric so the 22 + // orphan rate is visible — silently dropping these updates is the 23 + // safety hole closed by #208. 24 + var ErrMessageNotFound = errors.New("relaystore: message row not found") 16 25 17 26 // Member status constants. 18 27 const ( ··· 36 45 MsgQueued = "queued" 37 46 MsgSent = "sent" 38 47 MsgBounced = "bounced" 48 + // MsgFailed is the terminal state for messages we lost internally 49 + // (orphan reconciliation, spool corruption). Distinct from 50 + // MsgBounced so operators can distinguish receiver-side rejection 51 + // from our own pipeline failure when reading the dashboard. 52 + MsgFailed = "failed" 39 53 MsgDeferred = "deferred" 40 54 ) 41 55 ··· 147 161 } 148 162 149 163 type Store struct { 150 - db *sql.DB 151 - rateMu sync.Mutex // serializes CheckAndIncrementRate to prevent TOCTOU 164 + db *sql.DB 165 + rateMu sync.Mutex // serializes CheckAndIncrementRate to prevent TOCTOU 166 + busyRecorder BusyRecorder // optional; counts SQLITE_BUSY errors at hot writers (#210) 152 167 } 153 168 154 169 func New(dsn string) (*Store, error) { 155 170 db, err := sql.Open("sqlite", dsn) 156 171 if err != nil { 157 - return nil, fmt.Errorf("open sqlite: %v", err) 172 + return nil, fmt.Errorf("open sqlite: %w", err) 158 173 } 159 174 if _, err := db.Exec("PRAGMA journal_mode=WAL"); err != nil { 160 175 db.Close() 161 - return nil, fmt.Errorf("set WAL mode: %v", err) 176 + return nil, fmt.Errorf("set WAL mode: %w", err) 162 177 } 163 178 if _, err := db.Exec("PRAGMA busy_timeout = 5000"); err != nil { 164 179 db.Close() 165 - return nil, fmt.Errorf("set busy timeout: %v", err) 180 + return nil, fmt.Errorf("set busy timeout: %w", err) 166 181 } 167 182 if _, err := db.Exec("PRAGMA foreign_keys=ON"); err != nil { 168 183 db.Close() 169 - return nil, fmt.Errorf("enable foreign keys: %v", err) 184 + return nil, fmt.Errorf("enable foreign keys: %w", err) 170 185 } 171 186 s := &Store{db: db} 172 187 if err := s.migrate(); err != nil { 173 188 db.Close() 174 - return nil, fmt.Errorf("migrate: %v", err) 189 + return nil, fmt.Errorf("migrate: %w", err) 175 190 } 176 191 return s, nil 177 192 } ··· 185 200 } 186 201 187 202 func (s *Store) migrate() error { 203 + // Schema-version guard: refuse to start if the DB has been written to 204 + // by a newer binary than this one. Without this an old rollback would 205 + // silently use ALTER TABLE / INSERT DEFAULTS on a schema it doesn't 206 + // understand and corrupt the data the newer binary persisted (#224). 
207 + if _, err := s.EnsureSchemaVersion(); err != nil { 208 + return err 209 + } 210 + 188 211 // Check if old schema exists (members table has 'domain' column) 189 212 var hasDomainCol int 190 213 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('members') WHERE name = 'domain'`).Scan(&hasDomainCol) 191 214 192 215 if hasDomainCol > 0 { 193 216 if err := s.migrateToMultiDomain(); err != nil { 194 - return fmt.Errorf("multi-domain migration: %v", err) 217 + return fmt.Errorf("multi-domain migration: %w", err) 195 218 } 196 219 } 197 220 ··· 260 283 CREATE INDEX IF NOT EXISTS idx_feedback_events_member ON feedback_events(member_did); 261 284 262 285 CREATE TABLE IF NOT EXISTS bypass_dids ( 263 - did TEXT PRIMARY KEY 286 + did TEXT PRIMARY KEY, 287 + expires_at TEXT NOT NULL DEFAULT '', 288 + reason TEXT NOT NULL DEFAULT '', 289 + created_at TEXT NOT NULL DEFAULT '' 264 290 ); 265 291 292 + -- bypass_audit retains an immutable log of every add/remove so 293 + -- compromise or accidental mass-bypass can be reconstructed 294 + -- after the fact. The active bypass set lives in bypass_dids; 295 + -- this table is append-only. 296 + CREATE TABLE IF NOT EXISTS bypass_audit ( 297 + id INTEGER PRIMARY KEY AUTOINCREMENT, 298 + did TEXT NOT NULL, 299 + action TEXT NOT NULL, -- 'add' or 'remove' 300 + reason TEXT NOT NULL DEFAULT '', 301 + expires_at TEXT NOT NULL DEFAULT '', -- only meaningful for 'add' 302 + created_at TEXT NOT NULL 303 + ); 304 + CREATE INDEX IF NOT EXISTS bypass_audit_created_at_idx 305 + ON bypass_audit(created_at); 306 + 266 307 CREATE TABLE IF NOT EXISTS suppressions ( 267 308 member_did TEXT NOT NULL, 268 309 recipient_addr TEXT NOT NULL, ··· 357 398 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'forward_to'`).Scan(&hasForwardTo) 358 399 if hasForwardTo == 0 { 359 400 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD COLUMN forward_to TEXT NOT NULL DEFAULT ''`); err != nil { 360 - return fmt.Errorf("add forward_to column: %v", err) 401 + return fmt.Errorf("add forward_to column: %w", err) 361 402 } 362 403 } 363 404 ··· 369 410 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('members') WHERE name = 'did_verified'`).Scan(&hasDIDVerified) 370 411 if hasDIDVerified == 0 { 371 412 if _, err := s.db.Exec(`ALTER TABLE members ADD COLUMN did_verified INTEGER NOT NULL DEFAULT 0`); err != nil { 372 - return fmt.Errorf("add did_verified column: %v", err) 413 + return fmt.Errorf("add did_verified column: %w", err) 373 414 } 374 415 } 375 416 ··· 380 421 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('pending_enrollments') WHERE name = 'contact_email'`).Scan(&hasPendingContactEmail) 381 422 if hasPendingContactEmail == 0 { 382 423 if _, err := s.db.Exec(`ALTER TABLE pending_enrollments ADD COLUMN contact_email TEXT NOT NULL DEFAULT ''`); err != nil { 383 - return fmt.Errorf("add contact_email to pending_enrollments: %v", err) 424 + return fmt.Errorf("add contact_email to pending_enrollments: %w", err) 384 425 } 385 426 } 386 427 ··· 393 434 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'contact_email'`).Scan(&hasContactEmail) 394 435 if hasContactEmail == 0 { 395 436 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD COLUMN contact_email TEXT NOT NULL DEFAULT ''`); err != nil { 396 - return fmt.Errorf("add contact_email column: %v", err) 437 + return fmt.Errorf("add contact_email column: %w", err) 397 438 } 398 439 } 399 440 ··· 406 447 _ = s.db.QueryRow(`SELECT 
COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'attestation_rkey'`).Scan(&hasAttRkey) 407 448 if hasAttRkey == 0 { 408 449 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD COLUMN attestation_rkey TEXT NOT NULL DEFAULT ''`); err != nil { 409 - return fmt.Errorf("add attestation_rkey column: %v", err) 450 + return fmt.Errorf("add attestation_rkey column: %w", err) 410 451 } 411 452 } 412 453 var hasAttAt int 413 454 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'attestation_published_at'`).Scan(&hasAttAt) 414 455 if hasAttAt == 0 { 415 456 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD COLUMN attestation_published_at TEXT NOT NULL DEFAULT ''`); err != nil { 416 - return fmt.Errorf("add attestation_published_at column: %v", err) 457 + return fmt.Errorf("add attestation_published_at column: %w", err) 417 458 } 418 459 } 419 460 ··· 423 464 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('members') WHERE name = 'terms_accepted_at'`).Scan(&hasTermsAcceptedAt) 424 465 if hasTermsAcceptedAt == 0 { 425 466 if _, err := s.db.Exec(`ALTER TABLE members ADD COLUMN terms_accepted_at TEXT NOT NULL DEFAULT ''`); err != nil { 426 - return fmt.Errorf("add terms_accepted_at column: %v", err) 467 + return fmt.Errorf("add terms_accepted_at column: %w", err) 427 468 } 428 469 } 429 470 var hasTermsVersion int 430 471 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('members') WHERE name = 'terms_version'`).Scan(&hasTermsVersion) 431 472 if hasTermsVersion == 0 { 432 473 if _, err := s.db.Exec(`ALTER TABLE members ADD COLUMN terms_version TEXT NOT NULL DEFAULT ''`); err != nil { 433 - return fmt.Errorf("add terms_version column: %v", err) 474 + return fmt.Errorf("add terms_version column: %w", err) 434 475 } 435 476 } 436 477 ··· 440 481 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('pending_enrollments') WHERE name = 'terms_accepted'`).Scan(&hasPendingTerms) 441 482 if hasPendingTerms == 0 { 442 483 if _, err := s.db.Exec(`ALTER TABLE pending_enrollments ADD COLUMN terms_accepted INTEGER NOT NULL DEFAULT 0`); err != nil { 443 - return fmt.Errorf("add terms_accepted to pending_enrollments: %v", err) 484 + return fmt.Errorf("add terms_accepted to pending_enrollments: %w", err) 444 485 } 445 486 } 446 487 ··· 453 494 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'email_verified'`).Scan(&hasEmailVerified) 454 495 if hasEmailVerified == 0 { 455 496 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD COLUMN email_verified INTEGER DEFAULT 0`); err != nil { 456 - return fmt.Errorf("add email_verified column: %v", err) 497 + return fmt.Errorf("add email_verified column: %w", err) 457 498 } 458 499 } 459 500 var hasEmailVerifyToken int 460 501 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'email_verify_token'`).Scan(&hasEmailVerifyToken) 461 502 if hasEmailVerifyToken == 0 { 462 503 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD COLUMN email_verify_token TEXT DEFAULT ''`); err != nil { 463 - return fmt.Errorf("add email_verify_token column: %v", err) 504 + return fmt.Errorf("add email_verify_token column: %w", err) 464 505 } 465 506 } 466 507 var hasEmailVerifyExpires int 467 508 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('member_domains') WHERE name = 'email_verify_expires'`).Scan(&hasEmailVerifyExpires) 468 509 if hasEmailVerifyExpires == 0 { 469 510 if _, err := s.db.Exec(`ALTER TABLE member_domains ADD 
COLUMN email_verify_expires TEXT DEFAULT ''`); err != nil { 470 - return fmt.Errorf("add email_verify_expires column: %v", err) 511 + return fmt.Errorf("add email_verify_expires column: %w", err) 471 512 } 472 513 } 473 514 ··· 479 520 _ = s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('messages') WHERE name = 'content_fingerprint'`).Scan(&hasFP) 480 521 if hasFP == 0 { 481 522 if _, err := s.db.Exec(`ALTER TABLE messages ADD COLUMN content_fingerprint TEXT NOT NULL DEFAULT ''`); err != nil { 482 - return fmt.Errorf("add content_fingerprint column: %v", err) 523 + return fmt.Errorf("add content_fingerprint column: %w", err) 483 524 } 484 525 if _, err := s.db.Exec(`CREATE INDEX IF NOT EXISTS idx_messages_fingerprint ON messages(member_did, content_fingerprint, created_at)`); err != nil { 485 - return fmt.Errorf("create fingerprint index: %v", err) 526 + return fmt.Errorf("create fingerprint index: %w", err) 486 527 } 487 528 } 488 529 ··· 503 544 if err := s.migratePendingNotifications(); err != nil { 504 545 return err 505 546 } 547 + // Bypass-DID expiry + audit columns (#213). Existing deployments 548 + // have a bypass_dids table without expires_at/reason/created_at — 549 + // add them as defaults so we don't lose any active bypass on the 550 + // migration. Old rows get expires_at='' which the new ListBypassDIDs 551 + // treats as "permanent" (matching legacy behavior); operators are 552 + // expected to re-add with expiry as part of the rollout runbook. 553 + if err := s.migrateBypassExpiry(); err != nil { 554 + return err 555 + } 556 + 557 + // All structural changes applied — record the version so a future 558 + // downgrade can detect that this binary already touched the DB. 559 + if err := s.RecordSchemaVersion("baseline (multi-domain, full schema)", ""); err != nil { 560 + return err 561 + } 562 + return nil 563 + } 564 + 565 + // migrateBypassExpiry adds expires_at/reason/created_at to bypass_dids 566 + // on existing deployments and creates the bypass_audit table if it 567 + // doesn't already exist (the CREATE TABLE IF NOT EXISTS at top of 568 + // migrate() handles the audit table; this function is for the columns 569 + // on bypass_dids itself). 
570 + func (s *Store) migrateBypassExpiry() error { 571 + type col struct{ name, sql string } 572 + wanted := []col{ 573 + {"expires_at", `ALTER TABLE bypass_dids ADD COLUMN expires_at TEXT NOT NULL DEFAULT ''`}, 574 + {"reason", `ALTER TABLE bypass_dids ADD COLUMN reason TEXT NOT NULL DEFAULT ''`}, 575 + {"created_at", `ALTER TABLE bypass_dids ADD COLUMN created_at TEXT NOT NULL DEFAULT ''`}, 576 + } 577 + for _, c := range wanted { 578 + var n int 579 + if err := s.db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('bypass_dids') WHERE name = ?`, c.name).Scan(&n); err != nil { 580 + return fmt.Errorf("check bypass_dids.%s: %w", c.name, err) 581 + } 582 + if n == 0 { 583 + if _, err := s.db.Exec(c.sql); err != nil { 584 + return fmt.Errorf("add bypass_dids.%s: %w", c.name, err) 585 + } 586 + } 587 + } 506 588 return nil 507 589 } 508 590 ··· 511 593 func (s *Store) migrateToMultiDomain() error { 512 594 // Must disable FK checks outside a transaction for SQLite 513 595 if _, err := s.db.Exec("PRAGMA foreign_keys=OFF"); err != nil { 514 - return fmt.Errorf("disable FK: %v", err) 596 + return fmt.Errorf("disable FK: %w", err) 515 597 } 516 598 defer s.db.Exec("PRAGMA foreign_keys=ON") 517 599 518 600 tx, err := s.db.Begin() 519 601 if err != nil { 520 - return fmt.Errorf("begin tx: %v", err) 602 + return fmt.Errorf("begin tx: %w", err) 521 603 } 522 604 defer tx.Rollback() 523 605 ··· 536 618 SELECT domain, did, api_key_hash, dkim_rsa_privkey, dkim_ed_privkey, dkim_selector, created_at FROM members; 537 619 `) 538 620 if err != nil { 539 - return fmt.Errorf("create member_domains: %v", err) 621 + return fmt.Errorf("create member_domains: %w", err) 540 622 } 541 623 542 624 // Recreate members without domain-level columns ··· 557 639 ALTER TABLE members_v2 RENAME TO members; 558 640 `) 559 641 if err != nil { 560 - return fmt.Errorf("recreate members: %v", err) 642 + return fmt.Errorf("recreate members: %w", err) 561 643 } 562 644 563 645 _, err = tx.Exec(`CREATE INDEX idx_member_domains_did ON member_domains(did)`) 564 646 if err != nil { 565 - return fmt.Errorf("create domain index: %v", err) 647 + return fmt.Errorf("create domain index: %w", err) 566 648 } 567 649 568 650 log.Printf("relaystore: migrated to multi-domain schema") ··· 581 663 formatTime(m.CreatedAt), formatTime(m.UpdatedAt), 582 664 ) 583 665 if err != nil { 584 - return fmt.Errorf("insert member: %v", err) 666 + return fmt.Errorf("insert member: %w", err) 585 667 } 586 668 return nil 587 669 } ··· 592 674 func (s *Store) EnrollMember(ctx context.Context, member *Member, domain *MemberDomain) error { 593 675 tx, err := s.db.BeginTx(ctx, nil) 594 676 if err != nil { 595 - return fmt.Errorf("enroll begin tx: %v", err) 677 + return fmt.Errorf("enroll begin tx: %w", err) 596 678 } 597 679 defer tx.Rollback() 598 680 ··· 607 689 formatTime(member.CreatedAt), formatTime(member.UpdatedAt), 608 690 ) 609 691 if err != nil { 610 - return fmt.Errorf("enroll insert member: %v", err) 692 + return fmt.Errorf("enroll insert member: %w", err) 611 693 } 612 694 } 613 695 ··· 618 700 formatTime(domain.CreatedAt), 619 701 ) 620 702 if err != nil { 621 - return fmt.Errorf("enroll insert domain: %v", err) 703 + return fmt.Errorf("enroll insert domain: %w", err) 622 704 } 623 705 624 706 return tx.Commit() ··· 641 723 ORDER BY m.created_at ASC`, 642 724 ) 643 725 if err != nil { 644 - return nil, fmt.Errorf("list members with domains: %v", err) 726 + return nil, fmt.Errorf("list members with domains: %w", err) 645 727 } 646 728 defer 
rows.Close() 647 729 ··· 656 738 &termsAcceptedAt, &mwd.TermsVersion, 657 739 &createdAt, &updatedAt, &domainCSV, 658 740 ); err != nil { 659 - return nil, fmt.Errorf("scan member with domains: %v", err) 741 + return nil, fmt.Errorf("scan member with domains: %w", err) 660 742 } 661 743 mwd.DIDVerified = didVerified != 0 662 744 mwd.TermsAcceptedAt = parseTime(termsAcceptedAt) ··· 684 766 FROM members ORDER BY created_at ASC`, 685 767 ) 686 768 if err != nil { 687 - return nil, fmt.Errorf("list members: %v", err) 769 + return nil, fmt.Errorf("list members: %w", err) 688 770 } 689 771 defer rows.Close() 690 772 ··· 712 794 func (s *Store) DeleteMember(ctx context.Context, did string) error { 713 795 tx, err := s.db.BeginTx(ctx, nil) 714 796 if err != nil { 715 - return fmt.Errorf("delete begin tx: %v", err) 797 + return fmt.Errorf("delete begin tx: %w", err) 716 798 } 717 799 defer tx.Rollback() 718 800 ··· 724 806 } 725 807 for _, d := range deletes { 726 808 if _, err := tx.ExecContext(ctx, "DELETE FROM "+d.table+" WHERE "+d.col+" = ?", did); err != nil { 727 - return fmt.Errorf("delete from %s: %v", d.table, err) 809 + return fmt.Errorf("delete from %s: %w", d.table, err) 728 810 } 729 811 } 730 812 if _, err := tx.ExecContext(ctx, "DELETE FROM member_domains WHERE did = ?", did); err != nil { 731 - return fmt.Errorf("delete member_domains: %v", err) 813 + return fmt.Errorf("delete member_domains: %w", err) 732 814 } 733 815 if _, err := tx.ExecContext(ctx, "DELETE FROM members WHERE did = ?", did); err != nil { 734 - return fmt.Errorf("delete member: %v", err) 816 + return fmt.Errorf("delete member: %w", err) 735 817 } 736 818 737 819 return tx.Commit() ··· 742 824 `UPDATE members SET send_count = send_count + 1, updated_at = ? WHERE did = ?`, 743 825 formatTime(time.Now().UTC()), did, 744 826 ) 745 - return err 827 + return s.recordIfBusy("increment_send_count", err) 746 828 } 747 829 748 830 // scanner is satisfied by both *sql.Row and *sql.Rows. ··· 765 847 return nil, nil 766 848 } 767 849 if err != nil { 768 - return nil, fmt.Errorf("scan member: %v", err) 850 + return nil, fmt.Errorf("scan member: %w", err) 769 851 } 770 852 771 853 m.DIDVerified = didVerified != 0 ··· 785 867 formatTime(d.CreatedAt), 786 868 ) 787 869 if err != nil { 788 - return fmt.Errorf("insert member domain: %v", err) 870 + return fmt.Errorf("insert member domain: %w", err) 789 871 } 790 872 return nil 791 873 } ··· 802 884 return nil, nil 803 885 } 804 886 if err != nil { 805 - return nil, fmt.Errorf("get member domain: %v", err) 887 + return nil, fmt.Errorf("get member domain: %w", err) 806 888 } 807 889 d.EmailVerified = emailVerified != 0 808 890 d.CreatedAt = parseTime(createdAt) ··· 815 897 FROM member_domains WHERE did = ? 
ORDER BY created_at ASC`, did, 816 898 ) 817 899 if err != nil { 818 - return nil, fmt.Errorf("list member domains: %v", err) 900 + return nil, fmt.Errorf("list member domains: %w", err) 819 901 } 820 902 defer rows.Close() 821 903 ··· 825 907 var createdAt string 826 908 var emailVerified int 827 909 if err := rows.Scan(&d.Domain, &d.DID, &d.APIKeyHash, &d.DKIMRSAPriv, &d.DKIMEdPriv, &d.DKIMSelector, &d.ForwardTo, &d.ContactEmail, &emailVerified, &createdAt); err != nil { 828 - return nil, fmt.Errorf("scan member domain: %v", err) 910 + return nil, fmt.Errorf("scan member domain: %w", err) 829 911 } 830 912 d.EmailVerified = emailVerified != 0 831 913 d.CreatedAt = parseTime(createdAt) ··· 844 926 hash, domain, 845 927 ) 846 928 if err != nil { 847 - return fmt.Errorf("update api_key_hash: %v", err) 929 + return fmt.Errorf("update api_key_hash: %w", err) 848 930 } 849 931 n, err := res.RowsAffected() 850 932 if err != nil { 851 - return fmt.Errorf("update api_key_hash rows: %v", err) 933 + return fmt.Errorf("update api_key_hash rows: %w", err) 852 934 } 853 935 if n == 0 { 854 936 return fmt.Errorf("domain %q not registered", domain) ··· 867 949 contactEmail, domain, 868 950 ) 869 951 if err != nil { 870 - return fmt.Errorf("update contact_email: %v", err) 952 + return fmt.Errorf("update contact_email: %w", err) 871 953 } 872 954 n, err := res.RowsAffected() 873 955 if err != nil { 874 - return fmt.Errorf("update contact_email rows: %v", err) 956 + return fmt.Errorf("update contact_email rows: %w", err) 875 957 } 876 958 if n == 0 { 877 959 return fmt.Errorf("domain %q not registered", domain) ··· 890 972 token, formatTime(expiresAt), domain, 891 973 ) 892 974 if err != nil { 893 - return fmt.Errorf("set email verify token: %v", err) 975 + return fmt.Errorf("set email verify token: %w", err) 894 976 } 895 977 n, err := res.RowsAffected() 896 978 if err != nil { 897 - return fmt.Errorf("set email verify token rows: %v", err) 979 + return fmt.Errorf("set email verify token rows: %w", err) 898 980 } 899 981 if n == 0 { 900 982 return fmt.Errorf("domain %q not registered", domain) ··· 920 1002 return "", fmt.Errorf("verification token not found") 921 1003 } 922 1004 if err != nil { 923 - return "", fmt.Errorf("verify email lookup: %v", err) 1005 + return "", fmt.Errorf("verify email lookup: %w", err) 924 1006 } 925 1007 expiresAt := parseTime(expiresAtStr) 926 1008 if !expiresAt.IsZero() && time.Now().UTC().After(expiresAt) { ··· 936 1018 domain, 937 1019 ) 938 1020 if err != nil { 939 - return "", fmt.Errorf("mark email verified: %v", err) 1021 + return "", fmt.Errorf("mark email verified: %w", err) 940 1022 } 941 1023 return domain, nil 942 1024 } ··· 953 1035 return false, nil 954 1036 } 955 1037 if err != nil { 956 - return false, fmt.Errorf("is email verified: %v", err) 1038 + return false, fmt.Errorf("is email verified: %w", err) 957 1039 } 958 1040 return verified != 0, nil 959 1041 } ··· 967 1049 domain, 968 1050 ) 969 1051 if err != nil { 970 - return fmt.Errorf("reset email verification: %v", err) 1052 + return fmt.Errorf("reset email verification: %w", err) 971 1053 } 972 1054 return nil 973 1055 } ··· 995 1077 return nil, nil, nil 996 1078 } 997 1079 if err != nil { 998 - return nil, nil, fmt.Errorf("get member by domain: %v", err) 1080 + return nil, nil, fmt.Errorf("get member by domain: %w", err) 999 1081 } 1000 1082 1001 1083 m.DIDVerified = didVerified != 0 ··· 1016 1098 forwardTo, domain, 1017 1099 ) 1018 1100 if err != nil { 1019 - return fmt.Errorf("set forward_to: %v", 
err) 1101 + return fmt.Errorf("set forward_to: %w", err) 1020 1102 } 1021 1103 n, err := res.RowsAffected() 1022 1104 if err != nil { 1023 - return fmt.Errorf("set forward_to rows: %v", err) 1105 + return fmt.Errorf("set forward_to rows: %w", err) 1024 1106 } 1025 1107 if n == 0 { 1026 1108 return fmt.Errorf("domain %q not registered", domain) ··· 1054 1136 formatTime(m.CreatedAt), formatTime(m.DeliveredAt), m.ContentFingerprint, 1055 1137 ) 1056 1138 if err != nil { 1057 - return 0, fmt.Errorf("insert message: %v", err) 1139 + return 0, fmt.Errorf("insert message: %w", s.recordIfBusy("insert_message", err)) 1058 1140 } 1059 1141 return res.LastInsertId() 1060 1142 } ··· 1080 1162 if status == MsgSent { 1081 1163 deliveredAt = formatTime(time.Now().UTC()) 1082 1164 } 1083 - _, err := s.db.ExecContext(ctx, 1165 + res, err := s.db.ExecContext(ctx, 1084 1166 `UPDATE messages SET status = ?, smtp_code = ?, delivered_at = ? WHERE id = ?`, 1085 1167 status, smtpCode, deliveredAt, id, 1086 1168 ) 1087 - return err 1169 + if err != nil { 1170 + return s.recordIfBusy("update_message_status", err) 1171 + } 1172 + rows, err := res.RowsAffected() 1173 + if err != nil { 1174 + return fmt.Errorf("rows affected: %w", err) 1175 + } 1176 + if rows == 0 { 1177 + return ErrMessageNotFound 1178 + } 1179 + return nil 1180 + } 1181 + 1182 + // ListQueuedMessageIDsOlderThan returns message row IDs whose status 1183 + // is still "queued" and whose created_at is at least minAge old. 1184 + // Used by the orphan-reconciliation janitor to find rows whose spool 1185 + // file vanished (Enqueue failure mid-batch, manual spool wipe, FS 1186 + // corruption). minAge prevents racing with rows that were just 1187 + // inserted but haven't had their spool file landed yet. 1188 + func (s *Store) ListQueuedMessageIDsOlderThan(ctx context.Context, minAge time.Duration, limit int) ([]int64, error) { 1189 + if limit <= 0 { 1190 + limit = 100 1191 + } 1192 + cutoff := formatTime(time.Now().UTC().Add(-minAge)) 1193 + rows, err := s.db.QueryContext(ctx, 1194 + `SELECT id FROM messages WHERE status = ? AND created_at < ? 
ORDER BY id ASC LIMIT ?`, 1195 + MsgQueued, cutoff, limit, 1196 + ) 1197 + if err != nil { 1198 + return nil, fmt.Errorf("query queued: %w", err) 1199 + } 1200 + defer rows.Close() 1201 + var ids []int64 1202 + for rows.Next() { 1203 + var id int64 1204 + if err := rows.Scan(&id); err != nil { 1205 + return nil, fmt.Errorf("scan queued id: %w", err) 1206 + } 1207 + ids = append(ids, id) 1208 + } 1209 + if err := rows.Err(); err != nil { 1210 + return nil, fmt.Errorf("iter queued ids: %w", err) 1211 + } 1212 + return ids, nil 1088 1213 } 1089 1214 1090 1215 func scanMessage(sc scanner) (*Message, error) { ··· 1099 1224 return nil, nil 1100 1225 } 1101 1226 if err != nil { 1102 - return nil, fmt.Errorf("scan message: %v", err) 1227 + return nil, fmt.Errorf("scan message: %w", err) 1103 1228 } 1104 1229 1105 1230 m.CreatedAt = parseTime(createdAt) ··· 1150 1275 if err == sql.ErrNoRows { 1151 1276 current = 0 1152 1277 } else if err != nil { 1153 - return 0, fmt.Errorf("read counter: %v", err) 1278 + return 0, fmt.Errorf("read counter: %w", err) 1154 1279 } 1155 1280 1156 1281 if current+count > limit { ··· 1165 1290 did, windowType, formatTime(windowStart), count, count, 1166 1291 ) 1167 1292 if err != nil { 1168 - return current, fmt.Errorf("increment counter: %v", err) 1293 + return current, fmt.Errorf("increment counter: %w", err) 1169 1294 } 1170 1295 1171 1296 return current, nil ··· 1202 1327 e.MemberDID, e.EventType, e.MessageID, e.Recipient, e.Details, formatTime(e.CreatedAt), 1203 1328 ) 1204 1329 if err != nil { 1205 - return 0, fmt.Errorf("insert feedback event: %v", err) 1330 + return 0, fmt.Errorf("insert feedback event: %w", err) 1206 1331 } 1207 1332 return res.LastInsertId() 1208 1333 } ··· 1216 1341 memberDID, MsgSent, MsgBounced, formatTime(since), 1217 1342 ).Scan(&total) 1218 1343 if err != nil { 1219 - return 0, 0, fmt.Errorf("count terminal: %v", err) 1344 + return 0, 0, fmt.Errorf("count terminal: %w", err) 1220 1345 } 1221 1346 1222 1347 err = s.db.QueryRowContext(ctx, ··· 1224 1349 memberDID, MsgBounced, formatTime(since), 1225 1350 ).Scan(&bounced) 1226 1351 if err != nil { 1227 - return 0, 0, fmt.Errorf("count bounced: %v", err) 1352 + return 0, 0, fmt.Errorf("count bounced: %w", err) 1228 1353 } 1229 1354 1230 1355 return total, bounced, nil 1231 1356 } 1232 1357 1358 + // GetDailySendCounts returns per-day terminal (sent+bounced) message counts 1359 + // for the last n days, oldest-to-newest. Days with zero sends are included 1360 + // so callers get a fixed-length slice suitable for sparklines. 1361 + func (s *Store) GetDailySendCounts(ctx context.Context, memberDID string, days int) ([]int64, error) { 1362 + if days <= 0 { 1363 + days = 14 1364 + } 1365 + // Compute the inclusive cutoff in Go so SQLite parameter binding 1366 + // works cleanly. date('now', 'localtime', 'start of day', '-13 days') 1367 + // gives the first instant of the oldest day we care about. 1368 + cutoff := time.Now().UTC().AddDate(0, 0, -(days - 1)).Format("2006-01-02") 1369 + 1370 + rows, err := s.db.QueryContext(ctx, 1371 + `SELECT date(created_at) as day, COUNT(*) 1372 + FROM messages 1373 + WHERE member_did = ? AND status IN (?, ?) AND date(created_at) >= ? 
1374 + GROUP BY day 1375 + ORDER BY day ASC`, 1376 + memberDID, MsgSent, MsgBounced, cutoff, 1377 + ) 1378 + if err != nil { 1379 + return nil, fmt.Errorf("daily send counts: %w", err) 1380 + } 1381 + defer rows.Close() 1382 + 1383 + counts := make(map[string]int64) 1384 + for rows.Next() { 1385 + var day string 1386 + var c int64 1387 + if err := rows.Scan(&day, &c); err != nil { 1388 + return nil, fmt.Errorf("scan daily count: %w", err) 1389 + } 1390 + counts[day] = c 1391 + } 1392 + if err := rows.Err(); err != nil { 1393 + return nil, fmt.Errorf("daily send counts rows: %w", err) 1394 + } 1395 + 1396 + // Fill in zero days so the slice is exactly `days` long. 1397 + out := make([]int64, days) 1398 + now := time.Now().UTC() 1399 + for i := 0; i < days; i++ { 1400 + day := now.AddDate(0, 0, -(days-1-i)).Format("2006-01-02") 1401 + out[i] = counts[day] 1402 + } 1403 + return out, nil 1404 + } 1405 + 1406 + // GetComplaintCount returns the number of feedback_events with event_type 1407 + // 'complaint' for the member since the given time. 1408 + func (s *Store) GetComplaintCount(ctx context.Context, memberDID string, since time.Time) (int64, error) { 1409 + var n int64 1410 + err := s.db.QueryRowContext(ctx, 1411 + `SELECT COUNT(*) FROM feedback_events 1412 + WHERE member_did = ? AND event_type = ? AND created_at >= ?`, 1413 + memberDID, "complaint", formatTime(since), 1414 + ).Scan(&n) 1415 + if err != nil { 1416 + return 0, fmt.Errorf("count complaints: %w", err) 1417 + } 1418 + return n, nil 1419 + } 1420 + 1233 1421 // GetUniqueRecipientDomainsSince counts DISTINCT recipient domains a member 1234 1422 // has sent to since the given time. Used by the DomainSpray detection rule — 1235 1423 // legitimate transactional mail usually goes to a small handful of domains; ··· 1246 1434 memberDID, formatTime(since), 1247 1435 ).Scan(&n) 1248 1436 if err != nil { 1249 - return 0, fmt.Errorf("count unique recipient domains: %v", err) 1437 + return 0, fmt.Errorf("count unique recipient domains: %w", err) 1250 1438 } 1251 1439 return n, nil 1252 1440 } ··· 1266 1454 memberDID, formatTime(since), 1267 1455 ).Scan(&n) 1268 1456 if err != nil { 1269 - return 0, fmt.Errorf("count sends since: %v", err) 1457 + return 0, fmt.Errorf("count sends since: %w", err) 1270 1458 } 1271 1459 return n, nil 1272 1460 } ··· 1288 1476 memberDID, fingerprint, formatTime(since), 1289 1477 ).Scan(&n) 1290 1478 if err != nil { 1291 - return 0, fmt.Errorf("count same-content recipients: %v", err) 1479 + return 0, fmt.Errorf("count same-content recipients: %w", err) 1292 1480 } 1293 1481 return n, nil 1294 1482 } ··· 1304 1492 MsgSent, MsgBounced, formatTime(before), 1305 1493 ) 1306 1494 if err != nil { 1307 - return 0, fmt.Errorf("purge old messages: %v", err) 1495 + return 0, fmt.Errorf("purge old messages: %w", err) 1308 1496 } 1309 1497 return res.RowsAffected() 1310 1498 } 1311 1499 1312 1500 // --- Bypass DIDs --- 1313 1501 1314 - // InsertBypassDID adds a DID to the label bypass list. Idempotent. 1315 - func (s *Store) InsertBypassDID(ctx context.Context, did string) error { 1316 - _, err := s.db.ExecContext(ctx, 1317 - `INSERT OR IGNORE INTO bypass_dids (did) VALUES (?)`, did, 1318 - ) 1319 - return err 1502 + // BypassEntry pairs a bypassed DID with its lifecycle metadata. Empty 1503 + // expiresAt means "permanent" — supported only for legacy entries 1504 + // migrated from the pre-#213 schema; new entries always carry an 1505 + // explicit expiry capped at 30 days. 
1506 + type BypassEntry struct { 1507 + DID string 1508 + ExpiresAt time.Time // zero value = legacy permanent 1509 + Reason string 1510 + CreatedAt time.Time 1511 + } 1512 + 1513 + // BypassAuditEntry is one append-only row in the bypass_audit table. 1514 + // Action is "add" or "remove". Used for incident reconstruction; not 1515 + // served on the dashboard. 1516 + type BypassAuditEntry struct { 1517 + ID int64 1518 + DID string 1519 + Action string 1520 + Reason string 1521 + ExpiresAt time.Time 1522 + CreatedAt time.Time 1523 + } 1524 + 1525 + // InsertBypassDID adds a DID to the label bypass list and writes a 1526 + // matching audit row in the same transaction. expiresAt may be zero 1527 + // only for the legacy permanent path used by migration restoration; 1528 + // new admin-driven calls always pass a non-zero expiry. Idempotent 1529 + // in the bypass_dids set (INSERT OR REPLACE) but every call appends 1530 + // to bypass_audit so a re-issue is observable. 1531 + func (s *Store) InsertBypassDID(ctx context.Context, did string, expiresAt time.Time, reason string) error { 1532 + now := time.Now().UTC() 1533 + tx, err := s.db.BeginTx(ctx, nil) 1534 + if err != nil { 1535 + return fmt.Errorf("begin: %w", err) 1536 + } 1537 + defer tx.Rollback() 1538 + if _, err := tx.ExecContext(ctx, 1539 + `INSERT INTO bypass_dids (did, expires_at, reason, created_at) 1540 + VALUES (?, ?, ?, ?) 1541 + ON CONFLICT(did) DO UPDATE SET 1542 + expires_at = excluded.expires_at, 1543 + reason = excluded.reason, 1544 + created_at = excluded.created_at`, 1545 + did, formatTime(expiresAt), reason, formatTime(now), 1546 + ); err != nil { 1547 + return fmt.Errorf("insert bypass: %w", err) 1548 + } 1549 + if _, err := tx.ExecContext(ctx, 1550 + `INSERT INTO bypass_audit (did, action, reason, expires_at, created_at) 1551 + VALUES (?, 'add', ?, ?, ?)`, 1552 + did, reason, formatTime(expiresAt), formatTime(now), 1553 + ); err != nil { 1554 + return fmt.Errorf("insert audit: %w", err) 1555 + } 1556 + return tx.Commit() 1557 + } 1558 + 1559 + // DeleteBypassDID removes a DID from the label bypass list and writes 1560 + // an audit row noting the removal. reason names the trigger ("manual", 1561 + // "expired", etc.) so post-hoc analysis can distinguish operator 1562 + // action from janitor cleanup. 1563 + func (s *Store) DeleteBypassDID(ctx context.Context, did, reason string) error { 1564 + now := time.Now().UTC() 1565 + tx, err := s.db.BeginTx(ctx, nil) 1566 + if err != nil { 1567 + return fmt.Errorf("begin: %w", err) 1568 + } 1569 + defer tx.Rollback() 1570 + if _, err := tx.ExecContext(ctx, `DELETE FROM bypass_dids WHERE did = ?`, did); err != nil { 1571 + return fmt.Errorf("delete bypass: %w", err) 1572 + } 1573 + if _, err := tx.ExecContext(ctx, 1574 + `INSERT INTO bypass_audit (did, action, reason, expires_at, created_at) 1575 + VALUES (?, 'remove', ?, '', ?)`, 1576 + did, reason, formatTime(now), 1577 + ); err != nil { 1578 + return fmt.Errorf("insert audit: %w", err) 1579 + } 1580 + return tx.Commit() 1320 1581 } 1321 1582 1322 - // DeleteBypassDID removes a DID from the label bypass list. 
1323 - func (s *Store) DeleteBypassDID(ctx context.Context, did string) error { 1324 - _, err := s.db.ExecContext(ctx, 1325 - `DELETE FROM bypass_dids WHERE did = ?`, did, 1583 + // PurgeExpiredBypassDIDs deletes bypass entries whose expires_at is 1584 + // non-empty and in the past, writing 'remove' audit rows with reason 1585 + // "expired" so the dashboard can distinguish janitor evictions from 1586 + // operator removals. Returns the number of evicted DIDs. 1587 + // 1588 + // Legacy entries with expires_at='' are NOT touched — they were 1589 + // migrated from a permanent-bypass schema and removing them would be 1590 + // a behavior change the operator hasn't authorized. Convert legacy 1591 + // entries by re-adding with explicit expiry. 1592 + func (s *Store) PurgeExpiredBypassDIDs(ctx context.Context) (int, error) { 1593 + now := time.Now().UTC() 1594 + cutoff := formatTime(now) 1595 + tx, err := s.db.BeginTx(ctx, nil) 1596 + if err != nil { 1597 + return 0, fmt.Errorf("begin: %w", err) 1598 + } 1599 + defer tx.Rollback() 1600 + rows, err := tx.QueryContext(ctx, 1601 + `SELECT did FROM bypass_dids WHERE expires_at != '' AND expires_at < ?`, 1602 + cutoff, 1326 1603 ) 1327 - return err 1604 + if err != nil { 1605 + return 0, fmt.Errorf("scan expired: %w", err) 1606 + } 1607 + var dids []string 1608 + for rows.Next() { 1609 + var d string 1610 + if err := rows.Scan(&d); err != nil { 1611 + rows.Close() 1612 + return 0, fmt.Errorf("scan did: %w", err) 1613 + } 1614 + dids = append(dids, d) 1615 + } 1616 + rows.Close() 1617 + if err := rows.Err(); err != nil { 1618 + return 0, fmt.Errorf("iter expired: %w", err) 1619 + } 1620 + for _, d := range dids { 1621 + if _, err := tx.ExecContext(ctx, `DELETE FROM bypass_dids WHERE did = ?`, d); err != nil { 1622 + return 0, fmt.Errorf("delete %s: %w", d, err) 1623 + } 1624 + if _, err := tx.ExecContext(ctx, 1625 + `INSERT INTO bypass_audit (did, action, reason, expires_at, created_at) 1626 + VALUES (?, 'remove', 'expired', '', ?)`, 1627 + d, cutoff, 1628 + ); err != nil { 1629 + return 0, fmt.Errorf("audit %s: %w", d, err) 1630 + } 1631 + } 1632 + if err := tx.Commit(); err != nil { 1633 + return 0, fmt.Errorf("commit: %w", err) 1634 + } 1635 + return len(dids), nil 1328 1636 } 1329 1637 1330 - // ListBypassDIDs returns all DIDs in the label bypass list. 1638 + // ListBypassDIDs returns all DIDs in the label bypass list, excluding 1639 + // entries whose expiry has already passed. Legacy entries with 1640 + // expires_at='' are always returned (permanent grandfather). 1331 1641 func (s *Store) ListBypassDIDs(ctx context.Context) ([]string, error) { 1332 - rows, err := s.db.QueryContext(ctx, `SELECT did FROM bypass_dids ORDER BY did`) 1642 + now := formatTime(time.Now().UTC()) 1643 + rows, err := s.db.QueryContext(ctx, 1644 + `SELECT did FROM bypass_dids 1645 + WHERE expires_at = '' OR expires_at >= ? 
1646 + ORDER BY did`, 1647 + now, 1648 + ) 1333 1649 if err != nil { 1334 1650 return nil, err 1335 1651 } ··· 1369 1685 memberDID, strings.ToLower(recipient), source, formatTime(time.Now().UTC()), 1370 1686 ) 1371 1687 if err != nil { 1372 - return fmt.Errorf("insert suppression: %v", err) 1688 + return fmt.Errorf("insert suppression: %w", err) 1373 1689 } 1374 1690 return nil 1375 1691 } ··· 1385 1701 return false, nil 1386 1702 } 1387 1703 if err != nil { 1388 - return false, fmt.Errorf("is suppressed: %v", err) 1704 + return false, fmt.Errorf("is suppressed: %w", err) 1389 1705 } 1390 1706 return true, nil 1391 1707 } ··· 1400 1716 memberDID, 1401 1717 ) 1402 1718 if err != nil { 1403 - return nil, fmt.Errorf("list suppressions: %v", err) 1719 + return nil, fmt.Errorf("list suppressions: %w", err) 1404 1720 } 1405 1721 defer rows.Close() 1406 1722 ··· 1425 1741 memberDID, strings.ToLower(recipient), 1426 1742 ) 1427 1743 if err != nil { 1428 - return fmt.Errorf("delete suppression: %v", err) 1744 + return fmt.Errorf("delete suppression: %w", err) 1429 1745 } 1430 1746 return nil 1431 1747 } ··· 1436 1752 var st Stats 1437 1753 err := s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM members`).Scan(&st.Members) 1438 1754 if err != nil { 1439 - return st, fmt.Errorf("count members: %v", err) 1755 + return st, fmt.Errorf("count members: %w", err) 1440 1756 } 1441 1757 err = s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM member_domains`).Scan(&st.Domains) 1442 1758 if err != nil { 1443 - return st, fmt.Errorf("count domains: %v", err) 1759 + return st, fmt.Errorf("count domains: %w", err) 1444 1760 } 1445 1761 err = s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM messages`).Scan(&st.Messages) 1446 1762 if err != nil { 1447 - return st, fmt.Errorf("count messages: %v", err) 1763 + return st, fmt.Errorf("count messages: %w", err) 1448 1764 } 1449 1765 err = s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM messages WHERE status = ?`, MsgBounced).Scan(&st.Bounces) 1450 1766 if err != nil { 1451 - return st, fmt.Errorf("count bounces: %v", err) 1767 + return st, fmt.Errorf("count bounces: %w", err) 1452 1768 } 1453 1769 return st, nil 1454 1770 } ··· 1461 1777 var active, suspended, pending int64 1462 1778 err := s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM members WHERE status = ?`, StatusActive).Scan(&active) 1463 1779 if err != nil { 1464 - return 0, 0, 0, fmt.Errorf("count active members: %v", err) 1780 + return 0, 0, 0, fmt.Errorf("count active members: %w", err) 1465 1781 } 1466 1782 err = s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM members WHERE status = ?`, StatusSuspended).Scan(&suspended) 1467 1783 if err != nil { 1468 - return 0, 0, 0, fmt.Errorf("count suspended members: %v", err) 1784 + return 0, 0, 0, fmt.Errorf("count suspended members: %w", err) 1469 1785 } 1470 1786 err = s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM members WHERE status = ?`, StatusPending).Scan(&pending) 1471 1787 if err != nil { 1472 - return 0, 0, 0, fmt.Errorf("count pending members: %v", err) 1788 + return 0, 0, 0, fmt.Errorf("count pending members: %w", err) 1473 1789 } 1474 1790 return active, suspended, pending, nil 1475 1791 } ··· 1486 1802 rkey, formatTime(at), domain, 1487 1803 ) 1488 1804 if err != nil { 1489 - return fmt.Errorf("set attestation published: %v", err) 1805 + return fmt.Errorf("set attestation published: %w", err) 1490 1806 } 1491 1807 n, err := res.RowsAffected() 1492 1808 if err != nil { 1493 - return fmt.Errorf("set attestation rows: %v", err) 1809 + return fmt.Errorf("set 
attestation rows: %w", err) 1494 1810 } 1495 1811 if n == 0 { 1496 1812 return fmt.Errorf("domain %q not registered", domain) ··· 1519 1835 return nil, nil 1520 1836 } 1521 1837 if err != nil { 1522 - return nil, fmt.Errorf("get attestation state: %v", err) 1838 + return nil, fmt.Errorf("get attestation state: %w", err) 1523 1839 } 1524 1840 return &AttestationState{RKey: rkey, PublishedAt: parseTime(publishedAt)}, nil 1525 1841 } ··· 1567 1883 formatTime(r.ExpiresAt), formatTime(r.CreatedAt), 1568 1884 ) 1569 1885 if err != nil { 1570 - return fmt.Errorf("save oauth auth request: %v", err) 1886 + return fmt.Errorf("save oauth auth request: %w", err) 1571 1887 } 1572 1888 return nil 1573 1889 } ··· 1595 1911 return nil, nil 1596 1912 } 1597 1913 if err != nil { 1598 - return nil, fmt.Errorf("get oauth auth request: %v", err) 1914 + return nil, fmt.Errorf("get oauth auth request: %w", err) 1599 1915 } 1600 1916 r.ExpiresAt = parseTime(expiresAt) 1601 1917 r.CreatedAt = parseTime(createdAt) ··· 1622 1938 return "", fmt.Errorf("no pending request for request_uri") 1623 1939 } 1624 1940 if err != nil { 1625 - return "", fmt.Errorf("find state by request_uri: %v", err) 1941 + return "", fmt.Errorf("find state by request_uri: %w", err) 1626 1942 } 1627 1943 return state, nil 1628 1944 } ··· 1639 1955 accountDID, domain, string(attestation), formatTime(expiresAt), state, 1640 1956 ) 1641 1957 if err != nil { 1642 - return fmt.Errorf("augment oauth auth request: %v", err) 1958 + return fmt.Errorf("augment oauth auth request: %w", err) 1643 1959 } 1644 1960 n, err := res.RowsAffected() 1645 1961 if err != nil { 1646 - return fmt.Errorf("augment oauth rows: %v", err) 1962 + return fmt.Errorf("augment oauth rows: %w", err) 1647 1963 } 1648 1964 if n == 0 { 1649 1965 return fmt.Errorf("no pending row for state") ··· 1658 1974 `DELETE FROM oauth_auth_requests WHERE state = ?`, state, 1659 1975 ) 1660 1976 if err != nil { 1661 - return fmt.Errorf("delete oauth auth request: %v", err) 1977 + return fmt.Errorf("delete oauth auth request: %w", err) 1662 1978 } 1663 1979 return nil 1664 1980 } ··· 1711 2027 formatTime(sess.CreatedAt), formatTime(sess.UpdatedAt), 1712 2028 ) 1713 2029 if err != nil { 1714 - return fmt.Errorf("save oauth session: %v", err) 2030 + return fmt.Errorf("save oauth session: %w", err) 1715 2031 } 1716 2032 return nil 1717 2033 } ··· 1739 2055 return nil, nil 1740 2056 } 1741 2057 if err != nil { 1742 - return nil, fmt.Errorf("get oauth session: %v", err) 2058 + return nil, fmt.Errorf("get oauth session: %w", err) 1743 2059 } 1744 2060 if scopes != "" { 1745 2061 sess.Scopes = strings.Split(scopes, " ") ··· 1756 2072 did, sessionID, 1757 2073 ) 1758 2074 if err != nil { 1759 - return fmt.Errorf("delete oauth session: %v", err) 2075 + return fmt.Errorf("delete oauth session: %w", err) 1760 2076 } 1761 2077 return nil 1762 2078 } ··· 1769 2085 formatTime(now), 1770 2086 ) 1771 2087 if err != nil { 1772 - return 0, fmt.Errorf("cleanup expired oauth: %v", err) 2088 + return 0, fmt.Errorf("cleanup expired oauth: %w", err) 1773 2089 } 1774 2090 return res.RowsAffected() 1775 2091 } ··· 1809 2125 n.MemberDID, n.Action, n.Actor, n.Note, formatTime(reviewedAt), 1810 2126 ) 1811 2127 if err != nil { 1812 - return 0, fmt.Errorf("insert member review note: %v", err) 2128 + return 0, fmt.Errorf("insert member review note: %w", err) 1813 2129 } 1814 2130 return res.LastInsertId() 1815 2131 } ··· 1823 2139 did, 1824 2140 ) 1825 2141 if err != nil { 1826 - return nil, fmt.Errorf("list member review 
notes: %v", err) 2142 + return nil, fmt.Errorf("list member review notes: %w", err) 1827 2143 } 1828 2144 defer rows.Close() 1829 2145 ··· 1832 2148 var n MemberReviewNote 1833 2149 var reviewedAt string 1834 2150 if err := rows.Scan(&n.ID, &n.MemberDID, &n.Action, &n.Actor, &n.Note, &reviewedAt); err != nil { 1835 - return nil, fmt.Errorf("scan review note: %v", err) 2151 + return nil, fmt.Errorf("scan review note: %w", err) 1836 2152 } 1837 2153 n.ReviewedAt = parseTime(reviewedAt) 1838 2154 out = append(out, n) ··· 1852 2168 ReviewActionReactivated, formatTime(since), 1853 2169 ) 1854 2170 if err != nil { 1855 - return nil, fmt.Errorf("list reactivated dids: %v", err) 2171 + return nil, fmt.Errorf("list reactivated dids: %w", err) 1856 2172 } 1857 2173 defer rows.Close() 1858 2174 ··· 1883 2199 senderDID, formatTime(since), 1884 2200 ).Scan(&n) 1885 2201 if err != nil { 1886 - return 0, fmt.Errorf("count relay_rejected since: %v", err) 2202 + return 0, fmt.Errorf("count relay_rejected since: %w", err) 1887 2203 } 1888 2204 return n, nil 1889 2205 } ··· 1961 2277 formatTime(p.CreatedAt), formatTime(p.ExpiresAt), 1962 2278 ) 1963 2279 if err != nil { 1964 - return fmt.Errorf("create pending enrollment: %v", err) 2280 + return fmt.Errorf("create pending enrollment: %w", err) 1965 2281 } 1966 2282 return nil 1967 2283 } ··· 1983 2299 return nil, nil 1984 2300 } 1985 2301 if err != nil { 1986 - return nil, fmt.Errorf("get pending enrollment: %v", err) 2302 + return nil, fmt.Errorf("get pending enrollment: %w", err) 1987 2303 } 1988 2304 p.TermsAccepted = termsAccepted != 0 1989 2305 p.CreatedAt = parseTime(createdAt) ··· 1999 2315 `DELETE FROM pending_enrollments WHERE token = ?`, token, 2000 2316 ) 2001 2317 if err != nil { 2002 - return fmt.Errorf("delete pending enrollment: %v", err) 2318 + return fmt.Errorf("delete pending enrollment: %w", err) 2003 2319 } 2004 2320 return nil 2005 2321 } ··· 2013 2329 formatTime(cutoff), 2014 2330 ) 2015 2331 if err != nil { 2016 - return 0, fmt.Errorf("clean expired pending enrollments: %v", err) 2332 + return 0, fmt.Errorf("clean expired pending enrollments: %w", err) 2017 2333 } 2018 2334 return res.RowsAffected() 2019 2335 }
+117 -5
internal/relaystore/store_test.go
··· 4 4 5 5 import ( 6 6 "context" 7 + "fmt" 7 8 "testing" 8 9 "time" 9 10 ) ··· 739 740 } 740 741 741 742 // Add two 742 - if err := s.InsertBypassDID(ctx, "did:web:test1.example.com"); err != nil { 743 + if err := s.InsertBypassDID(ctx, "did:web:test1.example.com", time.Time{}, "test"); err != nil { 743 744 t.Fatalf("InsertBypassDID: %v", err) 744 745 } 745 - if err := s.InsertBypassDID(ctx, "did:web:test2.example.com"); err != nil { 746 + if err := s.InsertBypassDID(ctx, "did:web:test2.example.com", time.Time{}, "test"); err != nil { 746 747 t.Fatalf("InsertBypassDID: %v", err) 747 748 } 748 749 ··· 752 753 } 753 754 754 755 // Duplicate insert should be idempotent (no error) 755 - if err := s.InsertBypassDID(ctx, "did:web:test1.example.com"); err != nil { 756 + if err := s.InsertBypassDID(ctx, "did:web:test1.example.com", time.Time{}, "test"); err != nil { 756 757 t.Fatalf("duplicate InsertBypassDID should be idempotent: %v", err) 757 758 } 758 759 dids, _ = s.ListBypassDIDs(ctx) ··· 761 762 } 762 763 763 764 // Remove one 764 - if err := s.DeleteBypassDID(ctx, "did:web:test1.example.com"); err != nil { 765 + if err := s.DeleteBypassDID(ctx, "did:web:test1.example.com", "test"); err != nil { 765 766 t.Fatalf("DeleteBypassDID: %v", err) 766 767 } 767 768 dids, _ = s.ListBypassDIDs(ctx) ··· 773 774 } 774 775 775 776 // Remove non-existent (no error) 776 - if err := s.DeleteBypassDID(ctx, "did:web:nonexistent.com"); err != nil { 777 + if err := s.DeleteBypassDID(ctx, "did:web:nonexistent.com", "test"); err != nil { 777 778 t.Fatalf("DeleteBypassDID nonexistent: %v", err) 778 779 } 779 780 } ··· 1670 1671 } 1671 1672 if n != 2 { 1672 1673 t.Errorf("CountRelayRejectedSince = %d, want 2", n) 1674 + } 1675 + } 1676 + 1677 + // --- Daily send counts --- 1678 + 1679 + func TestGetDailySendCounts_ReturnsFixedLengthSlice(t *testing.T) { 1680 + s := testStore(t) 1681 + ctx := context.Background() 1682 + did := "did:plc:daily1111111111111111111" 1683 + insertTestMemberWithDomain(t, s, did, "daily.example.com") 1684 + 1685 + now := time.Now().UTC() 1686 + // Insert 3 messages on different days 1687 + for i := 0; i < 3; i++ { 1688 + _, err := s.InsertMessage(ctx, &Message{ 1689 + MemberDID: did, FromAddr: "x@daily.example.com", ToAddr: "y@z.com", 1690 + MessageID: fmt.Sprintf("<m%d>", i), Status: MsgSent, 1691 + CreatedAt: now.AddDate(0, 0, -(2 - i)), 1692 + }) 1693 + if err != nil { 1694 + t.Fatalf("insert: %v", err) 1695 + } 1696 + } 1697 + 1698 + daily, err := s.GetDailySendCounts(ctx, did, 14) 1699 + if err != nil { 1700 + t.Fatalf("GetDailySendCounts: %v", err) 1701 + } 1702 + if len(daily) != 14 { 1703 + t.Fatalf("expected 14 days, got %d", len(daily)) 1704 + } 1705 + // Last 3 days should have 1 each, earlier days 0 1706 + if daily[11] != 1 || daily[12] != 1 || daily[13] != 1 { 1707 + t.Errorf("unexpected distribution: %v", daily) 1708 + } 1709 + } 1710 + 1711 + func TestGetDailySendCounts_MemberIsolation(t *testing.T) { 1712 + s := testStore(t) 1713 + ctx := context.Background() 1714 + insertTestMemberWithDomain(t, s, "did:plc:a", "a.example.com") 1715 + insertTestMemberWithDomain(t, s, "did:plc:b", "b.example.com") 1716 + 1717 + now := time.Now().UTC() 1718 + _, _ = s.InsertMessage(ctx, &Message{MemberDID: "did:plc:a", FromAddr: "x@a.com", ToAddr: "y@z.com", MessageID: "<m>", Status: MsgSent, CreatedAt: now}) 1719 + 1720 + aDaily, _ := s.GetDailySendCounts(ctx, "did:plc:a", 14) 1721 + bDaily, _ := s.GetDailySendCounts(ctx, "did:plc:b", 14) 1722 + 1723 + if aDaily[13] != 1 { 1724 + 
t.Errorf("a expected 1 send today, got %d", aDaily[13]) 1725 + } 1726 + if bDaily[13] != 0 { 1727 + t.Errorf("b expected 0 sends today, got %d", bDaily[13]) 1728 + } 1729 + } 1730 + 1731 + // --- Complaint counts --- 1732 + 1733 + func TestGetComplaintCount(t *testing.T) { 1734 + s := testStore(t) 1735 + ctx := context.Background() 1736 + did := "did:plc:complaint1111111111111" 1737 + insertTestMemberWithDomain(t, s, did, "complaint.example.com") 1738 + 1739 + now := time.Now().UTC() 1740 + // Insert 2 complaints recently, 1 old 1741 + for i := 0; i < 2; i++ { 1742 + _, err := s.InsertFeedbackEvent(ctx, &FeedbackEvent{ 1743 + MemberDID: did, EventType: "complaint", 1744 + CreatedAt: now.Add(-time.Hour), 1745 + }) 1746 + if err != nil { 1747 + t.Fatalf("insert: %v", err) 1748 + } 1749 + } 1750 + _, _ = s.InsertFeedbackEvent(ctx, &FeedbackEvent{ 1751 + MemberDID: did, EventType: "complaint", 1752 + CreatedAt: now.Add(-48 * time.Hour), 1753 + }) 1754 + // A bounce should not count 1755 + _, _ = s.InsertFeedbackEvent(ctx, &FeedbackEvent{ 1756 + MemberDID: did, EventType: "bounce_hard", 1757 + CreatedAt: now.Add(-time.Hour), 1758 + }) 1759 + 1760 + n, err := s.GetComplaintCount(ctx, did, now.Add(-24*time.Hour)) 1761 + if err != nil { 1762 + t.Fatalf("GetComplaintCount: %v", err) 1763 + } 1764 + if n != 2 { 1765 + t.Errorf("expected 2 complaints, got %d", n) 1766 + } 1767 + } 1768 + 1769 + func TestGetComplaintCount_MemberIsolation(t *testing.T) { 1770 + s := testStore(t) 1771 + ctx := context.Background() 1772 + insertTestMemberWithDomain(t, s, "did:plc:a", "a.example.com") 1773 + insertTestMemberWithDomain(t, s, "did:plc:b", "b.example.com") 1774 + 1775 + now := time.Now().UTC() 1776 + _, _ = s.InsertFeedbackEvent(ctx, &FeedbackEvent{MemberDID: "did:plc:a", EventType: "complaint", CreatedAt: now}) 1777 + 1778 + aCount, _ := s.GetComplaintCount(ctx, "did:plc:a", now.Add(-time.Hour)) 1779 + bCount, _ := s.GetComplaintCount(ctx, "did:plc:b", now.Add(-time.Hour)) 1780 + if aCount != 1 { 1781 + t.Errorf("a expected 1, got %d", aCount) 1782 + } 1783 + if bCount != 0 { 1784 + t.Errorf("b expected 0, got %d", bCount) 1673 1785 } 1674 1786 } 1675 1787
+17 -17
internal/store/sqlite.go
··· 51 51 func New(dsn string) (*Store, error) { 52 52 db, err := sql.Open("sqlite", dsn) 53 53 if err != nil { 54 - return nil, fmt.Errorf("open sqlite: %v", err) 54 + return nil, fmt.Errorf("open sqlite: %w", err) 55 55 } 56 56 if _, err := db.Exec("PRAGMA journal_mode=WAL"); err != nil { 57 57 db.Close() 58 - return nil, fmt.Errorf("set WAL mode: %v", err) 58 + return nil, fmt.Errorf("set WAL mode: %w", err) 59 59 } 60 60 if _, err := db.Exec("PRAGMA busy_timeout = 5000"); err != nil { 61 61 db.Close() 62 - return nil, fmt.Errorf("set busy timeout: %v", err) 62 + return nil, fmt.Errorf("set busy timeout: %w", err) 63 63 } 64 64 if _, err := db.Exec("PRAGMA foreign_keys=ON"); err != nil { 65 65 db.Close() 66 - return nil, fmt.Errorf("enable foreign keys: %v", err) 66 + return nil, fmt.Errorf("enable foreign keys: %w", err) 67 67 } 68 68 s := &Store{db: db, notifyCh: make(chan struct{})} 69 69 if err := s.migrate(); err != nil { 70 70 db.Close() 71 - return nil, fmt.Errorf("migrate: %v", err) 71 + return nil, fmt.Errorf("migrate: %w", err) 72 72 } 73 73 return s, nil 74 74 } ··· 138 138 l.Src, l.URI, l.Val, l.Cts, boolToInt(l.Neg), l.Sig, l.RawCBOR, 139 139 ) 140 140 if err != nil { 141 - return 0, fmt.Errorf("insert label: %v", err) 141 + return 0, fmt.Errorf("insert label: %w", err) 142 142 } 143 143 s.notifyLabelInserted() 144 144 return res.LastInsertId() ··· 209 209 func (s *Store) queryLabels(ctx context.Context, query string, args []any) ([]Label, string, error) { 210 210 rows, err := s.db.QueryContext(ctx, query, args...) 211 211 if err != nil { 212 - return nil, "", fmt.Errorf("query labels: %v", err) 212 + return nil, "", fmt.Errorf("query labels: %w", err) 213 213 } 214 214 defer rows.Close() 215 215 ··· 219 219 var l Label 220 220 var neg int 221 221 if err := rows.Scan(&l.Seq, &l.Src, &l.URI, &l.Val, &l.Cts, &neg, &l.Sig, &l.RawCBOR); err != nil { 222 - return nil, "", fmt.Errorf("scan label: %v", err) 222 + return nil, "", fmt.Errorf("scan label: %w", err) 223 223 } 224 224 l.Neg = neg != 0 225 225 lastSeq = l.Seq ··· 238 238 func (s *Store) UpsertAttestation(ctx context.Context, a *Attestation) error { 239 239 selectors, err := json.Marshal(a.DKIMSelectors) 240 240 if err != nil { 241 - return fmt.Errorf("marshal selectors: %v", err) 241 + return fmt.Errorf("marshal selectors: %w", err) 242 242 } 243 243 _, err = s.db.ExecContext(ctx, 244 244 `INSERT INTO attestations (did, domain, rkey, dkim_selectors, relay_member, verified, last_verified, created_at) ··· 253 253 boolToInt(a.Verified), formatTime(a.LastVerified), formatTime(a.CreatedAt), 254 254 ) 255 255 if err != nil { 256 - return fmt.Errorf("upsert attestation: %v", err) 256 + return fmt.Errorf("upsert attestation: %w", err) 257 257 } 258 258 return nil 259 259 } ··· 291 291 FROM attestations WHERE did = ?`, did, 292 292 ) 293 293 if err != nil { 294 - return nil, fmt.Errorf("get attestations for DID: %v", err) 294 + return nil, fmt.Errorf("get attestations for DID: %w", err) 295 295 } 296 296 defer rows.Close() 297 297 ··· 311 311 `SELECT did, domain, rkey, dkim_selectors, relay_member, verified, last_verified, created_at FROM attestations`, 312 312 ) 313 313 if err != nil { 314 - return nil, fmt.Errorf("list attestations: %v", err) 314 + return nil, fmt.Errorf("list attestations: %w", err) 315 315 } 316 316 defer rows.Close() 317 317 ··· 350 350 return nil, nil 351 351 } 352 352 if err != nil { 353 - return nil, fmt.Errorf("scan attestation: %v", err) 353 + return nil, fmt.Errorf("scan attestation: %w", err) 354 354 
} 355 355 356 356 if err := json.Unmarshal([]byte(selJSON), &a.DKIMSelectors); err != nil { 357 - return nil, fmt.Errorf("unmarshal selectors: %v", err) 357 + return nil, fmt.Errorf("unmarshal selectors: %w", err) 358 358 } 359 359 a.RelayMember = relay != 0 360 360 a.Verified = verified != 0 ··· 369 369 var cursor int64 370 370 err := s.db.QueryRowContext(ctx, `SELECT cursor FROM cursor WHERE id = 1`).Scan(&cursor) 371 371 if err != nil { 372 - return 0, fmt.Errorf("get cursor: %v", err) 372 + return 0, fmt.Errorf("get cursor: %w", err) 373 373 } 374 374 return cursor, nil 375 375 } ··· 394 394 var st Stats 395 395 err := s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM labels`).Scan(&st.Labels) 396 396 if err != nil { 397 - return st, fmt.Errorf("count labels: %v", err) 397 + return st, fmt.Errorf("count labels: %w", err) 398 398 } 399 399 err = s.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM attestations`).Scan(&st.Attestations) 400 400 if err != nil { 401 - return st, fmt.Errorf("count attestations: %v", err) 401 + return st, fmt.Errorf("count attestations: %w", err) 402 402 } 403 403 return st, nil 404 404 }
+1 -1
osprey/tests/requirements.txt
··· 1 1 kafka-python==2.0.2 2 - PyYAML==6.0.1 2 + PyYAML==6.0.3
+4
relay-config.json.example
··· 5 5 6 6 // Inbound SMTP for bounce processing 7 7 "inboundAddr": ":25", 8 + // Per-source-IP rate limit on inbound MAIL FROM. Defaults shown below. 9 + // Set inboundRateLimitMsgsPerMinute to 0 to disable (legacy behavior). 10 + "inboundRateLimitMsgsPerMinute": 30, 11 + "inboundRateLimitBurst": 10, 8 12 9 13 // Admin API (Tailscale-only recommended) 10 14 "adminAddr": ":8080",
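For context on the two new keys: they describe a per-source-IP token bucket (refill rate in messages per minute, capacity equal to the burst). A minimal sketch of that shape using golang.org/x/time/rate — the map key, locking, and lack of idle-entry eviction are illustrative assumptions, not the relay's actual limiter:

```go
package sketch

import (
	"sync"

	"golang.org/x/time/rate"
)

// ipLimiter hands out one token bucket per source IP. The refill rate is
// msgs-per-minute converted to per-second; burst is the bucket capacity,
// so short spikes up to `burst` messages are tolerated.
type ipLimiter struct {
	mu       sync.Mutex
	limiters map[string]*rate.Limiter
	perSec   rate.Limit
	burst    int
}

func newIPLimiter(msgsPerMinute float64, burst int) *ipLimiter {
	if burst <= 0 {
		burst = 10 // mirrors the inboundRateLimitBurst default above
	}
	return &ipLimiter{
		limiters: make(map[string]*rate.Limiter),
		perSec:   rate.Limit(msgsPerMinute / 60.0),
		burst:    burst,
	}
}

// allow reports whether another MAIL FROM from this IP fits the budget.
// A zero or negative rate disables limiting, matching the config comment.
func (l *ipLimiter) allow(ip string) bool {
	if l.perSec <= 0 {
		return true
	}
	l.mu.Lock()
	lim, ok := l.limiters[ip]
	if !ok {
		lim = rate.NewLimiter(l.perSec, l.burst)
		l.limiters[ip] = lim
	}
	l.mu.Unlock()
	return lim.Allow()
}
```

A production limiter would also evict idle IPs so the map does not grow without bound; this sketch omits that for brevity.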
+1 -1
vendor/modules.txt
··· 6 6 # github.com/beorn7/perks v1.0.1 7 7 ## explicit; go 1.11 8 8 github.com/beorn7/perks/quantile 9 - # github.com/bluesky-social/indigo v0.0.0-20260417172304-7da09df6081d 9 + # github.com/bluesky-social/indigo v0.0.0-20260422192121-9bad73ca4cad 10 10 ## explicit; go 1.26 11 11 github.com/bluesky-social/indigo/atproto/atclient 12 12 github.com/bluesky-social/indigo/atproto/atcrypto