···11+// Auto-Moderation rules engine for anti-spam and other moderation tasks.
22+//
33+// The code in this package includes an "engine" which processes atproto commit events (and identity updates), maintains caches and counters, and pushes moderation decisions to an external mod service (eg, appview). A framework for writing new "rules" for the engine to execute is also provided.
44+//
55+// It does not provide label API endpoints like queryLabels; see labelmaker for a self-contained labeling service.
66+//
77+// Code for subscribing to a firehose is not included here; see cmd/hepa for a complete service built on this library.
88+package automod
+99-3
automod/engine.go
···11package automod
2233import (
44+ "bytes"
45 "context"
66+ "fmt"
57 "log/slog"
88+ "strings"
69 "sync"
7101111+ comatproto "github.com/bluesky-social/indigo/api/atproto"
812 "github.com/bluesky-social/indigo/atproto/identity"
1313+ "github.com/bluesky-social/indigo/atproto/syntax"
1414+ lexutil "github.com/bluesky-social/indigo/lex/util"
1515+ "github.com/bluesky-social/indigo/repo"
1616+ "github.com/bluesky-social/indigo/repomgr"
917 "github.com/bluesky-social/indigo/xrpc"
1018)
1119···1927 CountStore CountStore
2028}
21292222-func (e *Engine) ExecuteIdentity() error {
3030+func (e *Engine) ProcessIdentityEvent(t string, did syntax.DID) error {
2331 ctx := context.Background()
24322533 // similar to an HTTP server, we want to recover any panics from rule execution
···3038 // TODO: circuit-break on repeated panics?
3139 }
3240 }()
4141+4242+ ident, err := e.Directory.LookupDID(ctx, did)
4343+ if err != nil {
4444+ return fmt.Errorf("resolving identity: %w", err)
4545+ }
4646+ if ident == nil {
4747+ return fmt.Errorf("identity not found for did: %s", did.String())
4848+ }
4949+5050+ evt := IdentityEvent{
5151+ Event{
5252+ Engine: e,
5353+ Account: AccountMeta{Identity: ident},
5454+ },
5555+ }
5656+ e.CallIdentityRules(&evt)
33573458 _ = ctx
3559 return nil
3660}
37613838-func (e *Engine) ExecuteCommit() error {
3939- ctx := context.Background()
6262+// this method takes a full firehose commit event. it must not be a tooBig
6363+func (e *Engine) ProcessCommit(ctx context.Context, commit *comatproto.SyncSubscribeRepos_Commit) error {
40644165 // similar to an HTTP server, we want to recover any panics from rule execution
4266 defer func() {
···4771 }
4872 }()
49737474+ r, err := repo.ReadRepoFromCar(ctx, bytes.NewReader(commit.Blocks))
7575+ if err != nil {
7676+ // TODO: handle this case (instead of return nil)
7777+ slog.Error("reading repo from car", "size_bytes", len(commit.Blocks), "err", err)
7878+ return nil
7979+ }
8080+8181+ did, err := syntax.ParseDID(commit.Repo)
8282+ if err != nil {
8383+ return fmt.Errorf("bad DID syntax in event: %w", err)
8484+ }
8585+8686+ ident, err := e.Directory.LookupDID(ctx, did)
8787+ if err != nil {
8888+ return fmt.Errorf("resolving identity: %w", err)
8989+ }
9090+ if ident == nil {
9191+ return fmt.Errorf("identity not found for did: %s", did.String())
9292+ }
9393+9494+ for _, op := range commit.Ops {
9595+ ek := repomgr.EventKind(op.Action)
9696+ logOp := slog.With("op_path", op.Path, "op_cid", op.Cid)
9797+ switch ek {
9898+ case repomgr.EvtKindCreateRecord:
9999+ rc, rec, err := r.GetRecord(ctx, op.Path)
100100+ if err != nil {
101101+ // TODO: handle this case (instead of return nil)
102102+ logOp.Error("fetching record from event CAR slice", "err", err)
103103+ return nil
104104+ }
105105+ if lexutil.LexLink(rc) != *op.Cid {
106106+ // TODO: handle this case (instead of return nil)
107107+ logOp.Error("mismatch in record and op cid", "record_cid", rc)
108108+ return nil
109109+ }
110110+111111+ if strings.HasPrefix(op.Path, "app.bsky.feed.post/") {
112112+ // TODO: handle as a PostEvent specially
113113+ } else {
114114+ // XXX: pass record in to event
115115+ _ = rec
116116+ evt := RecordEvent{
117117+ Event{
118118+ Engine: e,
119119+ Account: AccountMeta{Identity: ident},
120120+ },
121121+ []string{},
122122+ false,
123123+ []ModReport{},
124124+ []string{},
125125+ }
126126+ e.CallRecordRules(&evt)
127127+ // TODO persist
128128+ }
129129+ case repomgr.EvtKindUpdateRecord:
130130+ slog.Info("ignoring record update", "did", commit.Repo, "seq", commit.Seq, "path", op.Path)
131131+ return nil
132132+ case repomgr.EvtKindDeleteRecord:
133133+ slog.Info("ignoring record deletion", "did", commit.Repo, "seq", commit.Seq, "path", op.Path)
134134+ return nil
135135+ }
136136+ }
137137+50138 _ = ctx
139139+ return nil
140140+}
141141+142142+func (e *Engine) CallIdentityRules(evt *IdentityEvent) error {
143143+ return nil
144144+}
145145+146146+func (e *Engine) CallRecordRules(evt *RecordEvent) error {
51147 return nil
52148}
53149
+7-6
automod/event.go
···11111212// information about a repo/account/identity, always pre-populated and relevant to many rules
1313type AccountMeta struct {
1414- Identity identity.Identity
1414+ Identity *identity.Identity
1515 // TODO: createdAt / age
1616}
17171818// base type for events. events are both containers for data about the event itself (similar to an HTTP request type); aggregate results and state (counters, mod actions) to be persisted after all rules are run; and act as an API for additional network reads and operations.
1919type Event struct {
2020- engine Engine
2020+ Engine *Engine
2121 Err *error
2222 Account AccountMeta
2323 CounterIncrements []string
···2828}
29293030func (e *Event) CountTotal(key string) int {
3131- v, err := e.engine.GetCount(key, PeriodTotal)
3131+ v, err := e.Engine.GetCount(key, PeriodTotal)
3232 if err != nil {
3333 e.Err = &err
3434 return 0
···3737}
38383939func (e *Event) CountDay(key string) int {
4040- v, err := e.engine.GetCount(key, PeriodDay)
4040+ v, err := e.Engine.GetCount(key, PeriodDay)
4141 if err != nil {
4242 e.Err = &err
4343 return 0
···4646}
47474848func (e *Event) CountHour(key string) int {
4949- v, err := e.engine.GetCount(key, PeriodHour)
4949+ v, err := e.Engine.GetCount(key, PeriodHour)
5050 if err != nil {
5151 e.Err = &err
5252 return 0
···5555}
56565757func (e *Event) InSet(name, val string) bool {
5858- v, err := e.engine.InSet(name, val)
5858+ v, err := e.Engine.InSet(name, val)
5959 if err != nil {
6060 e.Err = &err
6161 return false
···89899090type RecordEvent struct {
9191 Event
9292+9293 RecordLabels []string
9394 RecordTakedown bool
9495 RecordReports []ModReport
+7
cmd/hepa/README.md
···11+22+HEPA
33+====
44+55+This is a simple auto-moderation daemon which wraps the automod package.
66+77+The name is a reference to HEPA air filters, which help keep the local atmosphere clean and healthy for humans.
+287
cmd/hepa/firehose.go
···11+package main
22+33+import (
44+ "bytes"
55+ "context"
66+ "fmt"
77+ "net/http"
88+ "net/url"
99+ "strings"
1010+ "time"
1111+1212+ comatproto "github.com/bluesky-social/indigo/api/atproto"
1313+ //bsky "github.com/bluesky-social/indigo/api/bsky"
1414+ "github.com/bluesky-social/indigo/atproto/syntax"
1515+ "github.com/bluesky-social/indigo/events"
1616+ "github.com/bluesky-social/indigo/events/schedulers/autoscaling"
1717+ "github.com/bluesky-social/indigo/repo"
1818+1919+ "github.com/carlmjohnson/versioninfo"
2020+ "github.com/gorilla/websocket"
2121+ "github.com/ipfs/go-cid"
2222+ typegen "github.com/whyrusleeping/cbor-gen"
2323+)
2424+2525+func (s *Server) getLastCursor() (int64, error) {
2626+ var lastSeq LastSeq
2727+ if err := s.db.Find(&lastSeq).Error; err != nil {
2828+ return 0, err
2929+ }
3030+3131+ if lastSeq.ID == 0 {
3232+ return 0, s.db.Create(&lastSeq).Error
3333+ }
3434+3535+ return lastSeq.Seq, nil
3636+}
3737+3838+func (s *Server) updateLastCursor(curs int64) error {
3939+ return s.db.Model(LastSeq{}).Where("id = 1").Update("seq", curs).Error
4040+}
4141+4242+func (s *Server) Run(ctx context.Context) error {
4343+ cur, err := s.getLastCursor()
4444+ if err != nil {
4545+ return fmt.Errorf("get last cursor: %w", err)
4646+ }
4747+4848+ err = s.bfs.LoadJobs(ctx)
4949+ if err != nil {
5050+ return fmt.Errorf("loading backfill jobs: %w", err)
5151+ }
5252+ go s.bf.Start()
5353+ go s.discoverRepos()
5454+5555+ d := websocket.DefaultDialer
5656+ u, err := url.Parse(s.bgshost)
5757+ if err != nil {
5858+ return fmt.Errorf("invalid bgshost URI: %w", err)
5959+ }
6060+ u.Path = "xrpc/com.atproto.sync.subscribeRepos"
6161+ if cur != 0 {
6262+ u.RawQuery = fmt.Sprintf("cursor=%d", cur)
6363+ }
6464+ con, _, err := d.Dial(u.String(), http.Header{
6565+ "User-Agent": []string{fmt.Sprintf("palomar/%s", versioninfo.Short())},
6666+ })
6767+ if err != nil {
6868+ return fmt.Errorf("events dial failed: %w", err)
6969+ }
7070+7171+ rsc := &events.RepoStreamCallbacks{
7272+ RepoCommit: func(evt *comatproto.SyncSubscribeRepos_Commit) error {
7373+ ctx := context.Background()
7474+ ctx, span := tracer.Start(ctx, "RepoCommit")
7575+ defer span.End()
7676+7777+ defer func() {
7878+ if evt.Seq%50 == 0 {
7979+ if err := s.updateLastCursor(evt.Seq); err != nil {
8080+ s.logger.Error("failed to persist cursor", "err", err)
8181+ }
8282+ }
8383+ }()
8484+ logEvt := s.logger.With("repo", evt.Repo, "rev", evt.Rev, "seq", evt.Seq)
8585+ if evt.TooBig && evt.Prev != nil {
8686+ // TODO: handle this case (instead of return nil)
8787+ logEvt.Error("skipping non-genesis tooBig events for now")
8888+ return nil
8989+ }
9090+9191+ if evt.TooBig {
9292+ if err := s.processTooBigCommit(ctx, evt); err != nil {
9393+ // TODO: handle this case (instead of return nil)
9494+ logEvt.Error("failed to process tooBig event", "err", err)
9595+ return nil
9696+ }
9797+9898+ return nil
9999+ }
100100+101101+ if !s.skipBackfill {
102102+ // Check if we've backfilled this repo, if not, we should enqueue it
103103+ job, err := s.bfs.GetJob(ctx, evt.Repo)
104104+ if job == nil && err == nil {
105105+ logEvt.Info("enqueueing backfill job for new repo")
106106+ if err := s.bfs.EnqueueJob(evt.Repo); err != nil {
107107+ logEvt.Warn("failed to enqueue backfill job", "err", err)
108108+ }
109109+ }
110110+ }
111111+112112+ if err = s.engine.ProcessCommit(ctx, evt); err != nil {
113113+ // TODO: handle this, instead of return nul
114114+ logEvt.Error("failed to process commit", "err", err)
115115+ return nil
116116+ }
117117+118118+ return nil
119119+120120+ },
121121+ RepoHandle: func(evt *comatproto.SyncSubscribeRepos_Handle) error {
122122+ ctx := context.Background()
123123+ ctx, span := tracer.Start(ctx, "RepoHandle")
124124+ defer span.End()
125125+126126+ did, err := syntax.ParseDID(evt.Did)
127127+ if err != nil {
128128+ s.logger.Error("bad DID in RepoHandle event", "did", evt.Did, "handle", evt.Handle, "seq", evt.Seq, "err", err)
129129+ return nil
130130+ }
131131+ if err := s.engine.ProcessIdentityEvent("handle", did); err != nil {
132132+ s.logger.Error("processing handle update failed", "did", evt.Did, "handle", evt.Handle, "seq", evt.Seq, "err", err)
133133+ }
134134+ return nil
135135+ },
136136+ }
137137+138138+ return events.HandleRepoStream(
139139+ ctx, con, autoscaling.NewScheduler(
140140+ autoscaling.DefaultAutoscaleSettings(),
141141+ s.bgshost,
142142+ rsc.EventHandler,
143143+ ),
144144+ )
145145+}
146146+147147+func (s *Server) discoverRepos() {
148148+ ctx := context.Background()
149149+ log := s.logger.With("func", "discoverRepos")
150150+ log.Info("starting repo discovery")
151151+152152+ cursor := ""
153153+ limit := int64(500)
154154+155155+ totalEnqueued := 0
156156+ totalSkipped := 0
157157+ totalErrored := 0
158158+159159+ for {
160160+ resp, err := comatproto.SyncListRepos(ctx, s.bgsxrpc, cursor, limit)
161161+ if err != nil {
162162+ log.Error("failed to list repos", "err", err)
163163+ time.Sleep(5 * time.Second)
164164+ continue
165165+ }
166166+ log.Info("got repo page", "count", len(resp.Repos), "cursor", resp.Cursor)
167167+ enqueued := 0
168168+ skipped := 0
169169+ errored := 0
170170+ for _, repo := range resp.Repos {
171171+ job, err := s.bfs.GetJob(ctx, repo.Did)
172172+ if job == nil && err == nil {
173173+ log.Info("enqueuing backfill job for new repo", "did", repo.Did)
174174+ if err := s.bfs.EnqueueJob(repo.Did); err != nil {
175175+ log.Warn("failed to enqueue backfill job", "err", err)
176176+ errored++
177177+ continue
178178+ }
179179+ enqueued++
180180+ } else if err != nil {
181181+ log.Warn("failed to get backfill job", "did", repo.Did, "err", err)
182182+ errored++
183183+ } else {
184184+ skipped++
185185+ }
186186+ }
187187+ log.Info("enqueued repos", "enqueued", enqueued, "skipped", skipped, "errored", errored)
188188+ totalEnqueued += enqueued
189189+ totalSkipped += skipped
190190+ totalErrored += errored
191191+ if resp.Cursor != nil && *resp.Cursor != "" {
192192+ cursor = *resp.Cursor
193193+ } else {
194194+ break
195195+ }
196196+ }
197197+198198+ log.Info("finished repo discovery", "totalEnqueued", totalEnqueued, "totalSkipped", totalSkipped, "totalErrored", totalErrored)
199199+}
200200+201201+func (s *Server) handleCreateOrUpdate(ctx context.Context, rawDID string, path string, recP *typegen.CBORMarshaler, rcid *cid.Cid) error {
202202+ // Since this gets called in a backfill job, we need to check if the path is a post or profile
203203+ if !strings.Contains(path, "app.bsky.feed.post") && !strings.Contains(path, "app.bsky.actor.profile") {
204204+ return nil
205205+ }
206206+207207+ did, err := syntax.ParseDID(rawDID)
208208+ if err != nil {
209209+ return fmt.Errorf("bad DID syntax in event: %w", err)
210210+ }
211211+212212+ ident, err := s.dir.LookupDID(ctx, did)
213213+ if err != nil {
214214+ return fmt.Errorf("resolving identity: %w", err)
215215+ }
216216+ if ident == nil {
217217+ return fmt.Errorf("identity not found for did: %s", did.String())
218218+ }
219219+ rec := *recP
220220+221221+ _ = rec
222222+ /* XXX:
223223+ switch rec := rec.(type) {
224224+ case *bsky.FeedPost:
225225+ // XXX: if err := s.indexPost(ctx, ident, rec, path, *rcid); err != nil {
226226+ _ = rec
227227+ if err := s.engine.ProcessCommit(ctx, evt); err != nil {
228228+ postsFailed.Inc()
229229+ return fmt.Errorf("processing post for %s: %w", did.String(), err)
230230+ }
231231+ postsIndexed.Inc()
232232+ case *bsky.ActorProfile:
233233+ // XXX: if err := s.indexProfile(ctx, ident, rec, path, *rcid); err != nil {
234234+ if err := s.engine.ProcessCommit(ctx, evt); err != nil {
235235+ profilesFailed.Inc()
236236+ return fmt.Errorf("processing profile for %s: %w", did.String(), err)
237237+ }
238238+ profilesIndexed.Inc()
239239+ default:
240240+ }
241241+ */
242242+ return nil
243243+}
244244+245245+func (s *Server) handleDelete(ctx context.Context, rawDID, path string) error {
246246+ // TODO: just ignoring for now
247247+ return nil
248248+}
249249+250250+func (s *Server) processTooBigCommit(ctx context.Context, evt *comatproto.SyncSubscribeRepos_Commit) error {
251251+ repodata, err := comatproto.SyncGetRepo(ctx, s.bgsxrpc, evt.Repo, "")
252252+ if err != nil {
253253+ return err
254254+ }
255255+256256+ r, err := repo.ReadRepoFromCar(ctx, bytes.NewReader(repodata))
257257+ if err != nil {
258258+ return err
259259+ }
260260+261261+ did, err := syntax.ParseDID(evt.Repo)
262262+ if err != nil {
263263+ return fmt.Errorf("bad DID in repo event: %w", err)
264264+ }
265265+266266+ return r.ForEach(ctx, "", func(k string, v cid.Cid) error {
267267+ if strings.HasPrefix(k, "app.bsky.feed.post") || strings.HasPrefix(k, "app.bsky.actor.profile") {
268268+ rcid, rec, err := r.GetRecord(ctx, k)
269269+ if err != nil {
270270+ // TODO: handle this case (instead of return nil)
271271+ s.logger.Error("failed to get record from repo checkout", "path", k, "err", err)
272272+ return nil
273273+ }
274274+275275+ // TODO: may want to treat this as a regular event?
276276+ _ = rcid
277277+ _ = did
278278+ _ = rec
279279+ /* XXX:
280280+ if err := s.engine.ProcessRecord(ctx, did, m, rec); err != nil {
281281+ return fmt.Errorf("processing record from tooBig commit: %w", err)
282282+ }
283283+ */
284284+ }
285285+ return nil
286286+ })
287287+}