···11+package backfill
22+33+import (
44+ "github.com/prometheus/client_golang/prometheus"
55+ "github.com/prometheus/client_golang/prometheus/promauto"
66+)
77+88+var backfillJobsEnqueued = promauto.NewCounterVec(prometheus.CounterOpts{
99+ Name: "backfill_jobs_enqueued_total",
1010+ Help: "The total number of backfill jobs enqueued",
1111+}, []string{"backfiller_name"})
1212+1313+var backfillJobsProcessed = promauto.NewCounterVec(prometheus.CounterOpts{
1414+ Name: "backfill_jobs_processed_total",
1515+ Help: "The total number of backfill jobs processed",
1616+}, []string{"backfiller_name"})
1717+1818+var backfillRecordsProcessed = promauto.NewCounterVec(prometheus.CounterOpts{
1919+ Name: "backfill_records_processed_total",
2020+ Help: "The total number of backfill records processed",
2121+}, []string{"backfiller_name"})
2222+2323+var backfillOpsBuffered = promauto.NewGaugeVec(prometheus.GaugeOpts{
2424+ Name: "backfill_ops_buffered",
2525+ Help: "The number of backfill operations buffered",
2626+}, []string{"backfiller_name"})
2727+2828+var backfillBytesProcessed = promauto.NewCounterVec(prometheus.CounterOpts{
2929+ Name: "backfill_bytes_processed_total",
3030+ Help: "The total number of backfill bytes processed",
3131+}, []string{"backfiller_name"})
+33
backfill/util.go
···11+package backfill
22+33+import (
44+ "io"
55+66+ "github.com/prometheus/client_golang/prometheus"
77+)
88+99+type instrumentedReader struct {
1010+ source io.ReadCloser
1111+ counter prometheus.Counter
1212+}
1313+1414+func (r instrumentedReader) Read(b []byte) (int, error) {
1515+ n, err := r.source.Read(b)
1616+ r.counter.Add(float64(n))
1717+ return n, err
1818+}
1919+2020+func (r instrumentedReader) Close() error {
2121+ var buf [32]byte
2222+ var n int
2323+ var err error
2424+ for err == nil {
2525+ n, err = r.source.Read(buf[:])
2626+ r.counter.Add(float64(n))
2727+ }
2828+ closeerr := r.source.Close()
2929+ if err != nil && err != io.EOF {
3030+ return err
3131+ }
3232+ return closeerr
3333+}
+280
search/firehose.go
···11+package search
22+33+import (
44+ "bytes"
55+ "context"
66+ "fmt"
77+ "net/http"
88+ "strings"
99+1010+ comatproto "github.com/bluesky-social/indigo/api/atproto"
1111+ bsky "github.com/bluesky-social/indigo/api/bsky"
1212+ "github.com/bluesky-social/indigo/backfill"
1313+ "github.com/bluesky-social/indigo/events"
1414+ "github.com/bluesky-social/indigo/events/schedulers/autoscaling"
1515+ lexutil "github.com/bluesky-social/indigo/lex/util"
1616+ "github.com/bluesky-social/indigo/repo"
1717+ "github.com/bluesky-social/indigo/repomgr"
1818+ "github.com/gorilla/websocket"
1919+ "github.com/ipfs/go-cid"
2020+ typegen "github.com/whyrusleeping/cbor-gen"
2121+)
2222+2323+func (s *Server) getLastCursor() (int64, error) {
2424+ var lastSeq LastSeq
2525+ if err := s.db.Find(&lastSeq).Error; err != nil {
2626+ return 0, err
2727+ }
2828+2929+ if lastSeq.ID == 0 {
3030+ return 0, s.db.Create(&lastSeq).Error
3131+ }
3232+3333+ return lastSeq.Seq, nil
3434+}
3535+3636+func (s *Server) updateLastCursor(curs int64) error {
3737+ return s.db.Model(LastSeq{}).Where("id = 1").Update("seq", curs).Error
3838+}
3939+4040+func (s *Server) RunIndexer(ctx context.Context) error {
4141+ cur, err := s.getLastCursor()
4242+ if err != nil {
4343+ return fmt.Errorf("get last cursor: %w", err)
4444+ }
4545+4646+ err = s.bfs.LoadJobs(ctx)
4747+ if err != nil {
4848+ return fmt.Errorf("loading backfill jobs: %w", err)
4949+ }
5050+ s.bf.Start()
5151+5252+ d := websocket.DefaultDialer
5353+ con, _, err := d.Dial(fmt.Sprintf("%s/xrpc/com.atproto.sync.subscribeRepos?cursor=%d", s.bgshost, cur), http.Header{})
5454+ if err != nil {
5555+ return fmt.Errorf("events dial failed: %w", err)
5656+ }
5757+5858+ rsc := &events.RepoStreamCallbacks{
5959+ RepoCommit: func(evt *comatproto.SyncSubscribeRepos_Commit) error {
6060+ defer func() {
6161+ if evt.Seq%50 == 0 {
6262+ if err := s.updateLastCursor(evt.Seq); err != nil {
6363+ log.Error("Failed to update cursor: ", err)
6464+ }
6565+ }
6666+ }()
6767+ if evt.TooBig && evt.Prev != nil {
6868+ log.Errorf("skipping non-genesis too big events for now: %d", evt.Seq)
6969+ return nil
7070+ }
7171+7272+ if evt.TooBig {
7373+ if err := s.processTooBigCommit(ctx, evt); err != nil {
7474+ log.Errorf("failed to process tooBig event: %s", err)
7575+ return nil
7676+ }
7777+7878+ return nil
7979+ }
8080+8181+ r, err := repo.ReadRepoFromCar(ctx, bytes.NewReader(evt.Blocks))
8282+ if err != nil {
8383+ log.Errorf("reading repo from car (seq: %d, len: %d): %w", evt.Seq, len(evt.Blocks), err)
8484+ return nil
8585+ }
8686+8787+ for _, op := range evt.Ops {
8888+ ek := repomgr.EventKind(op.Action)
8989+ switch ek {
9090+ case repomgr.EvtKindCreateRecord, repomgr.EvtKindUpdateRecord:
9191+ rc, rec, err := r.GetRecord(ctx, op.Path)
9292+ if err != nil {
9393+ e := fmt.Errorf("getting record %s (%s) within seq %d for %s: %w", op.Path, *op.Cid, evt.Seq, evt.Repo, err)
9494+ log.Error(e)
9595+ return nil
9696+ }
9797+9898+ if lexutil.LexLink(rc) != *op.Cid {
9999+ log.Errorf("mismatch in record and op cid: %s != %s", rc, *op.Cid)
100100+ return nil
101101+ }
102102+103103+ if err := s.handleOp(ctx, ek, evt.Seq, op.Path, evt.Repo, &rc, rec); err != nil {
104104+ log.Errorf("failed to handle op: %s", err)
105105+ return nil
106106+ }
107107+108108+ case repomgr.EvtKindDeleteRecord:
109109+ if err := s.handleOp(ctx, ek, evt.Seq, op.Path, evt.Repo, nil, nil); err != nil {
110110+ log.Errorf("failed to handle delete: %s", err)
111111+ return nil
112112+ }
113113+ }
114114+ }
115115+116116+ return nil
117117+118118+ },
119119+ RepoHandle: func(evt *comatproto.SyncSubscribeRepos_Handle) error {
120120+ if err := s.updateUserHandle(ctx, evt.Did, evt.Handle); err != nil {
121121+ log.Errorf("failed to update user handle: %s", err)
122122+ }
123123+ return nil
124124+ },
125125+ }
126126+127127+ return events.HandleRepoStream(
128128+ ctx, con, autoscaling.NewScheduler(
129129+ autoscaling.DefaultAutoscaleSettings(),
130130+ s.bgshost,
131131+ rsc.EventHandler,
132132+ ),
133133+ )
134134+}
135135+136136+func (s *Server) handleCreateOrUpdate(ctx context.Context, did string, path string, recP *typegen.CBORMarshaler, rcid *cid.Cid) error {
137137+ // Since this gets called in a backfill job, we need to check if the path is a post or profile
138138+ if !strings.Contains(path, "app.bsky.feed.post") && !strings.Contains(path, "app.bsky.actor.profile") {
139139+ return nil
140140+ }
141141+142142+ u, err := s.getOrCreateUser(ctx, did)
143143+ if err != nil {
144144+ return fmt.Errorf("checking user: %w", err)
145145+ }
146146+ rec := *recP
147147+148148+ switch rec := rec.(type) {
149149+ case *bsky.FeedPost:
150150+ if err := s.indexPost(ctx, u, rec, path, *rcid); err != nil {
151151+ return fmt.Errorf("indexing post: %w", err)
152152+ }
153153+ case *bsky.ActorProfile:
154154+ if err := s.indexProfile(ctx, u, rec); err != nil {
155155+ return fmt.Errorf("indexing profile: %w", err)
156156+ }
157157+ default:
158158+ }
159159+ return nil
160160+}
161161+162162+func (s *Server) handleDelete(ctx context.Context, did string, path string) error {
163163+ // Since this gets called in a backfill job, we need to check if the path is a post or profile
164164+ if !strings.Contains(path, "app.bsky.feed.post") && !strings.Contains(path, "app.bsky.actor.profile") {
165165+ return nil
166166+ }
167167+168168+ u, err := s.getOrCreateUser(ctx, did)
169169+ if err != nil {
170170+ return err
171171+ }
172172+173173+ switch {
174174+ // TODO: handle profile deletes, its an edge case, but worth doing still
175175+ case strings.Contains(path, "app.bsky.feed.post"):
176176+ if err := s.deletePost(ctx, u, path); err != nil {
177177+ return err
178178+ }
179179+ }
180180+181181+ return nil
182182+}
183183+184184+func (s *Server) handleOp(ctx context.Context, op repomgr.EventKind, seq int64, path string, did string, rcid *cid.Cid, rec typegen.CBORMarshaler) error {
185185+ var err error
186186+ if !strings.Contains(path, "app.bsky.feed.post") && !strings.Contains(path, "app.bsky.actor.profile") {
187187+ return nil
188188+ }
189189+190190+ if op == repomgr.EvtKindCreateRecord || op == repomgr.EvtKindUpdateRecord {
191191+ log.Infof("handling create(%d): %s - %s", seq, did, path)
192192+193193+ // Try to buffer the op, if it fails, we need to create a backfill job
194194+ _, err := s.bfs.BufferOp(ctx, did, string(op), path, &rec, rcid)
195195+ if err == backfill.ErrJobNotFound {
196196+ log.Infof("no job found for repo %s, creating one", did)
197197+198198+ if err := s.bfs.EnqueueJob(did); err != nil {
199199+ return fmt.Errorf("enqueueing job: %w", err)
200200+ }
201201+202202+ // Try to buffer the op again so it gets picked up by the backfill job
203203+ _, err = s.bfs.BufferOp(ctx, did, string(op), path, &rec, rcid)
204204+ if err != nil {
205205+ return fmt.Errorf("buffering op: %w", err)
206206+ }
207207+ } else if err == backfill.ErrJobComplete {
208208+ // Backfill is done for this repo so we can just index it now
209209+ err = s.handleCreateOrUpdate(ctx, did, path, &rec, rcid)
210210+ }
211211+ } else if op == repomgr.EvtKindDeleteRecord {
212212+ log.Infof("handling delete(%d): %s - %s", seq, did, path)
213213+214214+ // Try to buffer the op, if it fails, we need to create a backfill job
215215+ _, err := s.bfs.BufferOp(ctx, did, string(op), path, &rec, rcid)
216216+ if err == backfill.ErrJobNotFound {
217217+ log.Infof("no job found for repo %s, creating one", did)
218218+219219+ if err := s.bfs.EnqueueJob(did); err != nil {
220220+ return fmt.Errorf("enqueueing job: %w", err)
221221+ }
222222+223223+ // Try to buffer the op again so it gets picked up by the backfill job
224224+ _, err = s.bfs.BufferOp(ctx, did, string(op), path, &rec, rcid)
225225+ if err != nil {
226226+ return fmt.Errorf("buffering op: %w", err)
227227+ }
228228+ } else if err == backfill.ErrJobComplete {
229229+ // Backfill is done for this repo so we can delete imemdiately
230230+ err = s.handleDelete(ctx, did, path)
231231+ }
232232+ }
233233+234234+ if err != nil {
235235+ return fmt.Errorf("failed to handle op: %w", err)
236236+ }
237237+238238+ return nil
239239+}
240240+241241+func (s *Server) processTooBigCommit(ctx context.Context, evt *comatproto.SyncSubscribeRepos_Commit) error {
242242+ repodata, err := comatproto.SyncGetRepo(ctx, s.bgsxrpc, evt.Repo, "", evt.Commit.String())
243243+ if err != nil {
244244+ return err
245245+ }
246246+247247+ r, err := repo.ReadRepoFromCar(ctx, bytes.NewReader(repodata))
248248+ if err != nil {
249249+ return err
250250+ }
251251+252252+ u, err := s.getOrCreateUser(ctx, evt.Repo)
253253+ if err != nil {
254254+ return err
255255+ }
256256+257257+ return r.ForEach(ctx, "", func(k string, v cid.Cid) error {
258258+ if strings.HasPrefix(k, "app.bsky.feed.post") || strings.HasPrefix(k, "app.bsky.actor.profile") {
259259+ rcid, rec, err := r.GetRecord(ctx, k)
260260+ if err != nil {
261261+ log.Errorf("failed to get record from repo checkout: %s", err)
262262+ return nil
263263+ }
264264+265265+ switch rec := rec.(type) {
266266+ case *bsky.FeedPost:
267267+ if err := s.indexPost(ctx, u, rec, k, rcid); err != nil {
268268+ return fmt.Errorf("indexing post: %w", err)
269269+ }
270270+ case *bsky.ActorProfile:
271271+ if err := s.indexProfile(ctx, u, rec); err != nil {
272272+ return fmt.Errorf("indexing profile: %w", err)
273273+ }
274274+ default:
275275+ }
276276+277277+ }
278278+ return nil
279279+ })
280280+}