this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Some new backfill package noodling for efficient network crawling

Jaz e15d79c6 5641d3c2

+1228
+483
backfill/next/backfill.go
··· 1 + package next 2 + 3 + import ( 4 + "context" 5 + "errors" 6 + "fmt" 7 + "log/slog" 8 + "net/http" 9 + "sync" 10 + "time" 11 + 12 + "github.com/bluesky-social/indigo/repo" 13 + 14 + "github.com/ipfs/go-cid" 15 + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" 16 + "go.opentelemetry.io/otel" 17 + "golang.org/x/time/rate" 18 + ) 19 + 20 + // Job is an interface for a backfill job 21 + type Job interface { 22 + Repo() string 23 + State() string 24 + SetState(ctx context.Context, state string) error 25 + RetryCount() int 26 + } 27 + 28 + // Store is an interface for a backfill store which holds Jobs 29 + type Store interface { 30 + GetJob(ctx context.Context, repo string) (Job, error) 31 + GetNextEnqueuedJob(ctx context.Context, pds string) (Job, error) 32 + EnqueueJob(ctx context.Context, pds, repo string) error 33 + } 34 + 35 + var ( 36 + // StateEnqueued is the state of a backfill job when it is first created 37 + StateEnqueued = "enqueued" 38 + // StateInProgress is the state of a backfill job when it is being processed 39 + StateInProgress = "in_progress" 40 + // StateComplete is the state of a backfill job when it has been processed 41 + StateComplete = "complete" 42 + ) 43 + 44 + // ErrJobComplete is returned when trying to buffer an op for a job that is complete 45 + var ErrJobComplete = errors.New("job is complete") 46 + 47 + // ErrJobNotFound is returned when trying to buffer an op for a job that doesn't exist 48 + var ErrJobNotFound = errors.New("job not found") 49 + 50 + var tracer = otel.Tracer("backfiller") 51 + 52 + // A Backfiller is a generic backfiller that can handle concurrent backfill jobs across multiple PDS instances. 53 + type Backfiller struct { 54 + Name string 55 + 56 + HandleCreateRecord func(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error 57 + Store Store 58 + 59 + perPDSBackfillConcurrency int 60 + perPDSSyncsPerSecond int 61 + globalRecordCreateConcurrency int 62 + 63 + globalRecordCreationLimiter *rate.Limiter 64 + 65 + NSIDFilter string 66 + 67 + pdsBackfillers map[string]*PDSBackfiller 68 + lk sync.Mutex 69 + 70 + stop chan struct{} 71 + } 72 + 73 + type BackfillerOptions struct { 74 + PerPDSBackfillConcurrency int 75 + PerPDSSyncsPerSecond int 76 + GlobalRecordCreateConcurrency int 77 + NSIDFilter string 78 + Client *http.Client 79 + } 80 + 81 + func DefaultBackfillerOptions() *BackfillerOptions { 82 + return &BackfillerOptions{ 83 + PerPDSBackfillConcurrency: 10, 84 + GlobalRecordCreateConcurrency: 100, 85 + NSIDFilter: "", 86 + PerPDSSyncsPerSecond: 2, 87 + } 88 + } 89 + 90 + func NewBackfiller( 91 + name string, 92 + store Store, 93 + handleCreate func(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error, 94 + opts *BackfillerOptions, 95 + ) *Backfiller { 96 + if opts == nil { 97 + opts = DefaultBackfillerOptions() 98 + } 99 + 100 + return &Backfiller{ 101 + Name: name, 102 + Store: store, 103 + HandleCreateRecord: handleCreate, 104 + perPDSBackfillConcurrency: opts.PerPDSBackfillConcurrency, 105 + perPDSSyncsPerSecond: opts.PerPDSSyncsPerSecond, 106 + globalRecordCreateConcurrency: opts.GlobalRecordCreateConcurrency, 107 + globalRecordCreationLimiter: rate.NewLimiter(rate.Limit(opts.GlobalRecordCreateConcurrency), opts.GlobalRecordCreateConcurrency), 108 + NSIDFilter: opts.NSIDFilter, 109 + pdsBackfillers: make(map[string]*PDSBackfiller), 110 + stop: make(chan struct{}), 111 + } 112 + } 113 + 114 + func (b *Backfiller) EnqueueJob(ctx context.Context, pds, repo string) error { 115 + log := slog.With("component", "backfiller", "name", b.Name, "pds", pds, "repo", repo) 116 + log.Info("enqueueing backfill job") 117 + 118 + if err := b.Store.EnqueueJob(ctx, pds, repo); err != nil { 119 + log.Error("failed to enqueue backfill job", "error", err) 120 + return err 121 + } 122 + 123 + // Check if we already have a backfiller for this PDS 124 + b.lk.Lock() 125 + defer b.lk.Unlock() 126 + if _, exists := b.pdsBackfillers[pds]; !exists { 127 + log.Info("creating new PDS backfiller", "pds", pds) 128 + opts := DefaultPDSBackfillerOptions() 129 + opts.ParallelBackfills = b.perPDSBackfillConcurrency 130 + opts.SyncRequestsPerSecond = b.perPDSSyncsPerSecond 131 + opts.RecordCreateLimiter = b.globalRecordCreationLimiter 132 + opts.NSIDFilter = b.NSIDFilter 133 + 134 + pdsBackfiller := NewPDSBackfiller(pds, pds, b.Store, b.HandleCreateRecord, opts) 135 + b.pdsBackfillers[pds] = pdsBackfiller 136 + pdsBackfiller.Start() 137 + } 138 + backfillJobsEnqueued.WithLabelValues(b.Name).Inc() 139 + log.Info("backfill job enqueued successfully") 140 + return nil 141 + } 142 + 143 + func (b *Backfiller) Shutdown(ctx context.Context) error { 144 + log := slog.With("component", "backfiller", "name", b.Name) 145 + log.Info("shutting down backfiller") 146 + close(b.stop) 147 + b.lk.Lock() 148 + defer b.lk.Unlock() 149 + // Concurrently stop all PDS backfillers 150 + var wg sync.WaitGroup 151 + for _, pdsBackfiller := range b.pdsBackfillers { 152 + wg.Add(1) 153 + go func(pds *PDSBackfiller) { 154 + defer wg.Done() 155 + pds.Stop(ctx) 156 + }(pdsBackfiller) 157 + } 158 + wg.Wait() 159 + log.Info("all PDS backfillers stopped") 160 + return nil 161 + } 162 + 163 + type PDSBackfiller struct { 164 + Name string 165 + Hostname string 166 + client *http.Client 167 + 168 + HandleCreateRecord func(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error 169 + Store Store 170 + 171 + backfillConcurrency int 172 + syncLimiter *rate.Limiter 173 + 174 + recordCreateConcurrency int 175 + recordCreateLimiter *rate.Limiter 176 + 177 + NSIDFilter string 178 + 179 + wg sync.WaitGroup 180 + stop chan struct{} 181 + } 182 + 183 + type PDSBackfillerOptions struct { 184 + ParallelBackfills int 185 + ParallelRecordCreates int 186 + RecordCreateLimiter *rate.Limiter 187 + NSIDFilter string 188 + SyncRequestsPerSecond int 189 + Client *http.Client 190 + } 191 + 192 + func DefaultPDSBackfillerOptions() *PDSBackfillerOptions { 193 + return &PDSBackfillerOptions{ 194 + ParallelBackfills: 10, 195 + ParallelRecordCreates: 100, 196 + RecordCreateLimiter: rate.NewLimiter(rate.Limit(100), 100), 197 + NSIDFilter: "", 198 + SyncRequestsPerSecond: 2, 199 + Client: &http.Client{ 200 + Transport: otelhttp.NewTransport(http.DefaultTransport), 201 + Timeout: 600 * time.Second, 202 + }, 203 + } 204 + } 205 + 206 + // NewPDSBackfiller creates a new backfiller for a single PDS instance 207 + func NewPDSBackfiller( 208 + name string, 209 + hostname string, 210 + store Store, 211 + handleCreate func(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error, 212 + opts *PDSBackfillerOptions, 213 + ) *PDSBackfiller { 214 + if opts == nil { 215 + opts = DefaultPDSBackfillerOptions() 216 + } 217 + 218 + return &PDSBackfiller{ 219 + Name: name, 220 + Hostname: hostname, 221 + Store: store, 222 + HandleCreateRecord: handleCreate, 223 + backfillConcurrency: opts.ParallelBackfills, 224 + recordCreateConcurrency: opts.ParallelRecordCreates, 225 + NSIDFilter: opts.NSIDFilter, 226 + syncLimiter: rate.NewLimiter(rate.Limit(opts.SyncRequestsPerSecond), opts.SyncRequestsPerSecond), 227 + recordCreateLimiter: opts.RecordCreateLimiter, 228 + stop: make(chan struct{}), 229 + client: opts.Client, 230 + wg: sync.WaitGroup{}, 231 + } 232 + } 233 + 234 + // Start starts the backfill processor routine 235 + func (b *PDSBackfiller) Start() { 236 + ctx := context.Background() 237 + 238 + log := slog.With("component", "backfiller", "name", b.Name, "hostname", b.Hostname) 239 + log.Info("starting backfill processor for PDS", "hostname", b.Hostname) 240 + 241 + // Start a producer to enqueue jobs 242 + jobs := make(chan Job) 243 + 244 + b.wg.Add(1) 245 + go func() { 246 + defer b.wg.Done() 247 + defer close(jobs) 248 + log := log.With("subcomponent", "producer") 249 + for { 250 + select { 251 + case <-b.stop: 252 + log.Info("stopping backfill producer") 253 + return 254 + default: 255 + } 256 + 257 + // Get the next job 258 + job, err := b.Store.GetNextEnqueuedJob(ctx, b.Hostname) 259 + if err != nil { 260 + log.Error("failed to get next enqueued job", "error", err) 261 + time.Sleep(1 * time.Second) 262 + continue 263 + } else if job == nil { 264 + time.Sleep(1 * time.Second) 265 + continue 266 + } 267 + jobs <- job 268 + } 269 + }() 270 + 271 + // Start the worker processes 272 + for i := 0; i < b.backfillConcurrency; i++ { 273 + b.wg.Add(1) 274 + go func() { 275 + defer b.wg.Done() 276 + log := log.With("subcomponent", "worker", "worker_id", i) 277 + for job := range jobs { 278 + select { 279 + case <-b.stop: 280 + log.Info("stopping backfill worker") 281 + return 282 + default: 283 + } 284 + 285 + log := log.With("job", job.Repo(), "state", job.State()) 286 + log.Info("processing backfill job") 287 + 288 + if err := job.SetState(ctx, StateInProgress); err != nil { 289 + log.Error("failed to set job state to in_progress", "error", err) 290 + continue 291 + } 292 + 293 + newState, err := b.BackfillRepo(ctx, job) 294 + if err != nil { 295 + log.Error("failed to backfill repo", "error", err) 296 + } else { 297 + log.Info("backfill job completed successfully", "new_state", newState) 298 + } 299 + 300 + if err := job.SetState(ctx, newState); err != nil { 301 + log.Error("failed to set job completion state", "error", err) 302 + continue 303 + } 304 + 305 + backfillJobsProcessed.WithLabelValues(b.Name).Inc() 306 + } 307 + }() 308 + } 309 + } 310 + 311 + // Stop stops the backfill processor 312 + func (b *PDSBackfiller) Stop(ctx context.Context) { 313 + log := slog.With("source", "backfiller", "name", b.Name, "hostname", b.Hostname) 314 + log.Info("stopping PDS backfiller") 315 + close(b.stop) 316 + b.wg.Wait() 317 + log.Info("PDS backfiller stopped") 318 + } 319 + 320 + type recordQueueItem struct { 321 + recordPath string 322 + nodeCid cid.Cid 323 + } 324 + 325 + type recordResult struct { 326 + recordPath string 327 + err error 328 + } 329 + 330 + type FetchRepoError struct { 331 + StatusCode int 332 + Status string 333 + } 334 + 335 + func (e *FetchRepoError) Error() string { 336 + reason := "unknown error" 337 + if e.StatusCode == http.StatusBadRequest { 338 + reason = "repo not found" 339 + } else { 340 + reason = e.Status 341 + } 342 + return fmt.Sprintf("failed to get repo: %s (%d)", reason, e.StatusCode) 343 + } 344 + 345 + // BackfillRepo backfills a repo 346 + func (b *PDSBackfiller) BackfillRepo(ctx context.Context, job Job) (string, error) { 347 + ctx, span := tracer.Start(ctx, "BackfillRepo") 348 + defer span.End() 349 + 350 + start := time.Now() 351 + 352 + repoDID := job.Repo() 353 + 354 + log := slog.With("source", "backfiller_backfill_repo", "repo", repoDID) 355 + if job.RetryCount() > 0 { 356 + log = log.With("retry_count", job.RetryCount()) 357 + } 358 + log.Info(fmt.Sprintf("processing backfill for %s", repoDID)) 359 + 360 + r, err := b.fetchRepo(ctx, repoDID, b.Hostname) 361 + if err != nil { 362 + slog.Warn("repo CAR fetch from PDS failed", "did", repoDID, "pds", b.Hostname, "err", err) 363 + rfe, ok := err.(*FetchRepoError) 364 + if ok { 365 + return fmt.Sprintf("failed to fetch repo CAR from PDS (http %d:%s)", rfe.StatusCode, rfe.Status), err 366 + } 367 + return "failed to fetch repo CAR from PDS", err 368 + } 369 + 370 + numRecords := 0 371 + numRoutines := b.recordCreateConcurrency 372 + recordQueue := make(chan recordQueueItem, numRoutines) 373 + recordResults := make(chan recordResult, numRoutines) 374 + 375 + // Producer routine 376 + go func() { 377 + defer close(recordQueue) 378 + if err := r.ForEach(ctx, b.NSIDFilter, func(recordPath string, nodeCid cid.Cid) error { 379 + numRecords++ 380 + recordQueue <- recordQueueItem{recordPath: recordPath, nodeCid: nodeCid} 381 + return nil 382 + }); err != nil { 383 + log.Error("failed to iterate records in repo", "err", err) 384 + } 385 + }() 386 + 387 + rev := r.SignedCommit().Rev 388 + 389 + // Consumer routines 390 + wg := sync.WaitGroup{} 391 + for i := 0; i < numRoutines; i++ { 392 + wg.Add(1) 393 + go func() { 394 + defer wg.Done() 395 + for item := range recordQueue { 396 + blk, err := r.Blockstore().Get(ctx, item.nodeCid) 397 + if err != nil { 398 + recordResults <- recordResult{recordPath: item.recordPath, err: fmt.Errorf("failed to get blocks for record: %w", err)} 399 + continue 400 + } 401 + 402 + raw := blk.RawData() 403 + 404 + if err := b.recordCreateLimiter.Wait(ctx); err != nil { 405 + recordResults <- recordResult{recordPath: item.recordPath, err: fmt.Errorf("failed to wait for record create limiter: %w", err)} 406 + break 407 + } 408 + 409 + err = b.HandleCreateRecord(ctx, repoDID, rev, item.recordPath, &raw, &item.nodeCid) 410 + if err != nil { 411 + recordResults <- recordResult{recordPath: item.recordPath, err: fmt.Errorf("failed to handle create record: %w", err)} 412 + continue 413 + } 414 + 415 + backfillRecordsProcessed.WithLabelValues(b.Name).Inc() 416 + recordResults <- recordResult{recordPath: item.recordPath, err: err} 417 + } 418 + }() 419 + } 420 + 421 + resultWG := sync.WaitGroup{} 422 + resultWG.Add(1) 423 + // Handle results 424 + go func() { 425 + defer resultWG.Done() 426 + for result := range recordResults { 427 + if result.err != nil { 428 + log.Error("Error processing record", "record", result.recordPath, "error", result.err) 429 + } 430 + } 431 + }() 432 + 433 + wg.Wait() 434 + close(recordResults) 435 + resultWG.Wait() 436 + 437 + log.Info("backfill complete", 438 + "records_backfilled", numRecords, 439 + "duration", time.Since(start), 440 + ) 441 + 442 + return StateComplete, nil 443 + } 444 + 445 + // Fetches a repo CAR file over HTTP from the indicated host. If successful, parses the CAR and returns repo.Repo 446 + func (b *PDSBackfiller) fetchRepo(ctx context.Context, did, host string) (*repo.Repo, error) { 447 + url := fmt.Sprintf("https://%s/xrpc/com.atproto.sync.getRepo?did=%s", host, did) 448 + 449 + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) 450 + if err != nil { 451 + return nil, fmt.Errorf("failed to create request: %w", err) 452 + } 453 + 454 + req.Header.Set("Accept", "application/vnd.ipld.car") 455 + req.Header.Set("User-Agent", fmt.Sprintf("atproto-backfill-%s/2.0.0", b.Name)) 456 + 457 + b.syncLimiter.Wait(ctx) 458 + 459 + resp, err := b.client.Do(req) 460 + if err != nil { 461 + return nil, fmt.Errorf("failed to send request: %w", err) 462 + } 463 + 464 + if resp.StatusCode != http.StatusOK { 465 + return nil, &FetchRepoError{ 466 + StatusCode: resp.StatusCode, 467 + Status: resp.Status, 468 + } 469 + } 470 + 471 + instrumentedReader := instrumentedReader{ 472 + source: resp.Body, 473 + counter: backfillBytesProcessed.WithLabelValues(b.Name), 474 + } 475 + 476 + defer instrumentedReader.Close() 477 + 478 + repo, err := repo.ReadRepoFromCar(ctx, instrumentedReader) 479 + if err != nil { 480 + return nil, fmt.Errorf("failed to parse repo from CAR file: %w", err) 481 + } 482 + return repo, nil 483 + }
+374
backfill/next/gormstore.go
··· 1 + package next 2 + 3 + import ( 4 + "context" 5 + "errors" 6 + "fmt" 7 + "strings" 8 + "sync" 9 + "time" 10 + 11 + "gorm.io/gorm" 12 + ) 13 + 14 + type Gormjob struct { 15 + repo string 16 + pds string 17 + state string 18 + rev string 19 + 20 + lk sync.Mutex 21 + 22 + dbj *GormDBJob 23 + db *gorm.DB 24 + 25 + createdAt time.Time 26 + updatedAt time.Time 27 + 28 + retryCount int 29 + retryAfter *time.Time 30 + } 31 + 32 + type GormDBJob struct { 33 + gorm.Model 34 + Repo string `gorm:"unique;index"` 35 + PDS string `gorm:"index;index:enqueued_pds_job_idx;index:retryable_pds_job_idx"` 36 + State string `gorm:"index:enqueued_pds_job_idx,where:state = 'enqueued';index:retryable_pds_job_idx,where:state like 'failed%'"` 37 + Rev string 38 + RetryCount int 39 + RetryAfter *time.Time `gorm:"index:retryable_pds_job_idx,sort:desc"` 40 + } 41 + 42 + type queue struct { 43 + qlk sync.Mutex 44 + taskQueue []string 45 + } 46 + 47 + // Gormstore is a gorm-backed implementation of the Backfill Store interface 48 + type Gormstore struct { 49 + lk sync.RWMutex 50 + jobs map[string]*Gormjob 51 + 52 + pdsQueues map[string]*queue 53 + 54 + db *gorm.DB 55 + } 56 + 57 + func NewGormstore(db *gorm.DB) *Gormstore { 58 + return &Gormstore{ 59 + jobs: make(map[string]*Gormjob), 60 + pdsQueues: make(map[string]*queue), 61 + db: db, 62 + } 63 + } 64 + 65 + func (s *Gormstore) LoadJobs(ctx context.Context) error { 66 + s.lk.Lock() 67 + defer s.lk.Unlock() 68 + return s.loadJobs(ctx, 20_000) 69 + } 70 + 71 + type todoJob struct { 72 + pds string 73 + repo string 74 + } 75 + 76 + func (s *Gormstore) loadJobs(ctx context.Context, limit int) error { 77 + enqueuedIndexClause := "" 78 + retryableIndexClause := "" 79 + 80 + // If the DB is a SQLite DB, we can use INDEXED BY to speed up the query 81 + if s.db.Dialector.Name() == "sqlite" { 82 + enqueuedIndexClause = "INDEXED BY enqueued_pds_job_idx" 83 + retryableIndexClause = "INDEXED BY retryable_pds_job_idx" 84 + } 85 + 86 + enqueuedSelect := fmt.Sprintf(`SELECT pds, repo FROM gorm_db_jobs %s WHERE state = 'enqueued' LIMIT ?`, enqueuedIndexClause) 87 + retryableSelect := fmt.Sprintf(`SELECT pds, repo FROM gorm_db_jobs %s WHERE state like 'failed%%' AND (retry_after = NULL OR retry_after < ?) LIMIT ?`, retryableIndexClause) 88 + 89 + todoJobs := make([]todoJob, 0, limit) 90 + if err := s.db.Raw(enqueuedSelect, limit).Scan(&todoJobs).Error; err != nil { 91 + return err 92 + } 93 + 94 + if len(todoJobs) < limit { 95 + moreTodo := make([]todoJob, 0, limit-len(todoJobs)) 96 + if err := s.db.Raw(retryableSelect, time.Now(), limit-len(todoJobs)).Scan(&moreTodo).Error; err != nil { 97 + return err 98 + } 99 + todoJobs = append(todoJobs, moreTodo...) 100 + } 101 + 102 + for _, job := range todoJobs { 103 + if pdsQueue, ok := s.pdsQueues[job.pds]; ok { 104 + pdsQueue.qlk.Lock() 105 + pdsQueue.taskQueue = append(pdsQueue.taskQueue, job.repo) 106 + pdsQueue.qlk.Unlock() 107 + } else { 108 + s.pdsQueues[job.pds] = &queue{ 109 + taskQueue: []string{job.repo}, 110 + } 111 + } 112 + } 113 + 114 + return nil 115 + } 116 + 117 + func (s *Gormstore) GetOrCreateJob(ctx context.Context, pds, repo, state string) (Job, error) { 118 + j, err := s.getJob(ctx, repo) 119 + if err == nil { 120 + return j, nil 121 + } 122 + 123 + if !errors.Is(err, ErrJobNotFound) { 124 + return nil, err 125 + } 126 + 127 + if err := s.createJobForRepo(pds, repo, state); err != nil { 128 + return nil, err 129 + } 130 + 131 + return s.getJob(ctx, repo) 132 + } 133 + 134 + func (s *Gormstore) EnqueueJob(ctx context.Context, pds, repo string) error { 135 + _, err := s.GetOrCreateJob(ctx, pds, repo, StateEnqueued) 136 + if err != nil { 137 + return err 138 + } 139 + 140 + // Add the job to the task queue for the PDS 141 + s.lk.Lock() 142 + pdsQueue, ok := s.pdsQueues[pds] 143 + if !ok { 144 + pdsQueue = &queue{ 145 + taskQueue: []string{repo}, 146 + } 147 + s.pdsQueues[pds] = pdsQueue 148 + s.lk.Unlock() 149 + } else { 150 + s.lk.Unlock() 151 + pdsQueue.qlk.Lock() 152 + pdsQueue.taskQueue = append(pdsQueue.taskQueue, repo) 153 + pdsQueue.qlk.Unlock() 154 + } 155 + 156 + return nil 157 + } 158 + 159 + func (s *Gormstore) EnqueueJobWithState(ctx context.Context, pds, repo, state string) error { 160 + _, err := s.GetOrCreateJob(ctx, pds, repo, state) 161 + if err != nil { 162 + return err 163 + } 164 + 165 + // Add the job to the task queue for the PDS 166 + s.lk.Lock() 167 + pdsQueue, ok := s.pdsQueues[pds] 168 + if !ok { 169 + pdsQueue = &queue{ 170 + taskQueue: []string{repo}, 171 + } 172 + s.pdsQueues[pds] = pdsQueue 173 + s.lk.Unlock() 174 + } else { 175 + s.lk.Unlock() 176 + pdsQueue.qlk.Lock() 177 + pdsQueue.taskQueue = append(pdsQueue.taskQueue, repo) 178 + pdsQueue.qlk.Unlock() 179 + } 180 + 181 + return nil 182 + } 183 + 184 + func (s *Gormstore) createJobForRepo(pds, repo, state string) error { 185 + dbj := &GormDBJob{ 186 + Repo: repo, 187 + PDS: pds, 188 + State: state, 189 + } 190 + if err := s.db.Create(dbj).Error; err != nil { 191 + if errors.Is(err, gorm.ErrDuplicatedKey) { 192 + return nil 193 + } 194 + return err 195 + } 196 + 197 + s.lk.Lock() 198 + defer s.lk.Unlock() 199 + 200 + // Convert it to an in-memory job 201 + if _, ok := s.jobs[repo]; ok { 202 + // The DB create should have errored if the job already existed, but just in case 203 + return fmt.Errorf("job already exists for repo %s", repo) 204 + } 205 + 206 + j := &Gormjob{ 207 + repo: repo, 208 + pds: pds, 209 + createdAt: time.Now(), 210 + updatedAt: time.Now(), 211 + state: state, 212 + 213 + dbj: dbj, 214 + db: s.db, 215 + } 216 + s.jobs[repo] = j 217 + 218 + return nil 219 + } 220 + 221 + func (s *Gormstore) GetJob(ctx context.Context, repo string) (Job, error) { 222 + return s.getJob(ctx, repo) 223 + } 224 + 225 + func (s *Gormstore) getJob(ctx context.Context, repo string) (*Gormjob, error) { 226 + cj := s.checkJobCache(ctx, repo) 227 + if cj != nil { 228 + return cj, nil 229 + } 230 + 231 + return s.loadJob(ctx, repo) 232 + } 233 + 234 + func (s *Gormstore) loadJob(ctx context.Context, repo string) (*Gormjob, error) { 235 + var dbj GormDBJob 236 + if err := s.db.Find(&dbj, "repo = ?", repo).Error; err != nil { 237 + return nil, err 238 + } 239 + 240 + if dbj.ID == 0 { 241 + return nil, ErrJobNotFound 242 + } 243 + 244 + j := &Gormjob{ 245 + repo: dbj.Repo, 246 + pds: dbj.PDS, 247 + state: dbj.State, 248 + rev: dbj.Rev, 249 + createdAt: dbj.CreatedAt, 250 + updatedAt: dbj.UpdatedAt, 251 + 252 + dbj: &dbj, 253 + db: s.db, 254 + 255 + retryCount: dbj.RetryCount, 256 + retryAfter: dbj.RetryAfter, 257 + } 258 + s.lk.Lock() 259 + defer s.lk.Unlock() 260 + // would imply a race condition 261 + exist, ok := s.jobs[repo] 262 + if ok { 263 + return exist, nil 264 + } 265 + s.jobs[repo] = j 266 + return j, nil 267 + } 268 + 269 + func (s *Gormstore) checkJobCache(ctx context.Context, repo string) *Gormjob { 270 + s.lk.RLock() 271 + defer s.lk.RUnlock() 272 + 273 + j, ok := s.jobs[repo] 274 + if !ok || j == nil { 275 + return nil 276 + } 277 + return j 278 + } 279 + 280 + func (s *Gormstore) GetNextEnqueuedJob(ctx context.Context, pds string) (Job, error) { 281 + s.lk.Lock() 282 + pdsQueue, ok := s.pdsQueues[pds] 283 + s.lk.Unlock() 284 + if !ok { 285 + return nil, nil 286 + } 287 + pdsQueue.qlk.Lock() 288 + defer pdsQueue.qlk.Unlock() 289 + 290 + if len(pdsQueue.taskQueue) == 0 { 291 + if err := s.loadJobs(ctx, 1000); err != nil { 292 + return nil, err 293 + } 294 + 295 + if len(pdsQueue.taskQueue) == 0 { 296 + return nil, nil 297 + } 298 + } 299 + 300 + for len(pdsQueue.taskQueue) > 0 { 301 + first := pdsQueue.taskQueue[0] 302 + pdsQueue.taskQueue = pdsQueue.taskQueue[1:] 303 + 304 + j, err := s.getJob(ctx, first) 305 + if err != nil { 306 + return nil, err 307 + } 308 + 309 + shouldRetry := strings.HasPrefix(j.State(), "failed") && j.retryAfter != nil && time.Now().After(*j.retryAfter) 310 + 311 + if j.State() == StateEnqueued || shouldRetry { 312 + return j, nil 313 + } 314 + } 315 + return nil, nil 316 + } 317 + 318 + func (j *Gormjob) Repo() string { 319 + return j.repo 320 + } 321 + 322 + func (j *Gormjob) State() string { 323 + j.lk.Lock() 324 + defer j.lk.Unlock() 325 + 326 + return j.state 327 + } 328 + 329 + // MaxRetries is the maximum number of times to retry a backfill job 330 + var MaxRetries = 10 331 + 332 + func computeExponentialBackoff(attempt int) time.Duration { 333 + return time.Duration(1<<uint(attempt)) * 10 * time.Second 334 + } 335 + 336 + func (j *Gormjob) SetState(ctx context.Context, state string) error { 337 + j.lk.Lock() 338 + defer j.lk.Unlock() 339 + 340 + j.state = state 341 + j.updatedAt = time.Now() 342 + 343 + if strings.HasPrefix(state, "failed") { 344 + if j.retryCount < MaxRetries { 345 + next := time.Now().Add(computeExponentialBackoff(j.retryCount)) 346 + j.retryAfter = &next 347 + j.retryCount++ 348 + } else { 349 + j.retryAfter = nil 350 + } 351 + } 352 + 353 + // Persist the job to the database 354 + j.dbj.State = state 355 + return j.db.Save(j.dbj).Error 356 + } 357 + 358 + func (j *Gormjob) RetryCount() int { 359 + j.lk.Lock() 360 + defer j.lk.Unlock() 361 + return j.retryCount 362 + } 363 + 364 + func (s *Gormstore) PurgeRepo(ctx context.Context, repo string) error { 365 + if err := s.db.Exec("DELETE FROM gorm_db_jobs WHERE repo = ?", repo).Error; err != nil { 366 + return err 367 + } 368 + 369 + s.lk.Lock() 370 + defer s.lk.Unlock() 371 + delete(s.jobs, repo) 372 + 373 + return nil 374 + }
+26
backfill/next/metrics.go
··· 1 + package next 2 + 3 + import ( 4 + "github.com/prometheus/client_golang/prometheus" 5 + "github.com/prometheus/client_golang/prometheus/promauto" 6 + ) 7 + 8 + var backfillJobsEnqueued = promauto.NewCounterVec(prometheus.CounterOpts{ 9 + Name: "backfill_jobs_enqueued_total", 10 + Help: "The total number of backfill jobs enqueued", 11 + }, []string{"backfiller_name"}) 12 + 13 + var backfillJobsProcessed = promauto.NewCounterVec(prometheus.CounterOpts{ 14 + Name: "backfill_jobs_processed_total", 15 + Help: "The total number of backfill jobs processed", 16 + }, []string{"backfiller_name"}) 17 + 18 + var backfillRecordsProcessed = promauto.NewCounterVec(prometheus.CounterOpts{ 19 + Name: "backfill_records_processed_total", 20 + Help: "The total number of backfill records processed", 21 + }, []string{"backfiller_name"}) 22 + 23 + var backfillBytesProcessed = promauto.NewCounterVec(prometheus.CounterOpts{ 24 + Name: "backfill_bytes_processed_total", 25 + Help: "The total number of backfill bytes processed", 26 + }, []string{"backfiller_name"})
+33
backfill/next/util.go
··· 1 + package next 2 + 3 + import ( 4 + "io" 5 + 6 + "github.com/prometheus/client_golang/prometheus" 7 + ) 8 + 9 + type instrumentedReader struct { 10 + source io.ReadCloser 11 + counter prometheus.Counter 12 + } 13 + 14 + func (r instrumentedReader) Read(b []byte) (int, error) { 15 + n, err := r.source.Read(b) 16 + r.counter.Add(float64(n)) 17 + return n, err 18 + } 19 + 20 + func (r instrumentedReader) Close() error { 21 + var buf [32]byte 22 + var n int 23 + var err error 24 + for err == nil { 25 + n, err = r.source.Read(buf[:]) 26 + r.counter.Add(float64(n)) 27 + } 28 + closeerr := r.source.Close() 29 + if err != nil && err != io.EOF { 30 + return err 31 + } 32 + return closeerr 33 + }
+312
cmd/linear/main.go
··· 1 + package main 2 + 3 + import ( 4 + "context" 5 + "encoding/json" 6 + "fmt" 7 + "log/slog" 8 + "net/http" 9 + _ "net/http/pprof" 10 + "os" 11 + "os/signal" 12 + "strings" 13 + "syscall" 14 + 15 + comatproto "github.com/bluesky-social/indigo/api/atproto" 16 + "github.com/bluesky-social/indigo/atproto/data" 17 + backfill "github.com/bluesky-social/indigo/backfill/next" 18 + "github.com/bluesky-social/indigo/xrpc" 19 + "github.com/ipfs/go-cid" 20 + slogGorm "github.com/orandin/slog-gorm" 21 + "github.com/prometheus/client_golang/prometheus/promhttp" 22 + "github.com/urfave/cli/v2" 23 + "golang.org/x/time/rate" 24 + "gorm.io/driver/sqlite" 25 + "gorm.io/gorm" 26 + ) 27 + 28 + type Linear struct { 29 + db *gorm.DB 30 + logger *slog.Logger 31 + 32 + bf *backfill.Backfiller 33 + teardown chan struct{} 34 + 35 + out *os.File 36 + outChan chan []byte 37 + fileClosed chan struct{} 38 + } 39 + 40 + type Line struct { 41 + Repo string `json:"repo"` 42 + Collection string `json:"collection"` 43 + RKey string `json:"rkey"` 44 + Cid string `json:"cid"` 45 + Record json.RawMessage `json:"record"` 46 + } 47 + 48 + func main() { 49 + app := cli.NewApp() 50 + 51 + app.Flags = []cli.Flag{ 52 + 53 + &cli.StringFlag{ 54 + Name: "metrics-listen", 55 + Usage: "listen endpoint for metrics and pprof", 56 + Value: "localhost:8081", 57 + EnvVars: []string{"LINEAR_METRICS_LISTEN"}, 58 + }, 59 + } 60 + app.Commands = []*cli.Command{ 61 + { 62 + Name: "sync", 63 + Action: Sync, 64 + Flags: []cli.Flag{ 65 + &cli.StringFlag{ 66 + Name: "backfill-sqlite-path", 67 + Usage: "path to the backfill SQLite database file", 68 + Value: "data/backfill.db", 69 + EnvVars: []string{"LINEAR_BACKFILL_DB_PATH"}, 70 + }, 71 + &cli.StringSliceFlag{ 72 + Name: "pds-list", 73 + Usage: "list of PDSs to backfill, can be specified multiple times", 74 + Value: cli.NewStringSlice("blusher.us-east.host.bsky.network", "yellowfoot.us-west.host.bsky.network", "psathyrella.us-west.host.bsky.network", "hollowfoot.us-west.host.bsky.network", "fuzzyfoot.us-west.host.bsky.network", "panus.us-west.host.bsky.network", "mazegill.us-west.host.bsky.network", "pioppino.us-west.host.bsky.network", "waxcap.us-west.host.bsky.network", "elfcup.us-east.host.bsky.network"), 75 + EnvVars: []string{"LINEAR_PDS_LIST"}, 76 + }, 77 + &cli.StringFlag{ 78 + Name: "output-file", 79 + EnvVars: []string{"LINEAR_OUTPUT_FILE"}, 80 + Value: "data/linear.jsonl", 81 + }, 82 + }, 83 + }, 84 + } 85 + 86 + err := app.Run(os.Args) 87 + if err != nil { 88 + slog.Error("exited with error", "err", err) 89 + os.Exit(1) 90 + } 91 + } 92 + 93 + func Sync(cctx *cli.Context) error { 94 + ctx := cctx.Context 95 + ctx, cancel := context.WithCancel(ctx) 96 + defer cancel() 97 + 98 + logLevel := slog.LevelInfo 99 + logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: logLevel, AddSource: true})) 100 + slog.SetDefault(slog.New(logger.Handler())) 101 + 102 + mlisten := cctx.String("metrics-listen") 103 + go func() { 104 + http.Handle("/metrics", promhttp.Handler()) 105 + if err := http.ListenAndServe(mlisten, nil); err != nil { 106 + logger.Error("failed to set up metrics listener", "err", err) 107 + } 108 + }() 109 + 110 + store, db, err := setupBackfillStore(cctx.Context, cctx.String("backfill-sqlite-path")) 111 + if err != nil { 112 + return fmt.Errorf("failed to setup backfill store: %w", err) 113 + } 114 + 115 + backfillOutFile, err := os.OpenFile(cctx.String("output-file"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 116 + if err != nil { 117 + return fmt.Errorf("failed to open output file: %w", err) 118 + } 119 + 120 + linear := &Linear{ 121 + db: db, 122 + logger: logger, 123 + 124 + out: backfillOutFile, 125 + outChan: make(chan []byte, 100_000), 126 + fileClosed: make(chan struct{}), 127 + 128 + teardown: make(chan struct{}), 129 + } 130 + 131 + linear.startWriter() 132 + 133 + opts := backfill.DefaultBackfillerOptions() 134 + opts.GlobalRecordCreateConcurrency = 100_000 135 + opts.PerPDSSyncsPerSecond = 10 136 + opts.PerPDSBackfillConcurrency = 15 137 + 138 + bf := backfill.NewBackfiller("linear-backfiller-v2", store, linear.handleCreate, opts) 139 + 140 + linear.bf = bf 141 + 142 + err = store.LoadJobs(ctx) 143 + if err != nil { 144 + return fmt.Errorf("failed to load jobs from store: %w", err) 145 + } 146 + 147 + // Walk the PDS list's listRepos endpoints and add jobs to the backfiller 148 + pdsList := cctx.StringSlice("pds-list") 149 + 150 + go func() { 151 + for _, pds := range pdsList { 152 + go func(pds string) { 153 + logger.Info("enqueuing PDS for backfill", "pds", pds) 154 + 155 + xrpcc := xrpc.Client{} 156 + xrpcc.Host = fmt.Sprintf("https://%s", pds) 157 + 158 + listLimiter := rate.NewLimiter(5, 1) 159 + 160 + cursor := "" 161 + for { 162 + if err := listLimiter.Wait(ctx); err != nil { 163 + logger.Error("failed to wait for rate limiter", "pds", pds, "err", err) 164 + continue 165 + } 166 + 167 + page, err := comatproto.SyncListRepos(ctx, &xrpcc, cursor, 1000) 168 + if err != nil { 169 + logger.Error("failed to list repos for PDS", "pds", pds, "err", err) 170 + continue 171 + } 172 + 173 + for _, repo := range page.Repos { 174 + logger.Info("found repo to backfill", "pds", pds, "repo", repo.Did) 175 + 176 + if err := bf.EnqueueJob(ctx, pds, repo.Did); err != nil { 177 + logger.Error("failed to enqueue job for PDS", "pds", pds, "repo", repo.Did, "err", err) 178 + } else { 179 + logger.Info("enqueued job for PDS", "pds", pds, "repo", repo.Did) 180 + } 181 + } 182 + 183 + if page.Cursor == nil { 184 + logger.Info("no more repos to process for PDS", "pds", pds) 185 + break 186 + } 187 + 188 + cursor = *page.Cursor 189 + } 190 + logger.Info("finished enqueuing PDS for backfill", "pds", pds) 191 + }(pds) 192 + } 193 + }() 194 + 195 + signals := make(chan os.Signal, 1) 196 + signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) 197 + 198 + <-signals 199 + if err := linear.shutdown(ctx); err != nil { 200 + logger.Error("shutdown encountered an error", "err", err) 201 + } 202 + 203 + return nil 204 + 205 + } 206 + 207 + func (lin *Linear) shutdown(ctx context.Context) error { 208 + // first, close down all 'sources' of work 209 + lin.bf.Shutdown(ctx) 210 + close(lin.teardown) 211 + <-lin.fileClosed 212 + return nil 213 + } 214 + 215 + func (l *Linear) handleCreate(ctx context.Context, repo string, rev string, path string, recB *[]byte, cid *cid.Cid) error { 216 + col, rkey, err := splitPath(path) 217 + if err != nil { 218 + return err 219 + } 220 + 221 + asCbor, err := data.UnmarshalCBOR(*recB) 222 + if err != nil { 223 + return fmt.Errorf("failed to unmarshal record: %w", err) 224 + } 225 + 226 + recJSON, err := json.Marshal(asCbor) 227 + if err != nil { 228 + return fmt.Errorf("failed to marshal record to json: %w", err) 229 + } 230 + 231 + line := Line{ 232 + Repo: repo, 233 + Collection: col, 234 + RKey: rkey, 235 + Cid: cid.String(), 236 + Record: recJSON, 237 + } 238 + 239 + lineB, err := json.Marshal(line) 240 + if err != nil { 241 + return fmt.Errorf("failed to marshal line to json: %w", err) 242 + } 243 + 244 + l.outChan <- lineB 245 + 246 + return nil 247 + } 248 + 249 + func setupBackfillStore(ctx context.Context, sqlitePath string) (*backfill.Gormstore, *gorm.DB, error) { 250 + bfdb, err := gorm.Open(sqlite.Open(sqlitePath), &gorm.Config{ 251 + TranslateError: true, 252 + Logger: slogGorm.New(slogGorm.SetLogLevel(slogGorm.ErrorLogType, slog.LevelDebug)), 253 + }) 254 + if err != nil { 255 + return nil, nil, err 256 + } 257 + 258 + if err := bfdb.Exec("PRAGMA journal_mode=WAL;").Error; err != nil { 259 + return nil, nil, err 260 + } 261 + 262 + if err := bfdb.AutoMigrate(&backfill.GormDBJob{}); err != nil { 263 + return nil, nil, err 264 + } 265 + 266 + store := backfill.NewGormstore(bfdb) 267 + if err := store.LoadJobs(ctx); err != nil { 268 + return nil, nil, err 269 + } 270 + 271 + return store, bfdb, nil 272 + } 273 + 274 + func splitPath(p string) (string, string, error) { 275 + parts := strings.Split(p, "/") 276 + if len(parts) != 2 { 277 + return "", "", fmt.Errorf("path must be collection and rkey") 278 + } 279 + 280 + return parts[0], parts[1], nil 281 + } 282 + 283 + func (lin *Linear) startWriter() { 284 + log := lin.logger.With("source", "writer") 285 + log.Info("starting writer") 286 + newline := []byte("\n") 287 + 288 + // Start the writer 289 + go func() { 290 + for { 291 + select { 292 + case <-lin.teardown: 293 + if err := lin.out.Sync(); err != nil { 294 + log.Error("failed to sync output file", "err", err) 295 + } 296 + if err := lin.out.Close(); err != nil { 297 + log.Error("failed to close output file", "err", err) 298 + } 299 + close(lin.fileClosed) 300 + close(lin.outChan) 301 + return 302 + case line := <-lin.outChan: 303 + if _, err := lin.out.Write(line); err != nil { 304 + log.Error("failed to write line to output file", "err", err) 305 + } 306 + if _, err := lin.out.Write(newline); err != nil { 307 + log.Error("failed to write newline to output file", "err", err) 308 + } 309 + } 310 + } 311 + }() 312 + }