Select the types of activity you want to include in your feed.
Fix did:web resolution; fix the "too many SQL variables" error in batch inserts; backfill all collections when an actor enters the system, to account for repos that aren't included in the listReposByCollection endpoint
···98989999/// Resolve a DID to get ATP data (PDS endpoint and handle)
pub fn resolve_did(did: String, plc_url: String) -> Result(AtprotoData, String) {
  // Dispatch on the DID method prefix: did:web documents are fetched from the
  // DID's own host, every other DID is looked up under `plc_url`.
  case did {
    "did:web:" <> _ -> resolve_did_web(did)
    _ -> resolve_did_plc(did, plc_url)
  }
}
/// Resolve a did:web DID by fetching its DID document over HTTPS.
///
/// The document location follows the did:web method rules:
///   did:web:example.com            -> https://example.com/.well-known/did.json
///   did:web:example.com%3A3000     -> https://example.com:3000/.well-known/did.json
///   did:web:example.com:user:alice -> https://example.com/user/alice/did.json
///
/// Returns the result of `parse_atproto_data` on an HTTP 200 response.
/// Otherwise returns an `Error` whose message names the DID (and, for
/// non-200 responses, the status code).
fn resolve_did_web(did: String) -> Result(AtprotoData, String) {
  case did_web_document_url(did) {
    Error(message) -> Error(message)
    Ok(url) ->
      case request.to(url) {
        Error(_) -> Error("Failed to create request for did:web DID: " <> did)
        Ok(req) ->
          case hackney.send(req) {
            Error(_) -> Error("Failed to fetch did:web DID data for: " <> did)
            Ok(resp) ->
              case resp.status {
                200 -> parse_atproto_data(resp.body, did)
                status ->
                  Error(
                    "Failed to resolve DID "
                    <> did
                    <> " (status: "
                    <> string.inspect(status)
                    <> ")",
                  )
              }
          }
      }
  }
}

/// Build the HTTPS URL of the DID document for a did:web DID.
///
/// The method-specific identifier is `<host>[:<path segment>]*`. A port is
/// not a separate segment: it is carried inside the host segment as a
/// percent-encoded colon. Returns `Error` when the DID is not a did:web DID
/// or contains an empty host or path segment.
fn did_web_document_url(did: String) -> Result(String, String) {
  case string.split(did, ":") {
    ["did", "web", host, ..path_parts] -> {
      // An empty segment (e.g. "did:web:" or a trailing ":") would yield a
      // malformed URL such as "https:///..." or a double slash in the path.
      let has_empty_segment =
        host == "" || list.any(path_parts, fn(part) { part == "" })

      case has_empty_segment {
        True -> Error("Invalid did:web format: " <> did)
        False -> {
          // Decode the percent-encoded port separator; hex digits in a
          // percent-escape may be written in either case.
          let authority =
            host
            |> string.replace("%3A", ":")
            |> string.replace("%3a", ":")

          case path_parts {
            // Bare host: the document lives under /.well-known.
            [] -> Ok("https://" <> authority <> "/.well-known/did.json")
            // Host plus path: the document sits directly under that path,
            // with no /.well-known component.
            _ ->
              Ok(
                "https://"
                <> authority
                <> "/"
                <> string.join(path_parts, "/")
                <> "/did.json",
              )
          }
        }
      }
    }
    _ -> Error("Invalid did:web format: " <> did)
  }
}
148148+149149+/// Resolve a did:plc DID through the PLC directory
150150+fn resolve_did_plc(did: String, plc_url: String) -> Result(AtprotoData, String) {
101151 let url = plc_url <> "/" <> did
102152103153 case request.to(url) {
···266316 Ok(req) -> {
267317 case hackney.send(req) {
268318 Error(err) -> {
269269- // Only log unexpected errors (not TLS/DNS/timeout issues)
319319+ // Log all errors with details
270320 let err_str = string.inspect(err)
271271- case
321321+ let is_expected_error =
272322 string.contains(err_str, "TlsAlert")
273323 || string.contains(err_str, "Nxdomain")
274324 || string.contains(err_str, "Timeout")
275275- {
276276- True -> Nil
325325+ || string.contains(err_str, "Econnrefused")
326326+ || string.contains(err_str, "Closed")
327327+328328+ case is_expected_error {
329329+ True ->
330330+ logging.log(
331331+ logging.Warning,
332332+ "[backfill] Network error fetching "
333333+ <> repo
334334+ <> "/"
335335+ <> collection
336336+ <> " from "
337337+ <> pds_url
338338+ <> ": "
339339+ <> err_str,
340340+ )
277341 False ->
278342 logging.log(
279343 logging.Error,
···281345 <> repo
282346 <> "/"
283347 <> collection
348348+ <> " from "
349349+ <> pds_url
284350 <> ": "
285351 <> err_str,
286352 )
···293359 case parse_list_records_response(resp.body, repo, collection) {
294360 Ok(#(records, next_cursor)) -> {
295361 let new_acc = list.append(acc, records)
362362+ let total_so_far = list.length(new_acc)
363363+296364 case next_cursor {
297297- Some(c) ->
365365+ Some(c) -> {
366366+ logging.log(
367367+ logging.Info,
368368+ "[backfill] Fetched "
369369+ <> string.inspect(list.length(records))
370370+ <> " records (total: "
371371+ <> string.inspect(total_so_far)
372372+ <> "), continuing with cursor for "
373373+ <> repo
374374+ <> "/"
375375+ <> collection,
376376+ )
298377 fetch_records_paginated(
299378 repo,
300379 collection,
···302381 Some(c),
303382 new_acc,
304383 )
305305- None -> new_acc
384384+ }
385385+ None -> {
386386+ logging.log(
387387+ logging.Info,
388388+ "[backfill] Completed fetching "
389389+ <> string.inspect(total_so_far)
390390+ <> " records for "
391391+ <> repo
392392+ <> "/"
393393+ <> collection,
394394+ )
395395+ new_acc
396396+ }
306397 }
307398 }
308399 Error(err) -> {
···324415 // 302/308: redirect (PDS moved)
325416 // 403: forbidden (private account)
326417 // 502/520: bad gateway / cloudflare error (server down)
327327- 400 | 404 | 302 | 308 | 403 | 502 | 520 -> acc
418418+ 400 | 404 | 302 | 308 | 403 | 502 | 520 -> {
419419+ acc
420420+ }
328421 // Other unexpected errors should be logged
329422 _ -> {
330423 logging.log(
331424 logging.Error,
332332- "[backfill] Failed to fetch records for "
425425+ "[backfill] Unexpected status "
426426+ <> string.inspect(resp.status)
427427+ <> " fetching "
333428 <> repo
334429 <> "/"
335430 <> collection
336336- <> " (status: "
337337- <> string.inspect(resp.status)
431431+ <> " from "
432432+ <> pds_url
433433+ <> " (URL: "
434434+ <> url
338435 <> ")",
339436 )
340437 acc
···613710 })
614711}
615712616616-/// Backfill all external collections for a newly discovered actor
713713+/// Backfill all collections (primary and external) for a newly discovered actor
617714/// This is called when a new actor is created via Jetstream or GraphQL mutations
618618-pub fn backfill_external_collections_for_actor(
715715+pub fn backfill_collections_for_actor(
619716 db: sqlight.Connection,
620717 did: String,
718718+ collection_ids: List(String),
621719 external_collection_ids: List(String),
622720 plc_url: String,
623721) -> Nil {
722722+ let all_collections = list.append(collection_ids, external_collection_ids)
723723+ let total_count = list.length(all_collections)
724724+624725 logging.log(
625726 logging.Info,
626727 "[backfill] Starting background sync for new actor: "
627728 <> did
628729 <> " ("
730730+ <> string.inspect(total_count)
731731+ <> " collections: "
732732+ <> string.inspect(list.length(collection_ids))
733733+ <> " primary + "
629734 <> string.inspect(list.length(external_collection_ids))
630630- <> " external collections)",
735735+ <> " external)",
631736 )
632737633738 // Resolve DID to get PDS endpoint
634739 case resolve_did(did, plc_url) {
635740 Ok(atp_data) -> {
636636- // Fetch and index records for each external collection
637637- list.each(external_collection_ids, fn(collection) {
741741+ // Fetch and index records for all collections (primary + external)
742742+ list.each(all_collections, fn(collection) {
638743 logging.log(
639744 logging.Info,
640745 "[backfill] Fetching " <> collection <> " for " <> did,
+95-45
server/src/database.gleam
···521521 case uris {
522522 [] -> Ok(dict.new())
523523 _ -> {
524524- // Build placeholders for SQL IN clause
525525- let placeholders =
526526- list.map(uris, fn(_) { "?" })
527527- |> string.join(", ")
524524+ // Process in batches to avoid SQL parameter limits (max 999 params)
525525+ let batch_size = 900
526526+ let batches = list.sized_chunk(uris, batch_size)
528527529529- let sql =
530530- "
531531- SELECT uri, cid
532532- FROM record
533533- WHERE uri IN (" <> placeholders <> ")
534534- "
528528+ // Process each batch and merge results
529529+ use accumulated_dict <- result.try(
530530+ list.try_fold(batches, dict.new(), fn(acc_dict, batch) {
531531+ // Build placeholders for SQL IN clause
532532+ let placeholders =
533533+ list.map(batch, fn(_) { "?" })
534534+ |> string.join(", ")
535535536536- // Convert URIs to sqlight.Value list
537537- let params = list.map(uris, sqlight.text)
536536+ let sql =
537537+ "
538538+ SELECT uri, cid
539539+ FROM record
540540+ WHERE uri IN (" <> placeholders <> ")
541541+ "
538542539539- let decoder = {
540540- use uri <- decode.field(0, decode.string)
541541- use cid <- decode.field(1, decode.string)
542542- decode.success(#(uri, cid))
543543- }
543543+ // Convert URIs to sqlight.Value list
544544+ let params = list.map(batch, sqlight.text)
544545545545- use results <- result.try(sqlight.query(
546546- sql,
547547- on: conn,
548548- with: params,
549549- expecting: decoder,
550550- ))
546546+ let decoder = {
547547+ use uri <- decode.field(0, decode.string)
548548+ use cid <- decode.field(1, decode.string)
549549+ decode.success(#(uri, cid))
550550+ }
551551+552552+ use results <- result.try(sqlight.query(
553553+ sql,
554554+ on: conn,
555555+ with: params,
556556+ expecting: decoder,
557557+ ))
558558+559559+ // Merge with accumulated dictionary
560560+ let batch_dict = dict.from_list(results)
561561+ Ok(dict.merge(acc_dict, batch_dict))
562562+ }),
563563+ )
551564552552- // Convert list of tuples to Dict
553553- Ok(dict.from_list(results))
565565+ Ok(accumulated_dict)
566566+ }
567567+ }
568568+}
/// Returns the CIDs from `cids` that are already stored in the `record`
/// table, regardless of which URI each one is stored under.
///
/// The lookup runs in chunks so that a single statement never binds more
/// than `batch_size` parameters. Results are returned in batch order. If a
/// batch query fails, its `sqlight.Error` is returned and later batches are
/// not queried.
fn get_existing_cids_batch(
  conn: sqlight.Connection,
  cids: List(String),
) -> Result(List(String), sqlight.Error) {
  case cids {
    // Nothing to look up: skip the database entirely.
    [] -> Ok([])
    _ -> {
      // Process in batches to avoid SQL parameter limits (max 999 params)
      let batch_size = 900

      // The decoder is the same for every batch, so build it once.
      let cid_decoder = {
        use cid <- decode.field(0, decode.string)
        decode.success(cid)
      }

      cids
      |> list.sized_chunk(batch_size)
      |> list.try_map(fn(batch) {
        // One "?" placeholder per CID in this batch for the IN clause.
        let placeholders =
          batch
          |> list.map(fn(_) { "?" })
          |> string.join(", ")

        let sql =
          "
          SELECT cid
          FROM record
          WHERE cid IN (" <> placeholders <> ")
        "

        sqlight.query(
          sql,
          on: conn,
          with: list.map(batch, sqlight.text),
          expecting: cid_decoder,
        )
      })
      // Concatenate the per-batch results once at the end, rather than
      // re-copying a growing accumulator after every batch.
      |> result.map(list.flatten)
    }
  }
}
···623685 // Get all URIs from the incoming records
624686 let uris = list.map(records, fn(record) { record.uri })
625687626626- // Fetch existing CIDs for these URIs
688688+ // Fetch existing CIDs for these URIs (batched to avoid SQL parameter limits)
627689 use existing_cids <- result.try(get_existing_cids(conn, uris))
628690629691 // Get all CIDs that already exist in the database (for any URI)
630630- let all_incoming_cids = list.map(records, fn(record) { record.cid })
631631- let check_all_cids_sql =
632632- "
633633- SELECT cid
634634- FROM record
635635- WHERE cid IN ("
636636- <> string.join(list.map(all_incoming_cids, fn(_) { "?" }), ", ")
637637- <> ")
638638- "
639639-640640- let cid_decoder = {
641641- use cid <- decode.field(0, decode.string)
642642- decode.success(cid)
643643- }
692692+ // Check in batches to avoid exceeding SQL parameter limits
693693+ let all_incoming_cids =
694694+ list.map(records, fn(record) { record.cid })
695695+ |> list.unique()
644696645645- use existing_cids_in_db <- result.try(sqlight.query(
646646- check_all_cids_sql,
647647- on: conn,
648648- with: list.map(all_incoming_cids, sqlight.text),
649649- expecting: cid_decoder,
697697+ use existing_cids_in_db <- result.try(get_existing_cids_batch(
698698+ conn,
699699+ all_incoming_cids,
650700 ))
651701652702 // Create a set of existing CIDs for fast lookup
+4-2
server/src/event_handler.gleam
···9292 time_us: Int,
9393 commit: goose.CommitData,
9494 plc_url: String,
9595+ collection_ids: List(String),
9596 external_collection_ids: List(String),
9697) -> Nil {
9798 let uri = "at://" <> did <> "/" <> commit.collection <> "/" <> commit.rkey
···162163 // Ensure actor exists before inserting record
163164 case actor_validator.ensure_actor_exists(db, did, plc_url) {
164165 Ok(is_new_actor) -> {
165165- // If this is a new actor, synchronously backfill external collections
166166+ // If this is a new actor, synchronously backfill all collections
166167 // This ensures subscription joins have complete data immediately
167168 // We're already in a spawned process per event, so blocking is fine
168169 case is_new_actor {
···170171 // Publish stats event for new actor
171172 stats_pubsub.publish(stats_pubsub.ActorCreated)
172173173173- backfill.backfill_external_collections_for_actor(
174174+ backfill.backfill_collections_for_actor(
174175 db,
175176 did,
177177+ collection_ids,
176178 external_collection_ids,
177179 plc_url,
178180 )