Auto-indexing service and GraphQL API for AT Protocol Records
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix: encode non-UTF-8 binary data as $bytes in JSON

- Add UTF-8 validation in sanitize_for_json to detect raw binary data
- Encode invalid UTF-8 binaries as {"$bytes": "<base64>"} per ATProto spec
- Fix CAR worker crash on InvalidByte(142) when processing binary blobs
- Add debug logging for validation errors during backfill

+21 -2
+12 -1
server/src/backfill.gleam
··· 353 353 case validate_record(ctx, r.collection, db_record.json) { 354 354 Valid -> #([db_record, ..records], invalids) 355 355 ParseError(_) -> #([db_record, ..records], invalids) 356 - Invalid(_) -> #(records, invalids + 1) 356 + Invalid(msg) -> { 357 + logging.log( 358 + logging.Debug, 359 + "[backfill] Invalid record " 360 + <> r.collection 361 + <> "/" 362 + <> r.rkey 363 + <> ": " 364 + <> msg, 365 + ) 366 + #(records, invalids + 1) 367 + } 357 368 } 358 369 } 359 370 }
+9 -1
server/src/car/cbor_ffi.erl
··· 35 35 %% Convert tuples to lists for JSON 36 36 list_to_tuple([sanitize_term(E) || E <- tuple_to_list(Tuple)]); 37 37 38 + sanitize_term(Bin) when is_binary(Bin) -> 39 + %% Check if binary is valid UTF-8 (text string) or raw bytes 40 + %% ATProto spec: bytes must be encoded as {"$bytes": "<base64>"} 41 + case unicode:characters_to_binary(Bin) of 42 + Bin -> Bin; % Valid UTF-8, pass through as string 43 + _ -> #{<<"$bytes">> => base64:encode(Bin)} % Invalid UTF-8, encode as $bytes 44 + end; 45 + 38 46 sanitize_term(Other) -> 39 - %% Atoms, numbers, binaries, etc - pass through 47 + %% Atoms, numbers, etc - pass through 40 48 Other.