···11"""low-level ATProto PDS client with OAuth and token refresh."""
2233import asyncio
44-import json
54import logging
65from datetime import UTC, datetime, timedelta
76from typing import Any, BinaryIO
def pds_blob_url(pds_url: str, did: str, cid: str) -> str:
    """construct a public URL to fetch a blob from a PDS.

    the URL targets the com.atproto.sync.getBlob XRPC endpoint on `pds_url`,
    with `did` and `cid` passed as query parameters. the values are
    interpolated as-is (no percent-encoding, no trailing-slash handling), so
    callers must supply a base URL without a trailing slash and
    query-safe identifiers.
    """
    return f"{pds_url}/xrpc/com.atproto.sync.getBlob?did={did}&cid={cid}"
2727+2828+2929+def _describe_exc(e: BaseException) -> str:
3030+ """produce a non-empty, type-qualified description of an exception.
3131+3232+ some exception types (notably httpx.RemoteProtocolError with an empty
3333+ h11 reason, asyncio.CancelledError, and bare HTTPError subclasses)
3434+ stringify to "", which makes downstream error logs and user-visible
3535+ messages useless. always surface the exception type; fall back to the
3636+ repr if str is empty.
3737+ """
3838+ msg = str(e)
3939+ if msg:
4040+ return f"{type(e).__name__}: {msg}"
4141+ return f"{type(e).__name__}: {e!r}" if repr(e) else type(e).__name__
# httpx / httpcore exception classes we treat as transient: a request that
# fails with one of these is retried once before giving up. covers
# connection drops, read-half failures, protocol-level errors (remote closed
# before fully responding), timeouts, and pool exhaustion.
_TRANSIENT_HTTP_ERRORS: tuple[type[BaseException], ...] = (
    httpx.ReadError,
    httpx.ConnectError,
    httpx.RemoteProtocolError,
    httpx.TimeoutException,
    # already covered by TimeoutException (PoolTimeout subclasses it);
    # listed explicitly so pool exhaustion is visible at a glance.
    httpx.PoolTimeout,
    httpcore.ReadError,
    httpcore.ConnectError,
    httpcore.RemoteProtocolError,
)
285829593060class PayloadTooLargeError(Exception):
···218248219249 oauth_session = reconstruct_oauth_session(oauth_data)
220250 url = f"{oauth_data['pds_url']}/xrpc/{endpoint}"
251251+ response = None # defensive: bind before the loop so error paths can read it
221252222253 for attempt in range(2):
223254 kwargs: dict[str, Any] = {}
···233264 url=url,
234265 **kwargs,
235266 )
236236- except (
237237- httpx.ReadError,
238238- httpx.ConnectError,
239239- httpcore.ReadError,
240240- httpcore.ConnectError,
241241- ) as e:
267267+ except _TRANSIENT_HTTP_ERRORS as e:
242268 if attempt == 0:
243269 logger.warning(
244244- f"PDS network error for {auth_session.did}, retrying: {type(e).__name__}: {e}"
270270+ f"PDS network error for {auth_session.did}, retrying: {_describe_exc(e)}"
245271 )
246272 await asyncio.sleep(1)
247273 continue
248274 raise Exception(
249249- f"PDS request failed after retry: {type(e).__name__}: {e}"
275275+ f"PDS request failed after retry: {_describe_exc(e)}"
250276 ) from e
251277252278 if response.status_code in success_codes:
···254280 return {}
255281 return response.json()
256282257257- # token expired - refresh and retry
283283+ # token expired - refresh and retry. previously gated on the response
284284+ # body containing "exp" in its message, but under concurrent load the
285285+ # PDS can return 401 with an empty body, a body that can't be parsed,
286286+ # or a body whose message differs across PDS implementations — in
287287+ # which case we'd silently skip the refresh and raise a useless error.
288288+ # always attempt refresh on a first-attempt 401; if the refresh itself
289289+ # is transient-flaky, retry the refresh once before giving up.
258290 if response.status_code == 401 and attempt == 0:
291291+ logger.info(
292292+ f"access token expired or rejected for {auth_session.did}; refreshing"
293293+ )
259294 try:
260260- error_data = response.json()
261261- if "exp" in error_data.get("message", ""):
262262- logger.info(
263263- f"access token expired for {auth_session.did}, attempting refresh"
264264- )
265265- oauth_session = await _refresh_session_tokens(
266266- auth_session, oauth_session
267267- )
268268- continue
269269- except (json.JSONDecodeError, KeyError):
270270- pass
295295+ oauth_session = await _refresh_session_tokens(
296296+ auth_session, oauth_session
297297+ )
298298+ except _TRANSIENT_HTTP_ERRORS as refresh_exc:
299299+ logger.warning(
300300+ f"token refresh hit transient error, retrying once: {_describe_exc(refresh_exc)}"
301301+ )
302302+ await asyncio.sleep(1)
303303+ oauth_session = await _refresh_session_tokens(
304304+ auth_session, oauth_session
305305+ )
306306+ continue
271307272272- raise Exception(f"PDS request failed: {response.status_code} {response.text}")
308308+ # response should always be bound here (attempt==1 branch), but defensive
309309+ # check keeps the error path sane if the loop structure changes.
310310+ if response is None:
311311+ raise Exception("PDS request failed: no response received")
312312+ raise Exception(
313313+ f"PDS request failed: {response.status_code} {response.text or '<empty body>'}"
314314+ )
273315274316275317async def upload_blob(
···304346 # read data if it's a file-like object
305347 blob_data = data if isinstance(data, bytes) else data.read()
306348349349+ response = None # defensive: bind before the loop
350350+307351 for attempt in range(2):
308352 try:
309353 response = await get_oauth_client().make_authenticated_request(
···313357 content=blob_data,
314358 headers={"Content-Type": content_type},
315359 )
316316- except (
317317- httpx.ReadError,
318318- httpx.ConnectError,
319319- httpcore.ReadError,
320320- httpcore.ConnectError,
321321- ) as e:
360360+ except _TRANSIENT_HTTP_ERRORS as e:
322361 if attempt == 0:
323362 logger.warning(
324324- f"PDS blob upload network error for {auth_session.did}, retrying: {type(e).__name__}: {e}"
363363+ f"PDS blob upload network error for {auth_session.did}, retrying: {_describe_exc(e)}"
325364 )
326365 await asyncio.sleep(1)
327366 continue
328367 raise Exception(
329329- f"blob upload failed after retry: {type(e).__name__}: {e}"
368368+ f"blob upload failed after retry: {_describe_exc(e)}"
330369 ) from e
331370332371 if response.status_code == 200:
···335374 # payload too large - PDS rejects due to size limit
336375 if response.status_code == 413:
337376 raise PayloadTooLargeError(
338338- f"blob too large for PDS (limit exceeded): {response.text}"
377377+ f"blob too large for PDS (limit exceeded): {response.text or '<empty body>'}"
339378 )
340379341341- # token expired - refresh and retry
380380+ # token expired - refresh and retry. unconditional on first-attempt
381381+ # 401 (see rationale in make_pds_request).
342382 if response.status_code == 401 and attempt == 0:
383383+ logger.info(
384384+ f"access token expired or rejected for {auth_session.did}; refreshing"
385385+ )
343386 try:
344344- error_data = response.json()
345345- if "exp" in error_data.get("message", ""):
346346- logger.info(
347347- f"access token expired for {auth_session.did}, attempting refresh"
348348- )
349349- oauth_session = await _refresh_session_tokens(
350350- auth_session, oauth_session
351351- )
352352- continue
353353- except (json.JSONDecodeError, KeyError):
354354- pass
387387+ oauth_session = await _refresh_session_tokens(
388388+ auth_session, oauth_session
389389+ )
390390+ except _TRANSIENT_HTTP_ERRORS as refresh_exc:
391391+ logger.warning(
392392+ f"token refresh hit transient error, retrying once: {_describe_exc(refresh_exc)}"
393393+ )
394394+ await asyncio.sleep(1)
395395+ oauth_session = await _refresh_session_tokens(
396396+ auth_session, oauth_session
397397+ )
398398+ continue
355399356356- raise Exception(f"blob upload failed: {response.status_code} {response.text}")
400400+ if response is None:
401401+ raise Exception("blob upload failed: no response received")
402402+ raise Exception(
403403+ f"blob upload failed: {response.status_code} {response.text or '<empty body>'}"
404404+ )
357405358406359407def parse_at_uri(uri: str) -> tuple[str, str, str]:
+7-2
backend/src/backend/api/tracks/uploads.py
···771771 raise ValueError("PDS returned no record data")
772772 _, atproto_cid = atproto_result
773773 except Exception as e:
774774- logger.error("ATProto sync failed for upload %s: %s", ctx.upload_id, e)
774774+ # always include the exception type in the surfaced message — some
775775+ # exception classes (notably httpx.RemoteProtocolError with an empty
776776+ # h11 reason) stringify to "", which makes downstream error logs and
777777+ # the failed-job error field useless.
778778+ err_detail = f"{type(e).__name__}: {e!s}" if str(e) else type(e).__name__
779779+ logger.error("ATProto sync failed for upload %s: %s", ctx.upload_id, err_detail)
775780 # only delete the row if it's still pending — on ambiguous failures
776781 # (timeouts, connection drops) Jetstream may have already finalized it
777782 deleted_pending = False
···797802 await storage.delete(image_id)
798803 # else: Jetstream finalized the row — media belongs to the published track
799804800800- raise UploadPhaseError(f"failed to sync track to ATProto: {e}") from e
805805+ raise UploadPhaseError(f"failed to sync track to ATProto: {err_detail}") from e
801806802807 # step 3: atomic CAS update pending → published + deferred album linkage
803808 async with db_session() as db:
+112
backend/tests/test_pds_network_retry.py
···182182 await upload_blob(mock_auth_session, b"huge-audio", "audio/mpeg")
183183184184 assert mock_client.make_authenticated_request.call_count == 1
class TestMakePdsRequestAuthRefresh:
    """make_pds_request refreshes and retries on 401 regardless of body shape.

    regression for the 2026-04-24 concurrent-upload flake: under load the PDS
    can return 401 with an empty body or a body whose message doesn't contain
    'exp'. the previous implementation silently skipped refresh in those
    cases, raising an error with an empty/cryptic message. the refresh path
    is now unconditional on first-attempt 401s.
    """

    async def test_refreshes_on_401_with_empty_body(
        self, mock_auth_session: AuthSession
    ) -> None:
        """a 401 with an empty body triggers one refresh and one retry."""
        unauthorized_empty_body = _mock_response(401, json_data={})
        unauthorized_empty_body.text = ""
        ok_response = _mock_response(200, {"uri": "at://test"})
        mock_client = AsyncMock()
        mock_client.make_authenticated_request = AsyncMock(
            side_effect=[unauthorized_empty_body, ok_response]
        )

        with (
            patch(
                "backend._internal.atproto.client.get_oauth_client",
                return_value=mock_client,
            ),
            patch(
                "backend._internal.atproto.client._refresh_session_tokens",
                new_callable=AsyncMock,
            ) as mock_refresh,
        ):
            # an opaque stand-in is enough for the refreshed session here:
            # the OAuth client that would consume it is mocked as well.
            mock_refresh.return_value = MagicMock()
            result = await make_pds_request(
                mock_auth_session,
                "POST",
                "com.atproto.repo.createRecord",
            )

        assert result == {"uri": "at://test"}
        assert mock_client.make_authenticated_request.call_count == 2
        mock_refresh.assert_awaited_once()

    async def test_refreshes_on_401_with_non_exp_message(
        self, mock_auth_session: AuthSession
    ) -> None:
        """a 401 whose message lacks 'exp' still triggers refresh + retry."""
        # PDSes vary on their 401 body — some return "invalid_token", some
        # omit the message, some say "unauthorized". refresh must fire for
        # all of them, not just when 'exp' happens to be in the string.
        unauthorized = _mock_response(
            401, json_data={"error": "InvalidToken", "message": "unauthorized"}
        )
        ok_response = _mock_response(200, {"uri": "at://test"})
        mock_client = AsyncMock()
        mock_client.make_authenticated_request = AsyncMock(
            side_effect=[unauthorized, ok_response]
        )

        with (
            patch(
                "backend._internal.atproto.client.get_oauth_client",
                return_value=mock_client,
            ),
            patch(
                "backend._internal.atproto.client._refresh_session_tokens",
                new_callable=AsyncMock,
            ) as mock_refresh,
        ):
            mock_refresh.return_value = MagicMock()
            result = await make_pds_request(
                mock_auth_session,
                "POST",
                "com.atproto.repo.createRecord",
            )

        assert result == {"uri": "at://test"}
        # the original request plus exactly one retry after the refresh
        assert mock_client.make_authenticated_request.call_count == 2
        mock_refresh.assert_awaited_once()
class TestMakePdsRequestTransientErrors:
    """make_pds_request retries the newly-covered transient httpx errors."""

    async def test_retries_on_remote_protocol_error(
        self, mock_auth_session: AuthSession
    ) -> None:
        """a RemoteProtocolError on the first attempt is retried once."""
        # httpx.RemoteProtocolError stringifies to "" when the h11 reason is
        # blank — the exact class that surfaced the silent failure today.
        ok_response = _mock_response(200, {"uri": "at://test"})
        mock_client = AsyncMock()
        mock_client.make_authenticated_request = AsyncMock(
            side_effect=[
                httpx.RemoteProtocolError(""),
                ok_response,
            ]
        )

        with (
            patch(
                "backend._internal.atproto.client.get_oauth_client",
                return_value=mock_client,
            ),
            # the retry path awaits asyncio.sleep(1); stub it so the test
            # does not spend a real second waiting.
            patch(
                "backend._internal.atproto.client.asyncio.sleep",
                new_callable=AsyncMock,
            ) as mock_sleep,
        ):
            result = await make_pds_request(
                mock_auth_session,
                "POST",
                "com.atproto.repo.createRecord",
            )

        assert result == {"uri": "at://test"}
        assert mock_client.make_authenticated_request.call_count == 2
        # exactly one back-off between the failed attempt and the retry
        mock_sleep.assert_awaited_once_with(1)