···4141 return f"{type(e).__name__}: {e!r}" if repr(e) else type(e).__name__
424243434444-# httpx / httpcore exception classes we treat as transient and retry once
4545-# on before giving up. covers connection drops, read-half failures,
4444+# httpx / httpcore exception classes we treat as transient and retry on
4545+# before giving up. covers connection drops, read-half failures,
4646# protocol-level errors (remote closed before fully responding),
4747# timeouts, and pool exhaustion.
4848_TRANSIENT_HTTP_ERRORS: tuple[type[BaseException], ...] = (
···5555 httpcore.ConnectError,
5656 httpcore.RemoteProtocolError,
5757)
# max attempts for a single PDS request (including the initial try).
_PDS_MAX_ATTEMPTS = 4

# backoff schedule between attempts: element i is the sleep (in seconds)
# taken after failed attempt i (0-indexed), i.e. BEFORE attempt i + 1 runs.
# 4 attempts with 1s/2s/4s gives exponential-ish backoff that totals ~7s
# of deliberate sleep across all retries, on top of whatever time the
# underlying connect/read took.
_PDS_BACKOFF_SCHEDULE: tuple[float, ...] = (1.0, 2.0, 4.0)


def _backoff_for_attempt(attempt: int) -> float:
    """seconds to sleep AFTER a failed attempt of index `attempt`.

    `attempt` is the 0-based index of the attempt that just failed. the
    index is clamped to the bounds of `_PDS_BACKOFF_SCHEDULE` on both
    sides: indexes past the end reuse the final (largest) delay, and
    negative indexes use the first delay instead of silently wrapping
    around to the end of the tuple via python's negative indexing.
    """
    last_index = len(_PDS_BACKOFF_SCHEDULE) - 1
    return _PDS_BACKOFF_SCHEDULE[max(0, min(attempt, last_index))]
587159726073class PayloadTooLargeError(Exception):
···249262 oauth_session = reconstruct_oauth_session(oauth_data)
250263 url = f"{oauth_data['pds_url']}/xrpc/{endpoint}"
251264 response = None # defensive: bind before the loop so error paths can read it
265265+ has_refreshed = False
252266253253- for attempt in range(2):
267267+ for attempt in range(_PDS_MAX_ATTEMPTS):
254268 kwargs: dict[str, Any] = {}
255269 if payload:
256270 kwargs["json"] = payload
···265279 **kwargs,
266280 )
267281 except _TRANSIENT_HTTP_ERRORS as e:
268268- if attempt == 0:
282282+ if attempt < _PDS_MAX_ATTEMPTS - 1:
283283+ backoff = _backoff_for_attempt(attempt)
269284 logger.warning(
270270- f"PDS network error for {auth_session.did}, retrying: {_describe_exc(e)}"
285285+ f"PDS network error for {auth_session.did} on attempt "
286286+ f"{attempt + 1}/{_PDS_MAX_ATTEMPTS}, backing off {backoff}s: "
287287+ f"{_describe_exc(e)}"
271288 )
272272- await asyncio.sleep(1)
289289+ await asyncio.sleep(backoff)
273290 continue
274291 raise Exception(
275275- f"PDS request failed after retry: {_describe_exc(e)}"
292292+ f"PDS request failed after {_PDS_MAX_ATTEMPTS} attempts: {_describe_exc(e)}"
276293 ) from e
277294278295 if response.status_code in success_codes:
···280297 return {}
281298 return response.json()
282299283283- # token expired - refresh and retry. previously gated on the response
284284- # body containing "exp" in its message, but under concurrent load the
285285- # PDS can return 401 with an empty body, a body that can't be parsed,
286286- # or a body whose message differs across PDS implementations — in
287287- # which case we'd silently skip the refresh and raise a useless error.
288288- # always attempt refresh on a first-attempt 401; if the refresh itself
289289- # is transient-flaky, retry the refresh once before giving up.
290290- if response.status_code == 401 and attempt == 0:
300300+ # 401: token expired or rejected. always attempt refresh on the first
301301+ # 401 we see (under concurrent load PDSes return 401 bodies with
302302+ # varying shapes, including empty — gating on "exp" in the message
303303+ # silently skipped refresh before). if the refresh itself is flaky,
304304+ # retry it once before giving up.
305305+ if response.status_code == 401 and not has_refreshed:
306306+ has_refreshed = True
291307 logger.info(
292308 f"access token expired or rejected for {auth_session.did}; refreshing"
293309 )
···305321 )
306322 continue
307323308308- # response should always be bound here (attempt==1 branch), but defensive
309309- # check keeps the error path sane if the loop structure changes.
324324+ # 5xx: upstream is failing, worth a backoff + retry
325325+ if 500 <= response.status_code < 600 and attempt < _PDS_MAX_ATTEMPTS - 1:
326326+ backoff = _backoff_for_attempt(attempt)
327327+ logger.warning(
328328+ f"PDS {response.status_code} for {auth_session.did} on attempt "
329329+ f"{attempt + 1}/{_PDS_MAX_ATTEMPTS}, backing off {backoff}s"
330330+ )
331331+ await asyncio.sleep(backoff)
332332+ continue
333333+334334+ # 4xx other than 401, or 5xx on the last attempt, or a repeat 401
335335+ # post-refresh: stop retrying and surface the error.
336336+ break
337337+310338 if response is None:
311339 raise Exception("PDS request failed: no response received")
312340 raise Exception(
···347375 blob_data = data if isinstance(data, bytes) else data.read()
348376349377 response = None # defensive: bind before the loop
378378+ has_refreshed = False
350379351351- for attempt in range(2):
380380+ for attempt in range(_PDS_MAX_ATTEMPTS):
352381 try:
353382 response = await get_oauth_client().make_authenticated_request(
354383 session=oauth_session,
···358387 headers={"Content-Type": content_type},
359388 )
360389 except _TRANSIENT_HTTP_ERRORS as e:
361361- if attempt == 0:
390390+ if attempt < _PDS_MAX_ATTEMPTS - 1:
391391+ backoff = _backoff_for_attempt(attempt)
362392 logger.warning(
363363- f"PDS blob upload network error for {auth_session.did}, retrying: {_describe_exc(e)}"
393393+ f"PDS blob upload network error for {auth_session.did} on "
394394+ f"attempt {attempt + 1}/{_PDS_MAX_ATTEMPTS}, backing off "
395395+ f"{backoff}s: {_describe_exc(e)}"
364396 )
365365- await asyncio.sleep(1)
397397+ await asyncio.sleep(backoff)
366398 continue
367399 raise Exception(
368368- f"blob upload failed after retry: {_describe_exc(e)}"
400400+ f"blob upload failed after {_PDS_MAX_ATTEMPTS} attempts: {_describe_exc(e)}"
369401 ) from e
370402371403 if response.status_code == 200:
···377409 f"blob too large for PDS (limit exceeded): {response.text or '<empty body>'}"
378410 )
379411380380- # token expired - refresh and retry. unconditional on first-attempt
381381- # 401 (see rationale in make_pds_request).
382382- if response.status_code == 401 and attempt == 0:
412412+ # 401: refresh once, then retry (same rationale as make_pds_request).
413413+ if response.status_code == 401 and not has_refreshed:
414414+ has_refreshed = True
383415 logger.info(
384416 f"access token expired or rejected for {auth_session.did}; refreshing"
385417 )
···396428 auth_session, oauth_session
397429 )
398430 continue
431431+432432+ # 5xx: backoff and retry
433433+ if 500 <= response.status_code < 600 and attempt < _PDS_MAX_ATTEMPTS - 1:
434434+ backoff = _backoff_for_attempt(attempt)
435435+ logger.warning(
436436+ f"PDS blob upload {response.status_code} for {auth_session.did} "
437437+ f"on attempt {attempt + 1}/{_PDS_MAX_ATTEMPTS}, backing off {backoff}s"
438438+ )
439439+ await asyncio.sleep(backoff)
440440+ continue
441441+442442+ break
399443400444 if response is None:
401445 raise Exception("blob upload failed: no response received")
+9
backend/src/backend/api/tracks/audio_replace.py
···2929from urllib.parse import urljoin
30303131import logfire
3232+from docket import ConcurrencyLimit
3233from fastapi import (
3334 Depends,
3435 File,
···545546async def run_track_audio_replace(
546547 job_id: str,
547548 session_id: str,
549549+ user_did: str,
548550 track_id: int,
549551 file_path: str,
550552 filename: str,
553553+ concurrency: ConcurrencyLimit = ConcurrencyLimit("user_did", max_concurrent=3),
551554) -> None:
552555 """docket task entry point for audio replace.
553556···556559 ReplaceContext, and delegates to the phase orchestrator
557560 (`_process_replace_background`). this is the function registered with
558561 docket; the HTTP handler enqueues it via `schedule_track_audio_replace`.
562562+563563+ `ConcurrencyLimit("user_did", max_concurrent=3)` caps concurrent
564564+ replaces per user's DID at 3 — same as the upload task. prevents a
565565+ user kicking off many replaces at once from overwhelming their PDS's
566566+ connection-limit tolerance.
559567 """
560568 auth_session = await get_session(session_id)
561569 if auth_session is None:
···589597 await docket.add(run_track_audio_replace)(
590598 job_id=ctx.job_id,
591599 session_id=ctx.auth_session.session_id,
600600+ user_did=ctx.auth_session.did,
592601 track_id=ctx.track_id,
593602 file_path=ctx.file_path,
594603 filename=ctx.filename,
+11
backend/src/backend/api/tracks/uploads.py
···12121313import aiofiles
1414import logfire
1515+from docket import ConcurrencyLimit
1516from fastapi import (
1617 Depends,
1718 File,
···995996 support_gate: dict | None,
996997 auto_tag: bool,
997998 unlisted: bool,
999999+ concurrency: ConcurrencyLimit = ConcurrencyLimit("artist_did", max_concurrent=3),
9981000) -> None:
9991001 """docket task entry point for track uploads.
10001002···10071009 rehydrating the session at task start rather than passing the cached
10081010 AuthSession over the wire means we pick up any token refresh that
10091011 happened between the HTTP request and the worker picking up the task.
10121012+10131013+ the `ConcurrencyLimit("artist_did", max_concurrent=3)` caps concurrent
10141014+ uploads per user's DID at 3. a 12-track album upload does not produce
10151015+ 12 parallel `createRecord` calls against the user's PDS (which would
10161016+ exceed the typical PDS's connection-limit + rate-limit tolerance and
10171017+ cause ConnectTimeouts). instead the task queue trickles uploads
10181018+ through 3 at a time. user-visible latency for the slowest track in a
10191019+ large album goes up, but every track publishes successfully rather
10201020+ than 1-2 silently failing on upstream PDS throttling.
10101021 """
10111022 auth_session = await get_session(session_id)
10121023 if auth_session is None:
+9-4
backend/tests/test_pds_network_retry.py
···7777 assert result == {"uri": "at://did:plc:testgoose/fm.plyr.track/abc"}
7878 assert mock_client.make_authenticated_request.call_count == 2
79798080- async def test_raises_after_two_read_errors(
8080+ async def test_raises_after_all_attempts_exhausted(
8181 self, mock_auth_session: AuthSession
8282 ) -> None:
8383+ # with exponential-backoff retries the client now makes
8484+ # _PDS_MAX_ATTEMPTS attempts before giving up. supply enough
8585+ # ReadErrors to exhaust them all. backoffs are real sleeps
8686+ # (1s + 2s + 4s = 7s in total across the three gaps) unless
8787+ # asyncio.sleep is patched; the upload_blob exhaustion test pays the same cost.
8388 mock_client = AsyncMock()
8489 mock_client.make_authenticated_request = AsyncMock(
8590 side_effect=httpx.ReadError("")
···9095 "backend._internal.atproto.client.get_oauth_client",
9196 return_value=mock_client,
9297 ),
9393- pytest.raises(Exception, match="PDS request failed after retry"),
9898+ pytest.raises(Exception, match=r"PDS request failed after \d+ attempts"),
9499 ):
95100 await make_pds_request(
96101 mock_auth_session,
···147152 assert result == blob_ref
148153 assert mock_client.make_authenticated_request.call_count == 2
149154150150- async def test_raises_after_two_read_errors(
155155+ async def test_raises_after_all_attempts_exhausted(
151156 self, mock_auth_session: AuthSession
152157 ) -> None:
153158 mock_client = AsyncMock()
···160165 "backend._internal.atproto.client.get_oauth_client",
161166 return_value=mock_client,
162167 ),
163163- pytest.raises(Exception, match="blob upload failed after retry"),
168168+ pytest.raises(Exception, match=r"blob upload failed after \d+ attempts"),
164169 ):
165170 await upload_blob(mock_auth_session, b"fake-audio", "audio/mpeg")
166171