···99 "fmt"
1010 "log/slog"
1111 "strings"
1212+ "sync"
1213 "time"
13141415 "atcr.io/pkg/atproto"
// Refresher loads fresh OAuth sessions for DIDs, serializing session loads
// per DID so concurrent requests cannot race on DPoP nonce updates.
type Refresher struct {
	clientApp      *oauth.ClientApp
	uiSessionStore UISessionStore // For invalidating UI sessions on OAuth failures
	// didLocks holds per-DID mutexes (effectively map[string]*sync.Mutex,
	// populated via LoadOrStore) to prevent concurrent DPoP nonce races.
	didLocks sync.Map
}
150152151153// NewRefresher creates a new session refresher
···162164163165// GetSession gets a fresh OAuth session for a DID
164166// Loads session from database on every request (database is source of truth)
167167+// Uses per-DID locking to prevent concurrent requests from racing on DPoP nonce updates
168168+//
169169+// Why locking is critical:
170170+// During docker push, multiple layers upload concurrently. Each layer creates a new
171171+// ClientSession by loading from database. Without locking, this race condition occurs:
172172+// 1. Layer A loads session with stale DPoP nonce from DB
173173+// 2. Layer B loads session with same stale nonce (A hasn't updated DB yet)
174174+// 3. Layer A makes request → 401 "use_dpop_nonce" → gets fresh nonce → saves to DB
175175+// 4. Layer B makes request → 401 "use_dpop_nonce" (using stale nonce from step 2)
176176+// 5. DPoP nonce thrashing continues, eventually causing 500 errors
177177+//
178178+// With per-DID locking:
179179+// 1. Layer A acquires lock, loads session, handles nonce negotiation, saves, releases lock
180180+// 2. Layer B acquires lock AFTER A releases, loads fresh nonce from DB, succeeds
165181func (r *Refresher) GetSession(ctx context.Context, did string) (*oauth.ClientSession, error) {
166166- return r.resumeSession(ctx, did)
182182+ // Get or create a mutex for this DID to prevent concurrent session loads
183183+ // This prevents DPoP nonce race conditions when multiple layers upload simultaneously
184184+ mutexInterface, _ := r.didLocks.LoadOrStore(did, &sync.Mutex{})
185185+ mutex := mutexInterface.(*sync.Mutex)
186186+187187+ // Serialize session loading per DID
188188+ mutex.Lock()
189189+ defer mutex.Unlock()
190190+191191+ slog.Debug("Acquired session lock for DID",
192192+ "component", "oauth/refresher",
193193+ "did", did)
194194+195195+ session, err := r.resumeSession(ctx, did)
196196+ if err != nil {
197197+ return nil, err
198198+ }
199199+200200+ slog.Debug("Released session lock for DID",
201201+ "component", "oauth/refresher",
202202+ "did", did)
203203+204204+ return session, nil
167205}
168206169207// resumeSession loads a session from storage
···213251 }
214252215253 // Set up callback to persist token updates to SQLite
216216- // This ensures that when indigo automatically refreshes tokens,
217217- // the new tokens are saved to the database immediately
254254+ // This ensures that when indigo automatically refreshes tokens or updates DPoP nonces,
255255+ // the new state is saved to the database immediately
218256 session.PersistSessionCallback = func(callbackCtx context.Context, updatedData *oauth.ClientSessionData) {
219257 if err := r.clientApp.Store.SaveSession(callbackCtx, *updatedData); err != nil {
220258 slog.Error("Failed to persist OAuth session update",
···223261 "sessionID", sessionID,
224262 "error", err)
225263 } else {
226226- slog.Debug("Persisted OAuth token refresh to database",
264264+ // Log session updates (token refresh, DPoP nonce updates, etc.)
265265+ // Note: updatedData contains the full session state including DPoP nonce,
266266+ // but we don't log sensitive data like tokens or nonces themselves
267267+ slog.Debug("Persisted OAuth session update to database",
227268 "component", "oauth/refresher",
228269 "did", did,
229229- "sessionID", sessionID)
270270+ "sessionID", sessionID,
271271+ "hint", "This includes token refresh and DPoP nonce updates")
230272 }
231273 }
232274 return session, nil