···1919# Run tests
2020go test ./...
21212222+# Run tests for specific package
2323+go test ./pkg/atproto/...
2424+go test ./pkg/appview/storage/...
2525+2626+# Run specific test
2727+go test -run TestManifestStore ./pkg/atproto/...
2828+2229# Run with race detector
2330go test -race ./...
3131+3232+# Run tests with verbose output
3333+go test -v ./...
24342535# Update dependencies
2636go mod tidy
···1011112. HTTP Request → /v2/alice/myapp/manifests/latest
1021123. Registry Middleware (pkg/appview/middleware/registry.go)
103113 → Resolves "alice" to DID and PDS endpoint
104104- → Queries alice's sailor profile for defaultHold
114114+ → Queries alice's sailor profile for defaultHold (returns DID if set)
105115 → If not set, checks alice's io.atcr.hold records
106106- → Falls back to AppView's default_storage_endpoint
107107- → Stores DID/PDS/storage endpoint in context
116116+ → Falls back to AppView's default_hold_did
117117+ → Stores DID/PDS/hold DID in RegistryContext
1081184. Routing Repository (pkg/appview/storage/routing_repository.go)
109119 → Creates RoutingRepository
110120 → Returns ATProto ManifestStore for manifests
111111- → Returns ProxyBlobStore for blobs
112112-5. Blob PUT → Resolved hold service (redirects to S3/storage)
113113-6. Manifest PUT → alice's PDS as io.atcr.manifest record (includes holdEndpoint)
121121+ → Returns ProxyBlobStore for blobs (routes to hold DID)
122122+5. Blob PUT → ProxyBlobStore calls hold's XRPC multipart upload endpoints:
123123+ a. POST /xrpc/io.atcr.hold.initiateUpload (gets uploadID)
124124+ b. POST /xrpc/io.atcr.hold.getPartUploadUrl (gets presigned URL for each part)
125125+ c. PUT to S3 presigned URL (or PUT /xrpc/io.atcr.hold.uploadPart for buffered mode)
126126+ d. POST /xrpc/io.atcr.hold.completeUpload (finalizes upload)
127127+6. Manifest PUT → alice's PDS as io.atcr.manifest record (includes holdDid + holdEndpoint)
128128+ → Manifest also uploaded to PDS blob storage (ATProto CID format)
114129```
115130116131#### Push with BYOS (Bring Your Own Storage)
117132```
1181331. Client: docker push atcr.io/alice/myapp:latest
1191342. Registry Middleware resolves alice → did:plc:alice123
120120-3. Hold discovery via findStorageEndpoint():
121121- a. Check alice's sailor profile for defaultHold
122122- b. If not set, check alice's io.atcr.hold records
123123- c. Fall back to AppView's default_storage_endpoint
124124-4. Found: alice's profile has defaultHold = "https://alice-storage.fly.dev"
125125-5. Routing Repository returns ProxyBlobStore(alice-storage.fly.dev)
126126-6. ProxyBlobStore calls alice-storage.fly.dev for presigned URL
127127-7. Storage service validates alice's DID, generates S3 presigned URL
128128-8. Client redirected to upload blob directly to alice's S3/Storj
129129-9. Manifest stored in alice's PDS with holdEndpoint = "https://alice-storage.fly.dev"
135135+3. Hold discovery via findHoldDID():
136136+ a. Check alice's sailor profile for defaultHold (returns DID if set)
137137+ b. If not set, check alice's io.atcr.hold records (legacy)
138138+ c. Fall back to AppView's default_hold_did
139139+4. Found: alice's profile has defaultHold = "did:web:alice-storage.fly.dev"
140140+5. Routing Repository returns ProxyBlobStore(did:web:alice-storage.fly.dev)
141141+6. ProxyBlobStore:
142142+ a. Resolves hold DID → https://alice-storage.fly.dev (did:web resolution)
143143+ b. Gets service token from alice's PDS via com.atproto.server.getServiceAuth
144144+ c. Calls hold XRPC endpoints with service token authentication:
145145+ - POST /xrpc/io.atcr.hold.initiateUpload
146146+ - POST /xrpc/io.atcr.hold.getPartUploadUrl (returns presigned S3 URL)
147147+ - PUT to S3 presigned URL (direct upload to alice's S3/Storj)
148148+ - POST /xrpc/io.atcr.hold.completeUpload
149149+7. Hold service validates service token, checks crew membership, generates presigned URLs
150150+8. Manifest stored in alice's PDS with:
151151+ - holdDid = "did:web:alice-storage.fly.dev" (primary)
152152+ - holdEndpoint = "https://alice-storage.fly.dev" (backward compat)
130153```
131154132155#### Pull Flow
···1341571. Client: docker pull atcr.io/alice/myapp:latest
1351582. GET /v2/alice/myapp/manifests/latest
1361593. AppView fetches manifest from alice's PDS
137137-4. Manifest contains holdEndpoint = "https://alice-storage.fly.dev"
138138-5. Hold endpoint cached: (alice's DID, "myapp") → "https://alice-storage.fly.dev"
160160+4. Manifest contains:
161161+ - holdDid = "did:web:alice-storage.fly.dev" (primary reference)
162162+ - holdEndpoint = "https://alice-storage.fly.dev" (legacy fallback)
163163+5. Hold DID cached: (alice's DID, "myapp") → "did:web:alice-storage.fly.dev"
164164+ TTL: 10 minutes (covers typical pull operations)
1391656. Client requests blobs: GET /v2/alice/myapp/blobs/sha256:abc123
140140-7. AppView checks cache, routes to hold from manifest (not re-discovered)
141141-8. ProxyBlobStore calls alice-storage.fly.dev for presigned download URL
142142-9. Client redirected to download blob directly from alice's S3
166166+7. AppView checks cache, routes to hold DID from manifest (not re-discovered)
167167+8. ProxyBlobStore:
168168+ a. Resolves hold DID → https://alice-storage.fly.dev
169169+ b. Gets service token from alice's PDS via com.atproto.server.getServiceAuth
170170+ c. Calls GET /xrpc/com.atproto.sync.getBlob?did={userDID}&cid=sha256:abc123&method=GET
171171+ d. Hold returns presigned download URL in JSON response
172172+9. Client redirected to download blob directly from alice's S3 via presigned URL
143173```
144174145145-**Key insight:** Pull uses the historical `holdEndpoint` from the manifest, ensuring blobs are fetched from the hold where they were originally pushed, even if alice later changes her default hold.
175175+**Key insight:** Pull uses the historical `holdDid` from the manifest, ensuring blobs are fetched from the hold where they were originally pushed, even if alice later changes her default hold. The hold cache (10-minute TTL) avoids re-querying the PDS for each blob during the same pull operation.
146176147177### Name Resolution
148178···269299- Uses XRPC protocol (com.atproto.repo.*)
270300271301**lexicon.go**: ATProto record schemas
272272-- `ManifestRecord`: OCI manifest stored as ATProto record (includes `holdEndpoint` field)
302302+- `ManifestRecord`: OCI manifest stored as ATProto record (includes `holdDid` + `holdEndpoint` fields)
273303- `TagRecord`: Tag pointing to manifest digest
274274-- `HoldRecord`: Storage hold definition (for BYOS)
275275-- `HoldCrewRecord`: Hold crew membership/permissions
276276-- `SailorProfileRecord`: User profile with `defaultHold` preference
277277-- Collections: `io.atcr.manifest`, `io.atcr.tag`, `io.atcr.hold`, `io.atcr.hold.crew`, `io.atcr.sailor.profile`
304304+- `HoldRecord`: Storage hold definition (LEGACY - for old BYOS model)
305305+- `HoldCrewRecord`: Hold crew membership (LEGACY - stored in owner's PDS)
306306+- `CaptainRecord`: Hold ownership record (NEW - stored in hold's embedded PDS at rkey "self")
307307+- `CrewRecord`: Hold crew membership (NEW - stored in hold's embedded PDS, one record per member)
308308+- `SailorProfileRecord`: User profile with `defaultHold` preference (can be DID or URL)
309309+- Collections: `io.atcr.manifest`, `io.atcr.tag`, `io.atcr.hold` (legacy), `io.atcr.hold.crew` (used by both legacy and new models), `io.atcr.hold.captain` (new), `io.atcr.sailor.profile`
278310279311**profile.go**: Sailor profile management
280312- `EnsureProfile()`: Creates profile with default hold on first authentication
···289321#### Storage Layer (`pkg/appview/storage/`)
290322291323**routing_repository.go**: Routes content by type
292292-- `Manifests()` → returns ATProto ManifestStore (caches instance for hold endpoint extraction)
324324+- `Manifests()` → returns ATProto ManifestStore (caches instance for hold DID extraction)
293325- `Blobs()` → checks hold cache for pull, uses discovery for push
294294- - Pull: Uses cached `holdEndpoint` from manifest (historical reference)
295295- - Push: Uses discovery-based endpoint from `findStorageEndpoint()`
296296- - Always returns ProxyBlobStore (routes to hold service)
326326+ - Pull: Uses cached `holdDid` from manifest (historical reference)
327327+ - Push: Uses discovery-based DID from `findHoldDID()` in middleware
328328+ - Always returns ProxyBlobStore (routes to hold service via DID)
297329- Implements `distribution.Repository` interface
330330+- Uses RegistryContext to pass DID, PDS endpoint, hold DID, OAuth refresher, etc.
298331299299-**hold_cache.go**: In-memory hold endpoint cache
300300-- Caches `(DID, repository) → holdEndpoint` for pull operations
332332+**hold_cache.go**: In-memory hold DID cache
333333+- Caches `(DID, repository) → holdDid` for pull operations
301334- TTL: 10 minutes (covers typical pull operations)
302335- Cleanup: Background goroutine runs every 5 minutes
303336- **NOTE:** Simple in-memory cache for MVP. For production: use Redis or similar
304304-- Prevents expensive ATProto lookups on every blob request
337337+- Prevents expensive PDS manifest lookups on every blob request during pull
305338306306-**proxy_blob_store.go**: External storage proxy
307307-- Calls user's storage service for presigned URLs
308308-- Issues HTTP redirects for blob uploads/downloads
339339+**proxy_blob_store.go**: External storage proxy (routes to hold via XRPC)
340340+- Resolves hold DID → HTTP URL for XRPC requests (did:web resolution)
341341+- Gets service tokens from user's PDS (`com.atproto.server.getServiceAuth`)
342342+- Calls hold XRPC endpoints with service token authentication:
343343+ - Multipart upload: initiateUpload, getPartUploadUrl, uploadPart, completeUpload, abortUpload
344344+ - Blob read: com.atproto.sync.getBlob (returns presigned download URL)
309345- Implements full `distribution.BlobStore` interface
310310-- Supports multipart uploads for large blobs
311311-- Used when user has `io.atcr.hold` record
346346+- Supports both presigned URL mode (S3 direct) and buffered mode (proxy via hold)
312347313348#### AppView Web UI (`pkg/appview/`)
314349···348383349384#### Hold Service (`cmd/hold/`)
350385351351-Lightweight standalone service for BYOS (Bring Your Own Storage):
386386+Lightweight standalone service for BYOS (Bring Your Own Storage) with embedded PDS:
352387353388**Architecture:**
354354-- Reuses distribution's storage driver factory
355355-- Supports all distribution drivers: S3, Storj, Minio, Azure, GCS, filesystem
356356-- Authorization follows ATProto's public-by-default model
357357-- Generates presigned URLs (15min expiry) or proxies uploads/downloads
389389+- **Embedded PDS**: Each hold has a full ATProto PDS for storing captain + crew records
390390+- **DID**: Hold identified by did:web (e.g., `did:web:hold01.atcr.io`)
391391+- **Storage**: Reuses distribution's storage driver factory (S3, Storj, Minio, Azure, GCS, filesystem)
392392+- **Authorization**: Based on captain + crew records in embedded PDS
393393+- **Blob operations**: Generates presigned URLs (15min expiry) or proxies uploads/downloads via XRPC
358394359395**Authorization Model:**
360396361397Read access:
362398- **Public hold** (`HOLD_PUBLIC=true`): Anonymous + all authenticated users
363363-- **Private hold** (`HOLD_PUBLIC=false`): Authenticated users only (any ATCR user)
399399+- **Private hold** (`HOLD_PUBLIC=false`): Requires authentication + crew membership with blob:read permission
364400365401Write access:
366366-- Hold owner OR crew members only
402402+- Hold owner OR crew members with blob:write permission
367403- Verified via `io.atcr.hold.crew` records in hold's embedded PDS
368404369369-Key insight: "Private" gates anonymous access, not authenticated access. This reflects ATProto's current limitation (no private PDS records yet).
405405+**Embedded PDS Endpoints** (`pkg/hold/pds/xrpc.go`):
370406371371-**Embedded PDS Endpoints:**
372372-373373-Each hold service includes an embedded PDS (Personal Data Server) that stores captain + crew records:
374374-407407+Standard ATProto sync endpoints:
375408- `GET /xrpc/com.atproto.sync.getRepo?did={did}` - Download full repository as CAR file
376409- `GET /xrpc/com.atproto.sync.getRepo?did={did}&since={rev}` - Download repository diff since revision
377410- `GET /xrpc/com.atproto.sync.subscribeRepos` - WebSocket firehose for real-time events
378411- `GET /xrpc/com.atproto.sync.listRepos` - List all repositories (single-user PDS)
412412+- `GET /xrpc/com.atproto.sync.getBlob?did={did}&cid={digest}` - Get blob or presigned download URL
413413+414414+Repository management:
415415+- `GET /xrpc/com.atproto.repo.describeRepo?repo={did}` - Repository metadata
416416+- `GET /xrpc/com.atproto.repo.getRecord?repo={did}&collection={col}&rkey={key}` - Get record
417417+- `GET /xrpc/com.atproto.repo.listRecords?repo={did}&collection={col}` - List records (supports pagination)
418418+- `POST /xrpc/com.atproto.repo.deleteRecord` - Delete record (owner/crew admin only)
419419+- `POST /xrpc/com.atproto.repo.uploadBlob` - Upload ATProto blob (owner/crew admin only)
420420+421421+DID resolution:
379422- `GET /.well-known/did.json` - DID document (did:web resolution)
380380-- Standard ATProto repo endpoints (getRecord, listRecords, etc.)
423423+- `GET /.well-known/atproto-did` - DID for handle resolution
381424382382-The `subscribeRepos` endpoint broadcasts #commit events whenever crew membership changes, allowing AppViews to monitor hold access control in real-time.
425425+Crew management:
426426+- `POST /xrpc/io.atcr.hold.requestCrew` - Request crew membership (authenticated users)
383427384384-**Configuration:** Environment variables (see `.env.example`)
385385-- `HOLD_PUBLIC_URL` - Public URL of hold service (required)
428428+**OCI Multipart Upload Endpoints** (`pkg/hold/oci/xrpc.go`):
429429+430430+All require blob:write permission via service token authentication:
431431+- `POST /xrpc/io.atcr.hold.initiateUpload` - Start multipart upload session
432432+- `POST /xrpc/io.atcr.hold.getPartUploadUrl` - Get presigned URL for uploading a part
433433+- `PUT /xrpc/io.atcr.hold.uploadPart` - Direct buffered part upload (alternative to presigned URLs)
434434+- `POST /xrpc/io.atcr.hold.completeUpload` - Finalize multipart upload and move to final location
435435+- `POST /xrpc/io.atcr.hold.abortUpload` - Cancel multipart upload and cleanup temp data
436436+437437+**AppView-to-Hold Authentication:**
438438+- AppView uses service tokens from user's PDS (`com.atproto.server.getServiceAuth`)
439439+- Service tokens are scoped to specific hold DIDs and include the user's DID
440440+- Hold validates tokens and checks crew membership for authorization
441441+- Service tokens are cached for 50 seconds (each token is valid for 60 seconds from the PDS)
442442+443443+**Configuration:** Environment variables (see `.env.hold.example`)
444444+- `HOLD_PUBLIC_URL` - Public URL of hold service (required, used for did:web generation)
386445- `STORAGE_DRIVER` - Storage driver type (s3, filesystem)
387446- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` - S3 credentials
388447- `S3_BUCKET`, `S3_ENDPOINT` - S3 configuration
389448- `HOLD_PUBLIC` - Allow public reads (default: false)
390390-- `HOLD_OWNER` - DID for auto-registration (optional)
449449+- `HOLD_OWNER` - DID for captain record creation (optional)
450450+- `HOLD_ALLOW_ALL_CREW` - Allow any authenticated user to register as crew (default: false)
451451+- `HOLD_DATABASE_PATH` - Path for embedded PDS database (required)
452452+- `HOLD_DATABASE_KEY_PATH` - Path for PDS signing keys (optional, generated if missing)
391453392454**Deployment:** Can run on Fly.io, Railway, Docker, Kubernetes, etc.
393455···399461 "$type": "io.atcr.manifest",
400462 "repository": "myapp",
401463 "digest": "sha256:abc123...",
402402- "holdEndpoint": "https://hold1.alice.com",
464464+ "holdDid": "did:web:hold01.atcr.io",
465465+ "holdEndpoint": "https://hold01.atcr.io",
403466 "schemaVersion": 2,
404467 "mediaType": "application/vnd.oci.image.manifest.v1+json",
405468 "config": { "digest": "sha256:...", "size": 1234 },
406469 "layers": [
407470 { "digest": "sha256:...", "size": 5678 }
408471 ],
472472+ "manifestBlob": {
473473+ "$type": "blob",
474474+ "ref": { "$link": "bafyrei..." },
475475+ "mimeType": "application/vnd.oci.image.manifest.v1+json",
476476+ "size": 1234
477477+ },
409478 "createdAt": "2025-09-30T..."
410479}
411480```
412481482482+**Key fields:**
483483+- `holdDid` - DID of the hold service where blobs are stored (PRIMARY reference, new)
484484+- `holdEndpoint` - HTTP URL of hold service (DEPRECATED, kept for backward compatibility)
485485+- `manifestBlob` - Reference to manifest blob in ATProto blob storage (CID format)
486486+413487Record key = manifest digest (without algorithm prefix)
414488Collection = `io.atcr.manifest`
415489···425499```json
426500{
427501 "$type": "io.atcr.sailor.profile",
428428- "defaultHold": "https://hold1.alice.com",
502502+ "defaultHold": "did:web:hold1.alice.com",
429503 "createdAt": "2025-10-02T...",
430504 "updatedAt": "2025-10-02T..."
431505}
···433507434508**Profile Management:**
435509- Created automatically on first authentication (OAuth or Basic Auth)
436436-- If AppView has `default_storage_endpoint` configured, profile gets that as `defaultHold`
510510+- `defaultHold` can be a DID (preferred, e.g., `did:web:hold01.atcr.io`) or a legacy URL
511511+- If AppView has `default_hold_did` configured, profile gets that as `defaultHold`
437512- Users can update their profile to change default hold (future: via UI)
438513- Setting `defaultHold` to null opts out of the profile default (resolution falls back to the user's own holds, then the AppView default)
439514440440-**Hold Resolution Priority** (in `findStorageEndpoint()`):
441441-1. **Profile's `defaultHold`** - User's explicit preference
442442-2. **User's `io.atcr.hold` records** - User's own holds
443443-3. **AppView's `default_storage_endpoint`** - Fallback default
515515+**Hold Resolution Priority** (in `findHoldDID()` in middleware):
516516+1. **Profile's `defaultHold`** - User's explicit preference (DID or URL)
517517+2. **User's `io.atcr.hold` records** - User's own holds (legacy BYOS model)
518518+3. **AppView's `default_hold_did`** - Fallback default (configured in middleware)
444519445520This ensures:
446521- Users can join shared holds by setting their profile's `defaultHold`
···472547**Server:**
473548- `ATCR_HTTP_ADDR` - HTTP listen address (default: `:5000`)
474549- `ATCR_BASE_URL` - Public URL for OAuth/JWT realm (auto-detected in dev)
475475-- `ATCR_DEFAULT_HOLD` - Default hold endpoint for blob storage (REQUIRED)
550550+- `ATCR_DEFAULT_HOLD_DID` - Default hold DID for blob storage (REQUIRED, e.g., `did:web:hold01.atcr.io`)
476551477552**Authentication:**
478553- `ATCR_AUTH_KEY_PATH` - JWT signing key path (default: `/var/lib/atcr/auth/private-key.pem`)
···537612**Modifying storage routing**:
5386131. Edit `pkg/appview/storage/routing_repository.go`
5396142. Update `Blobs()` method to change routing logic
540540-3. Consider context values: `storage.endpoint`, `atproto.did`
615615+3. Context is passed via the RegistryContext struct (carries the user's DID, PDS endpoint, hold DID, OAuth refresher, etc.)
541616542617**Changing name resolution**:
5436181. Modify `pkg/atproto/resolver.go` for DID/handle resolution
5446192. Update `pkg/appview/middleware/registry.go` if changing routing logic
545545-3. Remember: `findStorageEndpoint()` queries PDS for `io.atcr.hold` records
620620+3. Remember: `findHoldDID()` checks sailor profile, then `io.atcr.hold` records (legacy), then default hold DID
546621547622**Working with OAuth client**:
548623- Client is self-contained: pass `baseURL`, it handles client ID/redirect URI/scopes
···582657583658## Important Context Values
584659585585-When working with the codebase, these context values are used for routing:
660660+When working with the codebase, routing information is passed via the `RegistryContext` struct (`pkg/appview/storage/context.go`):
586661587587-- `atproto.did` - Resolved DID for the user (e.g., `did:plc:alice123`)
588588-- `atproto.pds` - User's PDS endpoint (e.g., `https://bsky.social`)
589589-- `atproto.identity` - Original identity string (handle or DID)
590590-- `storage.endpoint` - Storage service URL (if user has `io.atcr.registry` record)
591591-- `auth.did` - Authenticated DID from validated token
662662+- `DID` - User's DID (e.g., `did:plc:alice123`)
663663+- `PDSEndpoint` - User's PDS endpoint (e.g., `https://bsky.social`)
664664+- `HoldDID` - Hold service DID (e.g., `did:web:hold01.atcr.io`)
665665+- `Repository` - Image repository name (e.g., `myapp`)
666666+- `ATProtoClient` - Client for calling user's PDS with OAuth/Basic Auth
667667+- `Refresher` - OAuth token refresher for service token requests
668668+- `Database` - Database for metrics tracking
669669+- `Authorizer` - Hold authorizer for access control
670670+671671+Legacy context keys (deprecated):
672672+- `hold.did` - Hold DID (now in RegistryContext)
673673+- `auth.did` - Authenticated DID from validated token (now in auth middleware)
592674593675## Documentation References
594676
+8-5
README.md
···21211. **AppView** - Registry API + web UI
2222 - Serves OCI Distribution API (Docker push/pull)
2323 - Resolves handles/DIDs to PDS endpoints
2424- - Routes manifests to PDS, blobs to storage
2424+ - Routes manifests to user's PDS, blobs to hold services
2525 - Web interface for browsing/search
26262727-2. **Hold Service** - Storage service (optional BYOS)
2727+2. **Hold Service** - Storage service with embedded PDS (optional BYOS)
2828+ - Each hold has a full ATProto PDS for access control (captain + crew records)
2929+ - Identified by did:web (e.g., `did:web:hold01.atcr.io`)
2830 - Generates presigned URLs for S3/Storj/Minio/etc.
2929- - Users can deploy their own storage
3131+ - Users can deploy their own storage and control access via crew membership
303231333. **Credential Helper** - Client authentication
3234 - ATProto OAuth with DPoP
3335 - Automatic authentication on first push/pull
34363537**Storage model:**
3636-- Manifests → ATProto records (small JSON)
3737-- Blobs → S3 or BYOS (large binaries)
3838+- Manifests → ATProto records in user's PDS (small JSON, includes `holdDid` reference)
3939+- Blobs → Hold services via XRPC multipart upload (large binaries, stored in S3/etc.)
4040+- AppView uses service tokens to communicate with holds on behalf of users
38413942## Features
4043
+43-11
cmd/appview/serve.go
···2626 "atcr.io/pkg/appview"
2727 "atcr.io/pkg/appview/db"
2828 uihandlers "atcr.io/pkg/appview/handlers"
2929+ "atcr.io/pkg/appview/holdhealth"
2930 "atcr.io/pkg/appview/jetstream"
3031 "github.com/gorilla/mux"
3132)
···7273 return fmt.Errorf("failed to initialize UI database - required for session storage")
7374 }
74757676+ // Initialize hold health checker
7777+ fmt.Println("Initializing hold health checker...")
7878+ cacheTTL := 15 * time.Minute // Cache TTL from user requirements
7979+ healthChecker := holdhealth.NewChecker(cacheTTL)
8080+8181+ // Start background health check worker
8282+ refreshInterval := 5 * time.Minute // Refresh every 5 minutes
8383+ dbAdapter := holdhealth.NewDBAdapter(uiDatabase)
8484+ healthWorker := holdhealth.NewWorker(healthChecker, dbAdapter, refreshInterval)
8585+8686+ // Create context for worker lifecycle management
8787+ workerCtx, workerCancel := context.WithCancel(context.Background())
8888+ defer workerCancel() // Ensure context is cancelled on all exit paths
8989+ healthWorker.Start(workerCtx)
9090+ fmt.Println("Hold health worker started (5min refresh interval, 15min cache TTL)")
9191+7592 // Initialize OAuth components
7693 fmt.Println("Initializing OAuth components...")
7794···132149 middleware.SetGlobalAuthorizer(holdAuthorizer)
133150 fmt.Println("Hold authorizer initialized with database caching")
134151135135- // Initialize UI routes with OAuth app, refresher, and device store
136136- uiTemplates, uiRouter := initializeUIRoutes(uiDatabase, uiReadOnlyDB, uiSessionStore, oauthApp, refresher, baseURL, deviceStore, defaultHoldDID)
152152+ // Initialize UI routes with OAuth app, refresher, device store, and health checker
153153+ uiTemplates, uiRouter := initializeUIRoutes(uiDatabase, uiReadOnlyDB, uiSessionStore, oauthApp, refresher, baseURL, deviceStore, defaultHoldDID, healthChecker)
137154138155 // Create OAuth server
139156 oauthServer := oauth.NewServer(oauthApp)
···256273 select {
257274 case <-stop:
258275 fmt.Println("Shutting down registry server...")
276276+277277+ // Stop health worker first
278278+ fmt.Println("Stopping hold health worker...")
279279+ healthWorker.Stop()
280280+259281 shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
260282 defer cancel()
261283···263285 return fmt.Errorf("server shutdown error: %w", err)
264286 }
265287 case err := <-errChan:
288288+ // Stop health worker on error (workerCancel called by defer)
289289+ healthWorker.Stop()
266290 return fmt.Errorf("server error: %w", err)
267291 }
268292···320344// database: read-write connection for auth and writes
321345// readOnlyDB: read-only connection for public queries (search, user pages, etc.)
322346// defaultHoldDID: DID of the default hold service (e.g., "did:web:hold01.atcr.io")
323323-func initializeUIRoutes(database *sql.DB, readOnlyDB *sql.DB, sessionStore *db.SessionStore, oauthApp *oauth.App, refresher *oauth.Refresher, baseURL string, deviceStore *db.DeviceStore, defaultHoldDID string) (*template.Template, *mux.Router) {
347347+// healthChecker: hold endpoint health checker
348348+func initializeUIRoutes(database *sql.DB, readOnlyDB *sql.DB, sessionStore *db.SessionStore, oauthApp *oauth.App, refresher *oauth.Refresher, baseURL string, deviceStore *db.DeviceStore, defaultHoldDID string, healthChecker *holdhealth.Checker) (*template.Template, *mux.Router) {
324349 // Check if UI is enabled
325350 uiEnabled := os.Getenv("ATCR_UI_ENABLED")
326351 if uiEnabled == "false" {
···356381357382 router.Handle("/api/recent-pushes", middleware.OptionalAuth(sessionStore, database)(
358383 &uihandlers.RecentPushesHandler{
359359- DB: readOnlyDB,
360360- Templates: templates,
361361- RegistryURL: uihandlers.TrimRegistryURL(baseURL),
384384+ DB: readOnlyDB,
385385+ Templates: templates,
386386+ RegistryURL: uihandlers.TrimRegistryURL(baseURL),
387387+ HealthChecker: healthChecker,
362388 },
363389 )).Methods("GET")
364390···428454 },
429455 )).Methods("GET")
430456457457+ // Manifest health check API endpoint (HTMX polling)
458458+ router.Handle("/api/manifest-health", &uihandlers.ManifestHealthHandler{
459459+ HealthChecker: healthChecker,
460460+ }).Methods("GET")
461461+431462 router.Handle("/u/{handle}", middleware.OptionalAuth(sessionStore, database)(
432463 &uihandlers.UserPageHandler{
433464 DB: readOnlyDB,
···438469439470 router.Handle("/r/{handle}/{repository}", middleware.OptionalAuth(sessionStore, database)(
440471 &uihandlers.RepositoryPageHandler{
441441- DB: readOnlyDB,
442442- Templates: templates,
443443- RegistryURL: uihandlers.TrimRegistryURL(baseURL),
444444- Directory: oauthApp.Directory(),
445445- Refresher: refresher,
472472+ DB: readOnlyDB,
473473+ Templates: templates,
474474+ RegistryURL: uihandlers.TrimRegistryURL(baseURL),
475475+ Directory: oauthApp.Directory(),
476476+ Refresher: refresher,
477477+ HealthChecker: healthChecker,
446478 },
447479 )).Methods("GET")
448480
+212-362
docs/BYOS.md
···2233## Overview
4455-ATCR supports "Bring Your Own Storage" (BYOS) for blob storage. This allows users to:
66-- Deploy their own storage service backed by S3/Storj/Minio/filesystem
77-- Control who can use their storage (public or private)
88-- Keep blob data in their own infrastructure while manifests remain in their ATProto PDS
55+ATCR supports "Bring Your Own Storage" (BYOS) for blob storage. Users can:
66+- Deploy their own hold service with embedded PDS
77+- Control access via crew membership in the hold's PDS
88+- Keep blob data in their own S3/Storj/Minio while manifests stay in their own ATProto PDS
991010## Architecture
11111212```
1313-┌─────────────────────────────────────────────┐
1414-│ ATCR AppView (API) │
1515-│ - Manifests → ATProto PDS │
1616-│ - Auth & token validation │
1717-│ - Blob routing (issues redirects) │
1818-│ - Profile management │
1919-└─────────────────┬───────────────────────────┘
2020- │
2121- │ Hold discovery priority:
2222- │ 1. io.atcr.sailor.profile.defaultHold
2323- │ 2. io.atcr.hold records
2424- │ 3. AppView default_storage_endpoint
2525- ▼
2626-┌─────────────────────────────────────────────┐
2727-│ User's PDS │
2828-│ - io.atcr.sailor.profile (hold preference) │
2929-│ - io.atcr.hold records (own holds) │
3030-│ - io.atcr.manifest records (with holdEP) │
3131-└─────────────────┬───────────────────────────┘
3232- │
3333- │ Redirects to hold
3434- ▼
3535-┌─────────────────────────────────────────────┐
3636-│ Storage Service (Hold) │
3737-│ - Blob storage (S3/Storj/Minio/filesystem) │
3838-│ - Presigned URL generation │
3939-│ - Authorization (DID-based) │
4040-└─────────────────────────────────────────────┘
4141-```
4242-4343-## ATProto Records
4444-4545-### io.atcr.sailor.profile
4646-4747-**NEW:** User profile for hold selection preferences. Created automatically on first authentication.
4848-4949-```json
5050-{
5151- "$type": "io.atcr.sailor.profile",
5252- "defaultHold": "https://team-hold.example.com",
5353- "createdAt": "2025-10-02T12:00:00Z",
5454- "updatedAt": "2025-10-02T12:00:00Z"
5555-}
1313+┌──────────────────────────────────────────┐
1414+│ ATCR AppView (API) │
1515+│ - Manifests → User's PDS │
1616+│ - Auth & service token management │
1717+│ - Blob routing via XRPC │
1818+│ - Profile management │
1919+└────────────┬─────────────────────────────┘
2020+ │
2121+ │ Hold discovery priority:
2222+ │ 1. io.atcr.sailor.profile.defaultHold (DID)
2323+ │ 2. io.atcr.hold records (legacy)
2424+ │ 3. AppView default_hold_did
2525+ ▼
2626+┌──────────────────────────────────────────┐
2727+│ User's PDS │
2828+│ - io.atcr.sailor.profile (hold DID) │
2929+│ - io.atcr.manifest (with holdDid) │
3030+└────────────┬─────────────────────────────┘
3131+ │
3232+ │ Service token from user's PDS
3333+ ▼
3434+┌──────────────────────────────────────────┐
3535+│ Hold Service (did:web:hold.example.com) │
3636+│ ├── Embedded PDS │
3737+│ │ ├── Captain record (ownership) │
3838+│ │ └── Crew records (access control) │
3939+│ ├── XRPC multipart upload endpoints │
4040+│ └── Storage driver (S3/Storj/etc.) │
4141+└──────────────────────────────────────────┘
5642```
57435858-**Record key:** Always `"self"` (only one profile per user)
5959-6060-**Behavior:**
6161-- Created automatically when user first authenticates (OAuth or Basic Auth)
6262-- If AppView has `default_storage_endpoint`, profile gets that as initial `defaultHold`
6363-- User can update to join shared holds or use their own hold
6464-- Set `defaultHold` to `null` to opt out of defaults (use own hold or AppView default)
6565-6666-**This solves the multi-hold problem:** Users who are crew members of multiple holds can explicitly choose which one to use via their profile.
4444+## Hold Service Components
67456868-### io.atcr.hold
4646+Each hold is a full ATProto actor with:
4747+- **DID**: `did:web:hold.example.com` (hold's identity)
4848+- **Embedded PDS**: Stores captain + crew records (shared data)
4949+- **Storage backend**: S3, Storj, Minio, filesystem, etc.
5050+- **XRPC endpoints**: Standard ATProto + custom OCI multipart upload
69517070-Users create a hold record in their PDS to configure their own storage:
5252+### Records in Hold's PDS
71535454+**Captain record** (`io.atcr.hold.captain/self`):
7255```json
7356{
7474- "$type": "io.atcr.hold",
7575- "endpoint": "https://alice-storage.example.com",
5757+ "$type": "io.atcr.hold.captain",
7658 "owner": "did:plc:alice123",
7759 "public": false,
7878- "createdAt": "2025-10-01T12:00:00Z"
6060+ "deployedAt": "2025-10-14T...",
6161+ "region": "iad",
6262+ "provider": "fly.io"
7963}
8064```
81658282-### io.atcr.hold.crew
8383-8484-Hold owners can add crew members (for shared storage):
8585-6666+**Crew records** (`io.atcr.hold.crew/{rkey}`):
8667```json
8768{
8869 "$type": "io.atcr.hold.crew",
8989- "hold": "at://did:plc:alice/io.atcr.hold/my-storage",
9070 "member": "did:plc:bob456",
9191- "role": "write",
9292- "addedAt": "2025-10-01T12:00:00Z"
7171+ "role": "admin",
7272+ "permissions": ["blob:read", "blob:write"],
7373+ "addedAt": "2025-10-14T..."
9374}
9475```
95769696-**Note:** Crew records are stored in the **hold owner's PDS**, not the crew member's PDS. This ensures the hold owner maintains full control over access.
7777+### Sailor Profile (User's PDS)
97789898-## Storage Service
7979+Users set their preferred hold in their sailor profile:
9980100100-### Deployment
8181+```json
8282+{
8383+ "$type": "io.atcr.sailor.profile",
8484+ "defaultHold": "did:web:hold.example.com",
8585+ "createdAt": "2025-10-02T...",
8686+ "updatedAt": "2025-10-02T..."
8787+}
8888+```
10189102102-The storage service is a lightweight HTTP server that:
103103-1. Accepts presigned URL requests
104104-2. Verifies DID authorization
105105-3. Generates presigned URLs for S3/Storj/etc
106106-4. Returns URLs to AppView for client redirect
9090+## Deployment
1079110892### Configuration
10993110110-The hold service is configured entirely via environment variables. See `.env.example` for all options.
111111-112112-**Required environment variables:**
9494+The hold service is configured entirely via environment variables:
1139511496```bash
115115-# Hold service public URL (REQUIRED)
116116-HOLD_PUBLIC_URL=https://storage.example.com
9797+# Hold identity (REQUIRED)
9898+HOLD_PUBLIC_URL=https://hold.example.com
9999+HOLD_OWNER=did:plc:your-did-here
117100118118-# Storage driver type
101101+# Storage backend
119102STORAGE_DRIVER=s3
120120-121121-# For S3/Minio
122103AWS_ACCESS_KEY_ID=your_access_key
123104AWS_SECRET_ACCESS_KEY=your_secret_key
124105AWS_REGION=us-east-1
125106S3_BUCKET=my-blobs
126107127127-# For Storj (optional - custom S3 endpoint)
128128-# S3_ENDPOINT=https://gateway.storjshare.io
108108+# Access control
109109+HOLD_PUBLIC=false # Private hold: reads require authentication and crew membership
110110+HOLD_ALLOW_ALL_CREW=false # Only explicit crew members can write
129111130130-# For filesystem storage
131131-# STORAGE_DRIVER=filesystem
132132-# STORAGE_ROOT_DIR=/var/lib/atcr-storage
112112+# Embedded PDS
113113+HOLD_DATABASE_PATH=/var/lib/atcr-hold/hold.db
114114+HOLD_DATABASE_KEY_PATH=/var/lib/atcr-hold/keys
133115```
134116135135-**Authorization:**
136136-137137-ATCR follows ATProto's public-by-default model with gated anonymous access:
138138-139139-**Read Access:**
140140-- **Public hold** (`HOLD_PUBLIC=true`): Anonymous reads allowed (no authentication)
141141-- **Private hold** (`HOLD_PUBLIC=false`): Requires authentication (any ATCR user with sailor.profile)
142142-143143-**Write Access:**
144144-- Always requires authentication
145145-- Must be hold owner OR crew member (verified via `io.atcr.hold.crew` records in owner's PDS)
146146-147147-**Key Points:**
148148-- "Private" just means "no anonymous access" - not "limited user access"
149149-- Any authenticated ATCR user can read from private holds
150150-- Crew membership only controls WRITE access, not READ access
151151-- This aligns with ATProto's public records model (no private PDS records yet)
152152-153153-### Running
117117+### Running Locally
154118155119```bash
156120# Build
157157-go build -o atcr-hold ./cmd/hold
121121+go build -o bin/atcr-hold ./cmd/hold
158122159159-# Set environment variables (or use .env file)
160160-export HOLD_PUBLIC_URL=https://storage.example.com
161161-export STORAGE_DRIVER=s3
162162-export AWS_ACCESS_KEY_ID=...
163163-export AWS_SECRET_ACCESS_KEY=...
164164-export AWS_REGION=us-east-1
165165-export S3_BUCKET=my-blobs
123123+# Run (with env vars or .env file)
124124+export HOLD_PUBLIC_URL=http://localhost:8080
125125+export HOLD_OWNER=did:plc:your-did-here
126126+export STORAGE_DRIVER=filesystem
127127+export STORAGE_ROOT_DIR=/tmp/atcr-hold
128128+export HOLD_DATABASE_PATH=/tmp/atcr-hold/hold.db
166129167167-# Run
168168-./atcr-hold
130130+./bin/atcr-hold
169131```
170132171171-**Registration (required):**
172172-173173-The hold service must be registered in a PDS to be discoverable by the AppView.
174174-175175-**Standard registration workflow:**
176176-177177-1. Set `HOLD_OWNER` to your DID:
178178- ```bash
179179- export HOLD_OWNER=did:plc:your-did-here
180180- ```
181181-182182-2. Start the hold service:
183183- ```bash
184184- ./atcr-hold
185185- ```
186186-187187-3. **Check the logs** for the OAuth authorization URL:
188188- ```
189189- ================================================================================
190190- OAUTH AUTHORIZATION REQUIRED
191191- ================================================================================
192192-193193- Please visit this URL to authorize the hold service:
194194-195195- https://bsky.app/authorize?client_id=...
196196-197197- Waiting for authorization...
198198- ================================================================================
199199- ```
200200-201201-4. Visit the URL in your browser and authorize
202202-203203-5. The hold service will:
204204- - Exchange the authorization code for a token
205205- - Create `io.atcr.hold` record in your PDS
206206- - Create `io.atcr.hold.crew` record (making you the owner)
207207- - Save registration state
208208-209209-6. On subsequent runs, the service checks if already registered and skips OAuth
210210-211211-**Alternative methods:**
212212-213213-- **Manual API registration**: Call `POST /register` with your own OAuth token
214214-- **Completely manual**: Create PDS records yourself using any ATProto client
133133+On first run, the hold service creates:
134134+- Captain record in embedded PDS (making you the owner)
135135+- Crew record for owner with all permissions
136136+- DID document at `/.well-known/did.json`
215137216138### Deploy to Fly.io
217139···223145224146[env]
225147 HOLD_PUBLIC_URL = "https://my-atcr-hold.fly.dev"
226226- HOLD_SERVER_ADDR = ":8080"
227148 STORAGE_DRIVER = "s3"
228149 AWS_REGION = "us-east-1"
229150 S3_BUCKET = "my-blobs"
230151 HOLD_PUBLIC = "false"
152152+ HOLD_ALLOW_ALL_CREW = "false"
231153232154[http_service]
233155 internal_port = 8080
···250172fly secrets set AWS_ACCESS_KEY_ID=...
251173fly secrets set AWS_SECRET_ACCESS_KEY=...
252174fly secrets set HOLD_OWNER=did:plc:your-did-here
253253-254254-# Check logs for OAuth URL on first run
255255-fly logs
256256-257257-# Visit the OAuth URL shown in logs to authorize
258258-# The hold service will register itself in your PDS
259175```
260176261177## Request Flow
262178263179### Push with BYOS
264180265265-1. **Docker push** `atcr.io/alice/myapp:latest`
266266-2. **AppView** resolves `alice` → `did:plc:alice123`
267267-3. **AppView** discovers hold via priority logic:
268268- - Check alice's `io.atcr.sailor.profile` for `defaultHold`
269269- - If not set, check alice's `io.atcr.hold` records
270270- - Fall back to AppView's `default_storage_endpoint`
271271-4. **Found:** `alice.profile.defaultHold = "https://team-hold.example.com"`
272272-5. **AppView** → team-hold: POST `/put-presigned-url`
273273- ```json
274274- {
275275- "did": "did:plc:alice123",
276276- "digest": "sha256:abc123...",
277277- "size": 1048576
278278- }
279279- ```
280280-6. **Hold service**:
281281- - Verifies alice is authorized (checks crew records)
282282- - Generates S3 presigned upload URL (15min expiry)
283283- - Returns: `{"url": "https://s3.../blob?signature=..."}`
284284-7. **AppView** → Docker: `307 Redirect` to presigned URL
285285-8. **Docker** → S3: PUT blob directly (no proxy)
286286-9. **Manifest** stored in alice's PDS with `holdEndpoint: "https://team-hold.example.com"`
181181+```
182182+1. Client: docker push atcr.io/alice/myapp:latest
287183288288-### Pull with BYOS
184184+2. AppView resolves alice → did:plc:alice123
289185290290-1. **Docker pull** `atcr.io/alice/myapp:latest`
291291-2. **AppView** fetches manifest from alice's PDS
292292-3. **Manifest** contains `holdEndpoint: "https://team-hold.example.com"`
293293-4. **AppView** caches: `(alice's DID, "myapp") → "https://team-hold.example.com"` (10min TTL)
294294-5. **Docker** requests blobs: GET `/v2/alice/myapp/blobs/sha256:abc123`
295295-6. **AppView** uses **cached hold from manifest** (not re-discovered)
296296-7. **AppView** → team-hold: POST `/get-presigned-url`
297297-8. **Hold service** returns presigned download URL
298298-9. **AppView** → Docker: `307 Redirect`
299299-10. **Docker** → S3: GET blob directly
186186+3. AppView discovers hold DID:
187187+ - Check alice's sailor profile for defaultHold
188188+ - Returns: "did:web:alice-storage.fly.dev"
300189301301-**Key insight:** Pull uses the historical `holdEndpoint` from the manifest, ensuring blobs are fetched from where they were originally pushed, even if alice later changes her profile's `defaultHold`.
190190+4. AppView gets service token from alice's PDS:
191191+ GET /xrpc/com.atproto.server.getServiceAuth?aud=did:web:alice-storage.fly.dev
192192+ Response: { "token": "eyJ..." }
302193303303-## Default Registry
194194+5. AppView initiates multipart upload to hold:
195195+ POST https://alice-storage.fly.dev/xrpc/io.atcr.hold.initiateUpload
196196+ Authorization: Bearer {serviceToken}
197197+ Body: { "digest": "sha256:abc..." }
198198+ Response: { "uploadId": "xyz" }
304199305305-The AppView can run its own storage service as the default:
200200+6. For each part:
201201+ - AppView: POST /xrpc/io.atcr.hold.getPartUploadUrl
202202+ - Hold validates service token, checks crew membership
203203+ - Hold returns: { "url": "https://s3.../presigned" }
204204+ - AppView (ProxyBlobStore) uploads the part directly to the S3 presigned URL
306205307307-### AppView config
206206+7. AppView completes upload:
207207+ POST /xrpc/io.atcr.hold.completeUpload
208208+ Body: { "uploadId": "xyz", "digest": "sha256:abc...", "parts": [...] }
308209309309-```yaml
310310-middleware:
311311- - name: registry
312312- options:
313313- atproto-resolver:
314314- default_storage_endpoint: https://storage.atcr.io
210210+8. Manifest stored in alice's PDS:
211211+ - holdDid: "did:web:alice-storage.fly.dev"
212212+ - holdEndpoint: "https://alice-storage.fly.dev" (backward compat)
315213```
316214317317-### Default hold service config
318318-319319-```bash
320320-# Accept any authenticated DID
321321-HOLD_PUBLIC=false # Requires authentication
215215+### Pull with BYOS
322216323323-# Or allow public reads
324324-HOLD_PUBLIC=true # Public reads, auth required for writes
325217```
218218+1. Client: docker pull atcr.io/alice/myapp:latest
326219327327-This provides free-tier shared storage for users who don't want to deploy their own.
220220+2. AppView fetches manifest from alice's PDS
328221329329-## Storage Drivers Supported
222222+3. Manifest contains:
223223+ - holdDid: "did:web:alice-storage.fly.dev"
330224331331-The storage service uses distribution's storage drivers:
225225+4. AppView caches hold DID for 10 minutes (covers pull operation)
332226333333-- **S3** - AWS S3, Minio, Storj (via S3 gateway)
334334-- **Filesystem** - Local disk (for testing)
335335-- **Azure** - Azure Blob Storage
336336-- **GCS** - Google Cloud Storage
337337-- **Swift** - OpenStack Swift
338338-- **OSS** - Alibaba Cloud OSS
227227+5. Client requests blob: GET /v2/alice/myapp/blobs/sha256:abc123
339228340340-## Quotas
229229+6. AppView uses cached hold DID from manifest
341230342342-Quotas are NOT implemented in the storage service. Instead, use:
231231+7. AppView gets service token from alice's PDS
343232344344-- **S3**: Bucket policies, lifecycle rules
345345-- **Storj**: Project limits in Storj dashboard
346346-- **Minio**: Quota enforcement features
347347-- **Filesystem**: Disk quotas at OS level
233233+8. AppView calls hold XRPC:
234234+ GET /xrpc/com.atproto.sync.getBlob?did={userDID}&cid=sha256:abc123
235235+ Authorization: Bearer {serviceToken}
236236+ Response: { "url": "https://s3.../presigned-download" }
348237349349-## Security
238238+9. AppView redirects client to presigned S3 URL
350239351351-### Authorization
240240+10. Client downloads directly from S3
241241+```
352242353353-Authorization is based on ATProto's public-by-default model:
243243+**Key insight:** Pull uses the `holdDid` stored in the manifest, ensuring blobs are fetched from where they were originally pushed.
354244355355-**Read Authorization:**
356356-- **Public hold** (`public: true` in hold record):
357357- - Anonymous users: ✅ Allowed
358358- - Any authenticated user: ✅ Allowed
245245+## Access Control
359246360360-- **Private hold** (`public: false` in hold record):
361361- - Anonymous users: ❌ 401 Unauthorized
362362- - Any authenticated ATCR user: ✅ Allowed (no crew membership required)
247247+### Read Access
363248364364-**Write Authorization:**
365365-- Anonymous users: ❌ 401 Unauthorized
366366-- Authenticated non-crew: ❌ 403 Forbidden
367367-- Authenticated crew member: ✅ Allowed
368368-- Hold owner: ✅ Allowed
249249+- **Public hold** (`HOLD_PUBLIC=true`): Anonymous + authenticated users
250250+- **Private hold** (`HOLD_PUBLIC=false`): Authenticated users with crew membership
369251370370-**Implementation:**
371371-- Hold service queries owner's PDS for `io.atcr.hold.crew` records
372372-- Crew records are public ATProto records (read without authentication)
373373-- "Private" holds only gate anonymous access, not authenticated user access
374374-- This reflects ATProto's current limitation: no private PDS records
252252+### Write Access
375253376376-### Presigned URLs
254254+- Hold owner (captain) OR crew members only
255255+- Verified via `io.atcr.hold.crew` records in hold's embedded PDS
256256+- Service token proves user identity (from user's PDS)
377257378378-- 15 minute expiry
379379-- Client uploads/downloads directly to storage
380380-- No data flows through AppView or hold service
258258+### Authorization Flow
381259382382-### Private Holds
260260+```
261261+1. AppView gets service token from user's PDS
262262+2. AppView sends request to hold with service token
263263+3. Hold validates service token (checks it's from user's PDS)
264264+4. Hold extracts user's DID from token
265265+5. Hold checks crew records in its embedded PDS
266266+6. If crew member found → allow, else → deny
267267+```
383268384384-"Private" holds gate anonymous access while remaining accessible to authenticated users:
269269+## Managing Crew Members
385270386386-**What "Private" Means:**
387387-- `HOLD_PUBLIC=false` prevents anonymous reads
388388-- Any authenticated ATCR user can still read
389389-- This aligns with ATProto's public records model
271271+### Add Crew Member
390272391391-**Write Control:**
392392-- Only hold owner and crew members can write
393393-- Crew membership managed via `io.atcr.hold.crew` records in owner's PDS
394394-- Removing crew member immediately revokes write access
273273+Use an ATProto client to create a crew record in the hold's PDS:
395274396396-**Future: True Private Access**
397397-- When ATProto adds private PDS records, ATCR can support truly private repos
398398-- For now, "private" = "authenticated-only access"
275275+```bash
276276+# Via XRPC (if hold supports it)
277277+POST https://hold.example.com/xrpc/io.atcr.hold.requestCrew
278278+Authorization: Bearer {userOAuthToken}
399279400400-## Example: Personal Storage
280280+# Or manually via captain's OAuth to hold's PDS
281281+atproto put-record \
282282+ --pds https://hold.example.com \
283283+ --collection io.atcr.hold.crew \
284284+ --rkey "{memberDID}" \
285285+ --value '{
286286+ "$type": "io.atcr.hold.crew",
287287+ "member": "did:plc:bob456",
288288+ "role": "admin",
289289+ "permissions": ["blob:read", "blob:write"]
290290+ }'
291291+```
401292402402-Alice wants to use her own Storj account:
293293+### Remove Crew Member
403294404404-1. **Set environment variables**:
405405- ```bash
406406- export HOLD_PUBLIC_URL=https://alice-storage.fly.dev
407407- export HOLD_OWNER=did:plc:alice123
408408- export STORAGE_DRIVER=s3
409409- export AWS_ACCESS_KEY_ID=your_storj_access_key
410410- export AWS_SECRET_ACCESS_KEY=your_storj_secret_key
411411- export S3_ENDPOINT=https://gateway.storjshare.io
412412- export S3_BUCKET=alice-blobs
413413- ```
295295+```bash
296296+atproto delete-record \
297297+ --pds https://hold.example.com \
298298+ --collection io.atcr.hold.crew \
299299+ --rkey "{memberDID}"
300300+```
414301415415-2. **Deploy hold service** to Fly.io - auto-registration creates hold + crew record
302302+## Storage Drivers
416303417417-3. **Push images** - AppView automatically routes to her storage
304304+Hold service supports all distribution storage drivers:
305305+- **S3** - AWS S3, Minio, Storj (via S3 gateway)
306306+- **Filesystem** - Local disk (for testing)
307307+- **Azure** - Azure Blob Storage
308308+- **GCS** - Google Cloud Storage
309309+- **Swift** - OpenStack Swift
418310419311## Example: Team Hold
420312421421-A company wants shared storage for their team:
313313+```bash
314314+# 1. Deploy hold service
315315+export HOLD_PUBLIC_URL=https://team-hold.fly.dev
316316+export HOLD_OWNER=did:plc:admin
317317+export HOLD_PUBLIC=false # Private
318318+export STORAGE_DRIVER=s3
319319+export AWS_ACCESS_KEY_ID=...
320320+export S3_BUCKET=team-blobs
422321423423-1. **Deploy hold service** with S3 credentials and auto-registration:
424424- ```bash
425425- export HOLD_PUBLIC_URL=https://company-hold.fly.dev
426426- export HOLD_OWNER=did:plc:admin
427427- export HOLD_PUBLIC=false
428428- export STORAGE_DRIVER=s3
429429- export AWS_ACCESS_KEY_ID=...
430430- export AWS_SECRET_ACCESS_KEY=...
431431- export S3_BUCKET=company-blobs
432432- ```
322322+fly deploy
433323434434-2. **Hold service auto-registers** on first run, creating:
435435- - Hold record in admin's PDS
436436- - Crew record making admin the owner
324324+# 2. Hold auto-creates captain + crew records on first run
437325438438-3. **Admin adds crew members** via ATProto client or manually:
439439- ```bash
440440- # Using atproto client
441441- atproto put-record \
442442- --collection io.atcr.hold.crew \
443443- --rkey "company-did:plc:engineer1" \
444444- --value '{
445445- "$type": "io.atcr.hold.crew",
446446- "hold": "at://did:plc:admin/io.atcr.hold/company",
447447- "member": "did:plc:engineer1",
448448- "role": "write"
449449- }'
450450- ```
326326+# 3. Admin adds team members via hold's PDS (requires OAuth)
327327+# (TODO: Implement crew management UI/CLI)
451328452452-4. **Team members set their profile** to use the shared hold:
453453- ```bash
454454- # Engineer updates their sailor profile
455455- atproto put-record \
456456- --collection io.atcr.sailor.profile \
457457- --rkey "self" \
458458- --value '{
459459- "$type": "io.atcr.sailor.profile",
460460- "defaultHold": "https://company-hold.fly.dev"
461461- }'
462462- ```
329329+# 4. Team members set their sailor profile:
330330+atproto put-record \
331331+ --collection io.atcr.sailor.profile \
332332+ --rkey "self" \
333333+ --value '{
334334+ "$type": "io.atcr.sailor.profile",
335335+ "defaultHold": "did:web:team-hold.fly.dev"
336336+ }'
463337464464-5. **Hold service queries PDS** for crew records to authorize writes
465465-6. **Engineers push/pull** using `atcr.io/engineer1/myapp` - blobs go to company hold
338338+# 5. Team members can now push/pull using team hold
339339+```
466340467341## Limitations
468342469469-1. **No resume/partial uploads** - Storage service doesn't track upload state
470470-2. **No advanced features** - Just basic put/get, no deduplication logic
471471-3. **In-memory cache** - Hold endpoint cache is in-memory (for production, use Redis)
472472-4. **Manual profile updates** - No UI for updating sailor profile (must use ATProto client)
343343+### Current IAM Challenges
473344474474-## Performance Optimization: S3 Presigned URLs
345345+See [EMBEDDED_PDS.md](./EMBEDDED_PDS.md#iam-challenges) for detailed discussion.
475346476476-**Status:** Planned implementation (see [PRESIGNED_URLS.md](./PRESIGNED_URLS.md))
347347+**Known issues:**
348348+1. **RPC permission format**: Service tokens don't work with IP-based DIDs in local dev
349349+2. **Dynamic hold discovery**: AppView can't dynamically complete an OAuth flow with arbitrary holds discovered from sailor profiles
350350+3. **Manual profile management**: No UI for updating sailor profile (must use ATProto client)
477351478478-Currently, hold services act as proxies for blob data. With presigned URLs:
479479-480480-- **Downloads:** Docker → S3 direct (via 307 redirect)
481481-- **Uploads:** Docker → AppView → S3 (via presigned URL)
482482-- **Hold service bandwidth:** Reduced by 99.98% (only orchestration)
483483-484484-**Benefits:**
485485-- Hold services can run on minimal infrastructure ($5/month instances)
486486-- Direct S3 transfers at maximum speed
487487-- Scales to arbitrarily large images
488488-- Works with Storj, MinIO, Backblaze B2, Cloudflare R2
489489-490490-See [PRESIGNED_URLS.md](./PRESIGNED_URLS.md) for complete technical details and implementation guide.
352352+**Workaround:** Use hostname-based DIDs (`did:web:hold.example.com`) and public holds for now.
491353492354## Future Improvements
493355494494-1. **S3 Presigned URLs** - Implement direct S3 URLs (see [PRESIGNED_URLS.md](./PRESIGNED_URLS.md))
495495-2. **Automatic failover** - Multiple storage endpoints, fallback to default
496496-3. **Storage analytics** - Track usage per DID
497497-4. **Quota integration** - Optional quota tracking in storage service
498498-5. **Profile management UI** - Web interface for users to manage their sailor profile
499499-6. **Distributed cache** - Redis/Memcached for hold endpoint cache in multi-instance deployments
500500-501501-## Comparison to Default Storage
502502-503503-| Feature | Default (Shared S3) | BYOS |
504504-|---------|---------------------|------|
505505-| Setup | None required | Deploy storage service |
506506-| Cost | Free (with quota) | User pays for S3/Storj |
507507-| Control | Limited | Full control |
508508-| Performance | Shared | Dedicated |
509509-| Quotas | Enforced by AppView | User managed |
510510-| Privacy | Blobs in shared bucket | Blobs in user's bucket |
356356+1. **Crew management UI** - Web interface for adding/removing crew members
357357+2. **Dynamic OAuth** - Support for arbitrary BYOS holds without pre-configuration
358358+3. **Hold migration** - Tools for moving blobs between holds
359359+4. **Storage analytics** - Track usage per user/repository
360360+5. **Distributed cache** - Redis for hold DID cache in multi-instance deployments
511361512362## References
513363364364+- [EMBEDDED_PDS.md](./EMBEDDED_PDS.md) - Embedded PDS architecture and IAM details
514365- [ATProto Lexicon Spec](https://atproto.com/specs/lexicon)
515366- [Distribution Storage Drivers](https://distribution.github.io/distribution/storage-drivers/)
516367- [S3 Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/PresignedUrlUploadObject.html)
517517-- [Storj Documentation](https://docs.storj.io/)
+142-1123
docs/CREW_ACCESS_CONTROL.md
···2233## Overview
4455-ATCR uses a crew-based access control system for hold (storage) services. Hold owners can grant write access to other users by creating crew records in their PDS. This document describes the scalable access control system that supports:
66-77-- **Individual access** - Explicit DID-based crew membership
88-- **Wildcard access** - Allow all authenticated users
99-- **Pattern-based access** - Match users by handle patterns (e.g., `*.example.com`)
1010-- **Access revocation** - Bar (ban) specific users or patterns
1111-1212-## Problem Statement
1313-1414-The original crew system required one `io.atcr.hold.crew` record per user. This doesn't scale for:
1515-1616-1. **Public/shared holds** - Thousands of users would need individual crew records
1717-2. **Community holds** - PDS operators want to allow all their users
1818-3. **Default registries** - AppView operators want to allow all authenticated users
1919-4. **Access revocation** - No way to selectively remove access from wildcard/pattern grants
2020-2121-## Design Goals
2222-2323-1. **Preserve ATProto semantics** - Keep `member` as DID type for backlinks
2424-2. **Scalable** - Support thousands of users with minimal records
2525-3. **Flexible patterns** - Support wildcards, handle globs, future regex
2626-4. **Clear semantics** - Separate allow/deny (crew vs barred)
2727-5. **Backward compatible** - Existing crew records work unchanged
2828-6. **Performance** - Minimize PDS queries, enable caching
2929-3030-## Record Schemas
55+ATCR uses crew-based access control for hold (storage) services. Crew records are stored in the **hold's embedded PDS** (not the owner's or user's PDS), making the hold a self-contained ATProto actor with its own access control.
3163232-### io.atcr.hold.crew (Updated)
77+## Current Implementation
3383434-Crew membership grants write access to a hold. Stored in the **hold owner's PDS**.
99+### Records in Hold's PDS
35101111+**Captain record** - Hold ownership (single record at `io.atcr.hold.captain/self`):
3612```json
3713{
3838- "$type": "io.atcr.hold.crew",
3939- "hold": "at://did:plc:owner/io.atcr.hold/shared",
4040- "member": "did:plc:alice123", // Optional: Explicit DID (for backlinks)
4141- "memberPattern": "*.bsky.social", // Optional: Pattern matching
4242- "role": "write",
4343- "createdAt": "2025-10-13T12:00:00Z"
1414+ "$type": "io.atcr.hold.captain",
1515+ "owner": "did:plc:alice123",
1616+ "public": false,
1717+ "deployedAt": "2025-10-14T...",
1818+ "region": "iad",
1919+ "provider": "fly.io"
4420}
4521```
46224747-**Fields:**
4848-4949-- `hold` (string, at-uri, required) - AT-URI of the hold record
5050-- `member` (string, did, optional) - Explicit DID for individual access (enables backlinks)
5151-- `memberPattern` (string, optional) - Pattern for matching multiple users
5252-- `role` (string, required) - Role: `"owner"` or `"write"`
5353-- `expiresAt` (string, datetime, optional) - Optional expiration
5454-- `createdAt` (string, datetime, required) - Creation timestamp
5555-5656-**Validation:** Exactly one of `member` or `memberPattern` must be set.
5757-5858-**Pattern syntax:**
5959-6060-- `"*"` - Matches all authenticated users
6161-- `"*.domain.com"` - Matches handles ending with `.domain.com`
6262-- `"subdomain.*"` - Matches handles starting with `subdomain.`
6363-- `"*.bsky.*"` - Matches handles containing `.bsky.`
6464-6565-**Examples:**
6666-2323+**Crew records** - Access control (one per member at `io.atcr.hold.crew/{rkey}`):
6724```json
6868-// Explicit DID (current behavior, preserved)
6925{
7026 "$type": "io.atcr.hold.crew",
7171- "hold": "at://did:plc:owner/io.atcr.hold/team",
7272- "member": "did:plc:alice123",
7373- "role": "write",
7474- "createdAt": "2025-10-13T12:00:00Z"
7575-}
7676-7777-// Allow all authenticated users (public hold)
7878-{
7979- "$type": "io.atcr.hold.crew",
8080- "hold": "at://did:plc:owner/io.atcr.hold/shared",
8181- "memberPattern": "*",
8282- "role": "write",
8383- "createdAt": "2025-10-13T12:00:00Z"
8484-}
8585-8686-// Allow all users from a community
8787-{
8888- "$type": "io.atcr.hold.crew",
8989- "hold": "at://did:plc:owner/io.atcr.hold/community",
9090- "memberPattern": "*.my-community.social",
9191- "role": "write",
9292- "createdAt": "2025-10-13T12:00:00Z"
9393-}
9494-9595-// Allow specific subdomain
9696-{
9797- "$type": "io.atcr.hold.crew",
9898- "hold": "at://did:plc:owner/io.atcr.hold/corp",
9999- "memberPattern": "*.eng.company.com",
100100- "role": "write",
101101- "createdAt": "2025-10-13T12:00:00Z"
2727+ "member": "did:plc:bob456",
2828+ "role": "admin",
2929+ "permissions": ["blob:read", "blob:write"],
3030+ "addedAt": "2025-10-14T..."
10231}
10332```
10433105105-### io.atcr.hold.crew.barred (New)
3434+### Authorization Logic
10635107107-Barred list revokes access for specific users or patterns. Overrides crew membership. Stored in the **hold owner's PDS**.
3636+Write authorization follows this priority:
10837109109-```json
110110-{
111111- "$type": "io.atcr.hold.crew.barred",
112112- "hold": "at://did:plc:owner/io.atcr.hold/shared",
113113- "member": "did:plc:spammer", // Optional: Explicit DID
114114- "memberPattern": "*.spam-instance.com", // Optional: Pattern matching
115115- "reason": "spam/abuse/policy violation",
116116- "barredAt": "2025-10-13T12:00:00Z"
117117-}
11838```
119119-120120-**Fields:**
121121-122122-- `hold` (string, at-uri, required) - AT-URI of the hold record
123123-- `member` (string, did, optional) - Explicit DID to bar
124124-- `memberPattern` (string, optional) - Pattern for barring multiple users
125125-- `reason` (string, optional) - Human-readable reason for access revocation
126126-- `barredAt` (string, datetime, required) - When user was barred
127127-128128-**Validation:** Exactly one of `member` or `memberPattern` must be set.
129129-130130-**Pattern syntax:** Same as crew patterns (wildcards, handle globs).
131131-132132-**Limitations:** Handle-based barring can be circumvented by users changing their handle or acquiring a new domain. However, this requires significant effort (purchasing domains, changing identity), making it an acceptable deterrent for most abuse cases. DID-based barring is permanent (until user creates new DID).
133133-134134-**Examples:**
135135-136136-```json
137137-// Bar specific user
138138-{
139139- "$type": "io.atcr.hold.crew.barred",
140140- "hold": "at://did:plc:owner/io.atcr.hold/shared",
141141- "member": "did:plc:badactor",
142142- "reason": "Terms of service violation",
143143- "barredAt": "2025-10-13T12:00:00Z"
144144-}
145145-146146-// Bar all users from a spam PDS
147147-{
148148- "$type": "io.atcr.hold.crew.barred",
149149- "hold": "at://did:plc:owner/io.atcr.hold/shared",
150150- "memberPattern": "*.spam-pds.com",
151151- "reason": "Spam instance",
152152- "barredAt": "2025-10-13T14:30:00Z"
153153-}
154154-155155-// Bar pattern of suspicious accounts
156156-{
157157- "$type": "io.atcr.hold.crew.barred",
158158- "hold": "at://did:plc:owner/io.atcr.hold/shared",
159159- "memberPattern": "bot*",
160160- "reason": "Automated account abuse",
161161- "barredAt": "2025-10-13T15:00:00Z"
162162-}
3939+isAuthorizedWrite(userDID):
4040+ 1. If userDID == captain.owner → ALLOW
4141+ 2. If crew record exists for userDID → ALLOW
4242+ 3. Default → DENY
16343```
16444165165-## Authorization Logic
166166-167167-Write authorization follows this priority order:
168168-169169-```
170170-isAuthorizedWrite(did, handle):
171171- 1. If DID is hold owner → ALLOW
172172- 2. If DID or handle matches barred list → DENY
173173- 3. If DID explicitly in crew list → ALLOW
174174- 4. If handle matches crew pattern → ALLOW
175175- 5. Default → DENY
176176-```
177177-178178-**Detailed algorithm:**
179179-180180-```go
181181-func (s *HoldService) isAuthorizedWrite(did string) bool {
182182- // 1. Check if owner
183183- if did == s.config.Registration.OwnerDID {
184184- return true // Owner always has access
185185- }
186186-187187- // 2. Resolve handle from DID
188188- handle, err := resolveHandle(did)
189189- if err != nil {
190190- log.Printf("Failed to resolve handle for DID %s: %v", did, err)
191191- handle = "" // Continue without handle matching
192192- }
193193-194194- // 3. Check barred list (explicit deny overrides everything)
195195- barred, err := s.isBarred(did, handle)
196196- if err != nil {
197197- log.Printf("Error checking barred status: %v", err)
198198- return false // Fail secure
199199- }
200200- if barred {
201201- return false // Explicitly barred
202202- }
203203-204204- // 4. Check crew list (explicit allow)
205205- crew, err := s.isCrewMember(did, handle)
206206- if err != nil {
207207- log.Printf("Error checking crew status: %v", err)
208208- return false // Fail secure
209209- }
210210-211211- return crew // Allow if crew member, deny otherwise
212212-}
213213-214214-func (s *HoldService) isBarred(did, handle string) (bool, error) {
215215- records := listBarredRecords()
216216-217217- for _, record := range records {
218218- // Check explicit DID match
219219- if record.Member != "" && record.Member == did {
220220- return true, nil
221221- }
222222-223223- // Check pattern match (if handle available)
224224- if record.MemberPattern != "" && handle != "" {
225225- if matchPattern(record.MemberPattern, handle) {
226226- return true, nil
227227- }
228228- }
229229- }
230230-231231- return false, nil
232232-}
233233-234234-func (s *HoldService) isCrewMember(did, handle string) (bool, error) {
235235- records := listCrewRecords()
236236-237237- for _, record := range records {
238238- // Check explicit DID match
239239- if record.Member != "" && record.Member == did {
240240- return true, nil
241241- }
242242-243243- // Check pattern match (if handle available)
244244- if record.MemberPattern != "" && handle != "" {
245245- if matchPattern(record.MemberPattern, handle) {
246246- return true, nil
247247- }
248248- }
249249- }
250250-251251- return false, nil
252252-}
253253-```
254254-255255-**Pattern matching:**
256256-257257-```go
258258-func matchPattern(pattern, handle string) bool {
259259- if pattern == "*" {
260260- return true // Wildcard matches all
261261- }
262262-263263- // Convert glob pattern to regex
264264- // *.example.com → ^.*\.example\.com$
265265- // subdomain.* → ^subdomain\..*$
266266- // *.bsky.* → ^.*\.bsky\..*$
267267-268268- regex := globToRegex(pattern)
269269- matched, _ := regexp.MatchString(regex, handle)
270270- return matched
271271-}
272272-```
273273-274274-## Use Cases
4545+Read authorization depends on the `HOLD_PUBLIC` setting:
4646+- **Public hold** (`HOLD_PUBLIC=true`): Anonymous + all authenticated users can read
4747+- **Private hold** (`HOLD_PUBLIC=false`): Requires crew membership for reads
27548276276-### 1. Public Hold (Allow All Users)
4949+### Configuration
27750278278-**Goal:** Shared storage for any authenticated ATCR user.
279279-280280-**Setup:**
28151```bash
282282-# Create crew record with wildcard
283283-atproto put-record \
284284- --collection io.atcr.hold.crew \
285285- --rkey "all-users" \
286286- --value '{
287287- "$type": "io.atcr.hold.crew",
288288- "hold": "at://did:plc:owner/io.atcr.hold/public",
289289- "memberPattern": "*",
290290- "role": "write"
291291- }'
5252+# Access control environment variables
5353+HOLD_PUBLIC=false # Require authentication for reads
5454+HOLD_ALLOW_ALL_CREW=false # Only explicit crew members can write
29255```
29356294294-**Result:** All authenticated users can push. Owner can selectively bar bad actors.
5757+### Crew Management
29558296296-### 2. Community Hold (PDS-Specific)
5959+Crew records are managed by the hold captain (owner) using standard ATProto operations on the hold's embedded PDS:
29760298298-**Goal:** Storage for all users from a specific community/PDS.
299299-300300-**Setup:**
6161+**Add crew member:**
30162```bash
302302-# Allow all community members
6363+# Via hold's PDS (requires captain's OAuth)
30364atproto put-record \
6565+ --pds https://hold.example.com \
30466 --collection io.atcr.hold.crew \
305305- --rkey "community-hold" \
6767+ --rkey "{memberDID}" \
30668 --value '{
30769 "$type": "io.atcr.hold.crew",
308308- "hold": "at://did:plc:owner/io.atcr.hold/community",
309309- "memberPattern": "*.my-community.social",
310310- "role": "write"
7070+ "member": "did:plc:bob456",
7171+ "role": "admin",
7272+ "permissions": ["blob:read", "blob:write"],
7373+ "addedAt": "2025-10-14T12:00:00Z"
31174 }'
31275```
31376314314-**Result:** Anyone with a `@someone.my-community.social` handle can push.
315315-316316-### 3. Team Hold with Selective Banning
317317-318318-**Goal:** Shared team storage, but remove access from former employees.
319319-320320-**Setup:**
7777+**Remove crew member:**
32178```bash
322322-# Allow team domain
323323-atproto put-record \
7979+atproto delete-record \
8080+ --pds https://hold.example.com \
32481 --collection io.atcr.hold.crew \
325325- --rkey "team-hold" \
326326- --value '{
327327- "$type": "io.atcr.hold.crew",
328328- "hold": "at://did:plc:owner/io.atcr.hold/team",
329329- "memberPattern": "*.company.com",
330330- "role": "write"
331331- }'
332332-333333-# Bar former employee
334334-atproto put-record \
335335- --collection io.atcr.hold.crew.barred \
336336- --rkey "bar-former-employee" \
337337- --value '{
338338- "$type": "io.atcr.hold.crew.barred",
339339- "hold": "at://did:plc:owner/io.atcr.hold/team",
340340- "member": "did:plc:former-employee",
341341- "reason": "No longer with company"
342342- }'
8282+ --rkey "{memberDID}"
34383```
34484345345-**Result:** All `@*.company.com` users can push, except the explicitly barred DID.
346346-347347-### 4. Anti-Spam with Barred Patterns
348348-349349-**Goal:** Public hold with protection against known spam instances.
350350-351351-**Setup:**
8585+**List crew members:**
35286```bash
353353-# Allow all users
354354-atproto put-record \
355355- --collection io.atcr.hold.crew \
356356- --rkey "public-hold" \
357357- --value '{
358358- "$type": "io.atcr.hold.crew",
359359- "hold": "at://did:plc:owner/io.atcr.hold/public",
360360- "memberPattern": "*",
361361- "role": "write"
362362- }'
363363-364364-# Bar spam instance
365365-atproto put-record \
366366- --collection io.atcr.hold.crew.barred \
367367- --rkey "bar-spam-pds" \
368368- --value '{
369369- "$type": "io.atcr.hold.crew.barred",
370370- "hold": "at://did:plc:owner/io.atcr.hold/public",
371371- "memberPattern": "*.known-spam.com",
372372- "reason": "Spam source"
373373- }'
8787+# Via XRPC
8888+GET https://hold.example.com/xrpc/com.atproto.repo.listRecords?repo={holdDID}&collection=io.atcr.hold.crew
37489```
37590376376-**Result:** Everyone can push except users from `*.known-spam.com`.
9191+## Authentication Flow
37792378378-### 5. Mixed Access (Explicit + Patterns)
379379-380380-**Goal:** Team pattern plus individual guests.
381381-382382-**Setup:**
383383-```bash
384384-# Team pattern
385385-atproto put-record \
386386- --collection io.atcr.hold.crew \
387387- --rkey "team-pattern" \
388388- --value '{
389389- "$type": "io.atcr.hold.crew",
390390- "hold": "at://did:plc:owner/io.atcr.hold/team",
391391- "memberPattern": "*.company.com",
392392- "role": "write"
393393- }'
394394-395395-# Individual contractor
396396-atproto put-record \
397397- --collection io.atcr.hold.crew \
398398- --rkey "contractor-alice" \
399399- --value '{
400400- "$type": "io.atcr.hold.crew",
401401- "hold": "at://did:plc:owner/io.atcr.hold/team",
402402- "member": "did:plc:alice-contractor",
403403- "role": "write"
404404- }'
40593```
406406-407407-**Result:** Team members + specific contractor all have access.
408408-409409-## Implementation Details
410410-411411-### Code Changes Required
412412-413413-**Files to modify:**
414414-415415-1. **`lexicons/io/atcr/hold/crew.json`**
416416- - Make `member` optional (remove from `required`)
417417- - Add `memberPattern` field (string, optional)
418418- - Update description
9494+1. User pushes image to atcr.io/alice/myapp
41995420420-2. **`lexicons/io/atcr/hold/crew/barred.json`** (new file)
421421- - Define new lexicon for barred records
422422- - Same structure as crew (member + memberPattern)
423423- - Add `reason` field
9696+2. AppView gets service token from alice's PDS:
9797+ GET /xrpc/com.atproto.server.getServiceAuth?aud={holdDID}
9898+ Response: { "token": "..." }
42499425425-3. **`pkg/atproto/lexicon.go`**
426426- - Update `HoldCrewRecord` struct (add `MemberPattern` field, make `Member` pointer for optional)
427427- - Add `BarredRecord` struct
428428- - Add `NewBarredRecord()` constructor
429429- - Add `BarredCollection` constant
100100+3. AppView calls hold with service token:
101101+ POST /xrpc/io.atcr.hold.initiateUpload
102102+ Authorization: Bearer {serviceToken}
430103431431-4. **`pkg/hold/authorization.go`**
432432- - Update `isCrewMember()` to check patterns
433433- - Add `isBarred()` function
434434- - Add `resolveHandle()` helper (DID → handle lookup)
435435- - Add `matchPattern()` helper (glob matching)
436436- - Update `isAuthorizedWrite()` to check barred first
437437-438438-5. **`pkg/hold/registration.go`**
439439- - Add `HOLD_ALLOW_ALL_CREW` env var handling
440440- - Check env var on every startup (not just first registration)
441441- - Reconcile desired state (env) vs actual state (PDS)
442442- - Create/delete wildcard crew record as needed
443443-444444-### Pattern Matching Implementation
445445-446446-```go
447447-// pkg/hold/patterns.go (new file)
448448-449449-package hold
450450-451451-import (
452452- "regexp"
453453- "strings"
454454-)
455455-456456-// matchPattern checks if a handle matches a pattern
457457-func matchPattern(pattern, handle string) bool {
458458- if pattern == "*" {
459459- return true
460460- }
461461-462462- // Convert glob to regex
463463- regex := globToRegex(pattern)
464464- matched, err := regexp.MatchString(regex, handle)
465465- if err != nil {
466466- return false
467467- }
468468- return matched
469469-}
470470-471471-// globToRegex converts a glob pattern to a regex
472472-// *.example.com → ^.*\.example\.com$
473473-// subdomain.* → ^subdomain\..*$
474474-// *.bsky.* → ^.*\.bsky\..*$
475475-func globToRegex(pattern string) string {
476476- // Escape special regex characters except *
477477- escaped := regexp.QuoteMeta(pattern)
104104+4. Hold validates service token:
105105+ - Checks token is from alice's PDS
106106+ - Extracts alice's DID from token
478107479479- // Replace escaped \* with .*
480480- regex := strings.ReplaceAll(escaped, "\\*", ".*")
108108+5. Hold checks crew membership:
109109+ - Queries its own PDS: com.atproto.repo.getRecord
110110+ - Collection: io.atcr.hold.crew
111111+ - Record key: alice's DID
481112482482- // Anchor to start and end
483483- return "^" + regex + "$"
484484-}
113113+6. If crew record found → allow upload
114114+ Else → deny with 403 Forbidden
485115```
486116487487-### Handle Resolution
488488-489489-```go
490490-// pkg/hold/resolve.go
117117+**Trust model:** "Trust but verify"
118118+- User OAuth'd to AppView (proves identity)
119119+- Service token from user's PDS (proves AppView is acting on behalf of user)
120120+- Crew record in hold's PDS (proves user has access to this hold)
491121492492-package hold
122122+## Use Cases
493123494494-import (
495495- "context"
496496- "github.com/bluesky-social/indigo/atproto/identity"
497497- "github.com/bluesky-social/indigo/atproto/syntax"
498498-)
124124+### 1. Personal Hold (Private)
499125500500-// resolveHandle resolves a DID to its current handle
501501-func resolveHandle(did string) (string, error) {
502502- ctx := context.Background()
503503- directory := identity.DefaultDirectory()
504504-505505- didParsed, err := syntax.ParseDID(did)
506506- if err != nil {
507507- return "", err
508508- }
509509-510510- ident, err := directory.LookupDID(ctx, didParsed)
511511- if err != nil {
512512- return "", err
513513- }
514514-515515- return ident.Handle.String(), nil
516516-}
126126+```bash
127127+# Owner only
128128+HOLD_PUBLIC=false
129129+HOLD_ALLOW_ALL_CREW=false
130130+# No additional crew records needed - captain has implicit access
517131```
518132519519-### Caching Considerations
520520-521521-**Problem:** Pattern matching requires handle resolution, which adds latency.
522522-523523-**Solution:** Cache handle lookups with TTL.
524524-525525-```go
526526-type handleCache struct {
527527- mu sync.RWMutex
528528- cache map[string]cacheEntry // did → handle
529529-}
530530-531531-type cacheEntry struct {
532532- handle string
533533- expiresAt time.Time
534534-}
535535-536536-const handleCacheTTL = 10 * time.Minute
537537-538538-func (c *handleCache) get(did string) (string, bool) {
539539- c.mu.RLock()
540540- defer c.mu.RUnlock()
541541-542542- entry, ok := c.cache[did]
543543- if !ok || time.Now().After(entry.expiresAt) {
544544- return "", false
545545- }
546546- return entry.handle, true
547547-}
133133+### 2. Team Hold (Shared)
548134549549-func (c *handleCache) set(did, handle string) {
550550- c.mu.Lock()
551551- defer c.mu.Unlock()
135135+```bash
136136+# Multiple team members
137137+HOLD_PUBLIC=false
138138+HOLD_ALLOW_ALL_CREW=false
552139553553- c.cache[did] = cacheEntry{
554554- handle: handle,
555555- expiresAt: time.Now().Add(handleCacheTTL),
556556- }
557557-}
140140+# Captain adds crew members:
141141+# - did:plc:alice (admin)
142142+# - did:plc:bob (member)
143143+# - did:plc:charlie (member)
558144```
559145560560-**Trade-offs:**
561561-- **Cache hit:** Authorization instant
562562-- **Cache miss:** One additional PDS lookup (acceptable for writes)
563563-- **TTL:** 10 minutes balances freshness vs performance
564564-565565-### HOLD_ALLOW_ALL_CREW Environment Variable
566566-567567-**Purpose:** Automatically manage wildcard crew access via environment variable.
568568-569569-**Behavior:** Checked on **every startup** (not just first registration):
570570-571571-1. **Read env var:** `HOLD_ALLOW_ALL_CREW` (true/false)
572572-2. **Query PDS:** Check for crew record with rkey `"allow-all"` and `memberPattern: "*"`
573573-3. **Reconcile state:**
574574- - If env=`true` and record missing → **Create wildcard crew record** (requires OAuth)
575575- - If env=`false` (or unset) and record exists → **Delete wildcard crew record** (requires OAuth)
576576- - Otherwise → No action needed
577577-578578-**Well-known record key:** `"allow-all"` (used exclusively for the managed wildcard record)
579579-580580-**Implementation:**
581581-582582-```go
583583-// pkg/hold/config.go
584584-type Config struct {
585585- Registration struct {
586586- OwnerDID string
587587- AllowAllCrew bool // HOLD_ALLOW_ALL_CREW
588588- }
589589- // ...
590590-}
591591-592592-// pkg/hold/registration.go
593593-func (s *HoldService) ReconcileAllowAllCrew(callbackHandler *http.HandlerFunc) error {
594594- desiredState := s.config.Registration.AllowAllCrew
595595-596596- // Query PDS for "allow-all" crew record
597597- actualState, err := s.hasAllowAllCrewRecord()
598598- if err != nil {
599599- return fmt.Errorf("failed to check allow-all crew record: %w", err)
600600- }
601601-602602- // States match - nothing to do
603603- if desiredState == actualState {
604604- log.Printf("Allow-all crew state matches desired state: %v", desiredState)
605605- return nil
606606- }
607607-608608- // State mismatch - need to reconcile
609609- if desiredState && !actualState {
610610- // Need to create wildcard crew record
611611- log.Printf("Creating allow-all crew record (HOLD_ALLOW_ALL_CREW=true)")
612612- return s.createAllowAllCrewRecord(callbackHandler)
613613- }
614614-615615- if !desiredState && actualState {
616616- // Need to delete wildcard crew record
617617- log.Printf("Deleting allow-all crew record (HOLD_ALLOW_ALL_CREW removed/false)")
618618- return s.deleteAllowAllCrewRecord(callbackHandler)
619619- }
620620-621621- return nil
622622-}
623623-624624-func (s *HoldService) hasAllowAllCrewRecord() (bool, error) {
625625- ownerDID := s.config.Registration.OwnerDID
626626- if ownerDID == "" {
627627- return false, fmt.Errorf("hold owner DID not configured")
628628- }
629629-630630- ctx := context.Background()
631631-632632- // Resolve owner's PDS
633633- pdsEndpoint, err := s.resolveOwnerPDS(ownerDID)
634634- if err != nil {
635635- return false, err
636636- }
637637-638638- // Query for specific rkey
639639- client := atproto.NewClient(pdsEndpoint, ownerDID, "")
640640- record, err := client.GetRecord(ctx, atproto.HoldCrewCollection, "allow-all")
641641-642642- if err != nil {
643643- // Record doesn't exist
644644- return false, nil
645645- }
646646-647647- // Verify it's the wildcard record (memberPattern: "*")
648648- var crewRecord atproto.HoldCrewRecord
649649- if err := json.Unmarshal(record.Value, &crewRecord); err != nil {
650650- return false, err
651651- }
652652-653653- // Check if it's the exact wildcard pattern
654654- return crewRecord.MemberPattern == "*", nil
655655-}
656656-657657-func (s *HoldService) createAllowAllCrewRecord(callbackHandler *http.HandlerFunc) error {
658658- // This requires OAuth - reuse registration OAuth flow
659659- // Need authenticated client to create record
660660-661661- ownerDID := s.config.Registration.OwnerDID
662662- pdsEndpoint, err := s.resolveOwnerPDS(ownerDID)
663663- if err != nil {
664664- return err
665665- }
666666-667667- // Get handle for OAuth
668668- handle, err := resolveHandleFromDID(ownerDID)
669669- if err != nil {
670670- return err
671671- }
672672-673673- // Run OAuth flow (similar to registration)
674674- ctx := context.Background()
675675- result, err := oauth.InteractiveFlowWithCallback(
676676- ctx,
677677- s.config.Server.PublicURL,
678678- handle,
679679- s.getCrewManagementScopes(),
680680- func(handler http.HandlerFunc) error {
681681- *callbackHandler = handler
682682- return nil
683683- },
684684- func(authURL string) error {
685685- log.Printf("\n%s", strings.Repeat("=", 80))
686686- log.Printf("OAUTH REQUIRED: Creating allow-all crew record")
687687- log.Printf("%s", strings.Repeat("=", 80))
688688- log.Printf("\nVisit: %s\n", authURL)
689689- log.Printf("Waiting for authorization...")
690690- log.Printf("%s\n", strings.Repeat("=", 80))
691691- return nil
692692- },
693693- )
694694- if err != nil {
695695- return err
696696- }
697697-698698- // Create authenticated client
699699- apiClient := result.Session.APIClient()
700700- client := atproto.NewClientWithIndigoClient(pdsEndpoint, ownerDID, apiClient)
701701-702702- // Get hold URI (need to know which hold to grant access to)
703703- holdURI, err := s.getHoldURI()
704704- if err != nil {
705705- return err
706706- }
707707-708708- // Create wildcard crew record
709709- crewRecord := atproto.HoldCrewRecord{
710710- Type: atproto.HoldCrewCollection,
711711- Hold: holdURI,
712712- MemberPattern: ptr("*"), // Wildcard - allow all
713713- Role: "write",
714714- CreatedAt: time.Now(),
715715- }
716716-717717- _, err = client.PutRecord(ctx, atproto.HoldCrewCollection, "allow-all", &crewRecord)
718718- if err != nil {
719719- return fmt.Errorf("failed to create allow-all crew record: %w", err)
720720- }
721721-722722- log.Printf("✓ Created allow-all crew record (allows all authenticated users)")
723723- return nil
724724-}
725725-726726-func (s *HoldService) deleteAllowAllCrewRecord(callbackHandler *http.HandlerFunc) error {
727727- // Similar OAuth flow for deletion
728728- // Only delete if it's the exact wildcard pattern (safety check)
729729-730730- isWildcard, err := s.hasAllowAllCrewRecord()
731731- if err != nil {
732732- return err
733733- }
734734-735735- if !isWildcard {
736736- log.Printf("Warning: 'allow-all' crew record exists but is not wildcard - skipping deletion")
737737- return nil
738738- }
739739-740740- // OAuth flow (same as create)
741741- ownerDID := s.config.Registration.OwnerDID
742742- pdsEndpoint, err := s.resolveOwnerPDS(ownerDID)
743743- if err != nil {
744744- return err
745745- }
746746-747747- handle, err := resolveHandleFromDID(ownerDID)
748748- if err != nil {
749749- return err
750750- }
751751-752752- ctx := context.Background()
753753- result, err := oauth.InteractiveFlowWithCallback(
754754- ctx,
755755- s.config.Server.PublicURL,
756756- handle,
757757- s.getCrewManagementScopes(),
758758- func(handler http.HandlerFunc) error {
759759- *callbackHandler = handler
760760- return nil
761761- },
762762- func(authURL string) error {
763763- log.Printf("\n%s", strings.Repeat("=", 80))
764764- log.Printf("OAUTH REQUIRED: Deleting allow-all crew record")
765765- log.Printf("%s", strings.Repeat("=", 80))
766766- log.Printf("\nVisit: %s\n", authURL)
767767- log.Printf("Waiting for authorization...")
768768- log.Printf("%s\n", strings.Repeat("=", 80))
769769- return nil
770770- },
771771- )
772772- if err != nil {
773773- return err
774774- }
775775-776776- // Create authenticated client
777777- apiClient := result.Session.APIClient()
778778- client := atproto.NewClientWithIndigoClient(pdsEndpoint, ownerDID, apiClient)
779779-780780- // Delete the record
781781- err = client.DeleteRecord(ctx, atproto.HoldCrewCollection, "allow-all")
782782- if err != nil {
783783- return fmt.Errorf("failed to delete allow-all crew record: %w", err)
784784- }
785785-786786- log.Printf("✓ Deleted allow-all crew record")
787787- return nil
788788-}
789789-790790-func (s *HoldService) getCrewManagementScopes() []string {
791791- return []string{
792792- "atproto",
793793- fmt.Sprintf("repo:%s?action=create", atproto.HoldCrewCollection),
794794- fmt.Sprintf("repo:%s?action=update", atproto.HoldCrewCollection),
795795- fmt.Sprintf("repo:%s?action=delete", atproto.HoldCrewCollection),
796796- }
797797-}
798798-799799-// Helper for pointer
800800-func ptr(s string) *string {
801801- return &s
802802-}
803803-```
804804-805805-**Startup sequence:**
806806-807807-```go
808808-// cmd/hold/main.go
809809-func main() {
810810- // ... load config ...
811811-812812- holdService := hold.NewHoldService(config)
813813-814814- // Register HTTP routes
815815- var oauthCallbackHandler http.HandlerFunc
816816- http.HandleFunc("/auth/oauth/callback", func(w http.ResponseWriter, r *http.Request) {
817817- if oauthCallbackHandler != nil {
818818- oauthCallbackHandler(w, r)
819819- } else {
820820- http.Error(w, "OAuth callback not initialized", http.StatusInternalServerError)
821821- }
822822- })
823823-824824- // Auto-register hold (if HOLD_OWNER set)
825825- if config.Registration.OwnerDID != "" {
826826- err := holdService.AutoRegister(&oauthCallbackHandler)
827827- if err != nil {
828828- log.Fatalf("Failed to register hold: %v", err)
829829- }
830830-831831- // Reconcile allow-all crew record
832832- err = holdService.ReconcileAllowAllCrew(&oauthCallbackHandler)
833833- if err != nil {
834834- log.Fatalf("Failed to reconcile allow-all crew: %v", err)
835835- }
836836- }
837837-838838- // Start server...
839839-}
840840-```
841841-842842-**Key properties:**
843843-844844-1. **Idempotent:** Safe to run on every startup
845845-2. **Well-known rkey:** Uses `"allow-all"` exclusively for managed record
846846-3. **Safety:** Only deletes if `memberPattern` is exactly `"*"` (won't touch custom patterns like `*.example.com`)
847847-4. **OAuth required:** Both create and delete operations need authentication
848848-5. **Reuses infrastructure:** Same OAuth flow as registration
849849-850850-**Example configurations:**
146146+### 3. Public Hold (Community)
851147852148```bash
853853-# Public hold - allow all users
149149+# Allow any authenticated user (TODO: Implement HOLD_ALLOW_ALL_CREW)
150150+HOLD_PUBLIC=true
854151HOLD_ALLOW_ALL_CREW=true
855855-856856-# Private hold - explicit crew only
857857-HOLD_ALLOW_ALL_CREW=false
858858-# (or omit the variable entirely)
859152```
860153861861-**Edge cases handled:**
862862-863863-- Record exists with different pattern → Won't delete (safety)
864864-- OAuth fails → Service won't start (explicit failure)
865865-- PDS unreachable → Startup fails (can't verify state)
866866-- Record exists but env unset → Deletes wildcard (opt-in behavior)
867867-868868-**Custom patterns preserved:**
869869-870870-Hold owners can still manually create pattern-based crew records with different rkeys:
871871-872872-```bash
873873-# Manually created pattern (rkey: "community")
874874-atproto put-record \
875875- --collection io.atcr.hold.crew \
876876- --rkey "community" \
877877- --value '{
878878- "memberPattern": "*.my-community.social",
879879- "role": "write"
880880- }'
881881-```
882882-883883-The `HOLD_ALLOW_ALL_CREW` management **only touches** the `"allow-all"` rkey with exact `memberPattern: "*"`.
154154+## Planned Features
884155885885-## Migration Path
156156+### Pattern-Based Access Control
886157887887-**Backward Compatibility:** Fully compatible with existing deployments.
158158+**Status:** Planned but not yet implemented.
888159889889-1. **Existing crew records work unchanged**
890890- - Records with `member` (DID) continue to work
891891- - No changes needed to existing records
892892-893893-2. **Opt-in patterns**
894894- - Hold owners can add pattern-based crew records
895895- - Mix explicit DIDs and patterns freely
896896-897897-3. **Barred list is optional**
898898- - Only needed for selective access revocation
899899- - Empty barred list = no blocking
900900-901901-4. **Lexicon evolution**
902902- - Making `member` optional is backward compatible (existing records still have it)
903903- - Adding `memberPattern` is additive (old clients ignore it)
904904-905905-## Future Enhancements
906906-907907-### 1. PDS-Based Access Control
908908-909909-**Goal:** Allow/bar users based on their PDS (not handle).
910910-911911-**Challenge:** ATProto doesn't give PDSes stable identifiers. PDS endpoints are mutable URLs.
912912-913913-**Potential Solutions:**
914914-915915-#### Option A: PDS DID Standard (if ATProto adds it)
916916-917917-If ATProto introduces PDS DIDs:
160160+**Concept:** Allow crew records with pattern matching instead of explicit DIDs:
918161919162```json
920163{
921164 "$type": "io.atcr.hold.crew",
922922- "hold": "at://did:plc:owner/io.atcr.hold/community",
923923- "memberPattern": "pds:did:plc:pds-id",
165165+ "memberPattern": "*.example.com",
924166 "role": "write"
925167}
926168```
927169928928-#### Option B: Accept PDS URL Mutability
929929-930930-Store PDS URLs with understanding they can change:
170170+**Use cases:**
171171+- `"*"` - Allow all authenticated users
172172+- `"*.company.com"` - Allow all users from company domain
173173+- `"*.community.social"` - Allow all community members
931174932932-```json
933933-{
934934- "$type": "io.atcr.hold.crew",
935935- "hold": "at://did:plc:owner/io.atcr.hold/community",
936936- "memberPattern": "pds:https://my-community.social",
937937- "role": "write"
938938-}
939939-```
175175+**Implementation needed:**
176176+- Add `memberPattern` field to crew record schema (make `member` optional)
177177+- Add handle resolution (DID → handle lookup)
178178+- Add pattern matching logic
179179+- Update authorization to check patterns
940180941941-**Trade-off:** User migration bypasses access control, but this requires effort.
181181+### Barred List (Access Revocation)
942182943943-#### Option C: PDS Trust Lists (Federated Model)
183183+**Status:** Planned but not yet implemented.
944184945945-Reference curated lists of trusted PDSes:
185185+**Concept:** Explicit deny list that overrides crew membership:
946186947187```json
948188{
949949- "$type": "io.atcr.hold.crew",
950950- "hold": "at://did:plc:owner/io.atcr.hold/community",
951951- "memberPattern": "trust-list:at://did:plc:curator/trust.list/vetted-pds",
952952- "role": "write"
189189+ "$type": "io.atcr.hold.crew.barred",
190190+ "member": "did:plc:former-employee",
191191+ "reason": "No longer with company",
192192+ "barredAt": "2025-10-13T12:00:00Z"
953193}
954194```
955195956956-**Status:** Experimental. Requires additional standards.
957957-958958-### 2. Advanced Pattern Matching
196196+**Priority:** The barred list is checked before the crew list.
959197960960-**Goal:** Support more sophisticated patterns.
961961-962962-**Potential patterns:**
198198+### HOLD_ALLOW_ALL_CREW
963199964964-- **Regex:** `memberPattern: "regex:^eng-.*@company.com$"`
965965-- **Multiple patterns:** `memberPattern: ["*.example.com", "*.other.com"]`
966966-- **NOT patterns:** `memberPattern: "!*.spam.com"` (everything except)
200200+**Status:** The environment variable exists, but the full implementation is pending.
967201968968-**Implementation:** Extend `matchPattern()` function with pattern type detection.
202202+**Concept:** Automatically create/manage wildcard crew record via env var:
969203970970-### 3. Temporary Access
971971-972972-**Goal:** Time-limited crew membership.
973973-974974-**Current support:** `expiresAt` field already in schema (optional).
975975-976976-**Enhancement:** Hold service automatically checks expiration during authorization:
977977-978978-```go
979979-if record.ExpiresAt != nil && time.Now().After(*record.ExpiresAt) {
980980- continue // Skip expired crew record
981981-}
204204+```bash
205205+HOLD_ALLOW_ALL_CREW=true # Creates crew record with memberPattern: "*"
982206```
983207984984-### 4. Role-Based Access Control (RBAC)
985985-986986-**Goal:** Fine-grained permissions beyond read/write.
987987-988988-**Potential roles:**
989989-- `"read"` - Pull only
990990-- `"write"` - Push + pull
991991-- `"admin"` - Manage crew records
992992-- `"owner"` - Full control
993993-994994-**Current status:** `role` field exists but only `"owner"` and `"write"` are used.
995995-996996-### 5. Audit Logging
997997-998998-**Goal:** Track access grants/denials for compliance.
999999-10001000-**Implementation:**
10011001-- Log crew checks to structured log
10021002-- Include: DID, handle, result (allow/deny), reason
10031003-- Optional: Write to ATProto audit log record
10041004-10051005-## Security Considerations
10061006-10071007-### 1. Public Records
10081008-10091009-**Consideration:** Crew and barred records are public ATProto records.
10101010-10111011-**Implications:**
10121012-- Anyone can see who has access to a hold
10131013-- Anyone can see who is barred (and why)
10141014-- Similar to Bluesky block lists being public
10151015-10161016-**Mitigation:** This is intentional transparency. Hold owners should use generic reasons in barred records if privacy is a concern.
10171017-10181018-### 2. Handle Changes
10191019-10201020-**Consideration:** Handles can change, but DIDs are permanent.
10211021-10221022-**Implications:**
10231023-- Pattern matching based on handles can be bypassed by changing handle
10241024-- DID-based rules are more stable
10251025-- However, changing handles or acquiring new domains requires significant effort:
10261026- - Purchasing new domain names ($10-100+/year)
10271027- - Updating identity across platforms
10281028- - Loss of established reputation/identity
10291029-10301030-**Recommendation:**
10311031-- Use DID-based crew/barred records for critical access control (permanent)
10321032-- Use pattern-based rules for convenience and community management
10331033-- The effort required to bypass handle patterns makes them an acceptable deterrent
10341034-- Combine both approaches for defense in depth
10351035-10361036-### 3. PDS Migration
208208+**Implementation needed:**
209209+- Auto-create wildcard crew record on startup if env=true
210210+- Auto-delete wildcard crew record if env changes to false
211211+- Use well-known rkey "allow-all" for managed record
103721210381038-**Consideration:** Users can migrate to different PDSes.
213213+## Architecture Notes
103921410401040-**Implications:**
10411041-- PDS-based patterns (future) can be bypassed by migration
10421042-- Handle patterns persist across PDS migration (if handle stays same)
215215+### Why Hold's Embedded PDS?
104321610441044-**Recommendation:** Accept this as inherent trade-off. Migration requires user effort and is acceptable "escape hatch."
217217+**Key insight:** Crew records are **shared data** about the hold, not user-specific data.
104521810461046-### 4. Pattern Matching Performance
10471047-10481048-**Consideration:** Complex patterns could cause ReDoS (regex denial of service).
10491049-10501050-**Mitigation:**
10511051-- Limit pattern complexity (only basic globs in v1)
10521052-- Cache handle lookups to minimize repeated work
10531053-- Set timeout on pattern matching operations
10541054-10551055-### 5. Barred List Circumvention
10561056-10571057-**Consideration:** Barred users might create new DIDs.
10581058-10591059-**Mitigation:**
10601060-- This is fundamental to decentralized identity (users control DIDs)
10611061-- Hold owners can add new DIDs to barred list as discovered
10621062-- Pattern-based barring (handle/PDS patterns) provides broader coverage
10631063-10641064-## Testing Strategy
10651065-10661066-### Unit Tests
10671067-10681068-**Pattern matching:**
10691069-```go
10701070-func TestMatchPattern(t *testing.T) {
10711071- tests := []struct{
10721072- pattern string
10731073- handle string
10741074- want bool
10751075- }{
10761076- {"*", "anything.com", true},
10771077- {"*.example.com", "alice.example.com", true},
10781078- {"*.example.com", "bob.other.com", false},
10791079- {"eng.*", "eng.company.com", true},
10801080- {"eng.*", "sales.company.com", false},
10811081- }
10821082- // ...
10831083-}
10841084-```
219219+**Benefits:**
220220+- **Self-contained**: Hold is an independent ATProto actor
221221+- **Portable**: Hold can move without coordinating with user PDSs
222222+- **Discoverable**: Query hold's PDS to see who has access
223223+- **Standard**: Uses normal ATProto sync endpoints (subscribeRepos, getRecord, listRecords)
108522410861086-**Authorization logic:**
10871087-```go
10881088-func TestIsAuthorizedWrite(t *testing.T) {
10891089- // Test: owner always allowed
10901090- // Test: explicit crew member allowed
10911091- // Test: pattern match allowed
10921092- // Test: barred user denied
10931093- // Test: barred pattern denied
10941094- // Test: barred overrides crew
10951095-}
10961096-```
225225+**Comparison:**
226226+- **User's PDS**: Stores user-specific data (manifests, sailor profile)
227227+- **Hold's PDS**: Stores hold-specific data (captain, crew, configuration)
228228+- Clear separation of concerns
109722910981098-### Integration Tests
230230+### Security Considerations
109923111001100-1. **Create hold with wildcard crew** → verify any user can write
11011101-2. **Add barred record** → verify barred user rejected
11021102-3. **Pattern-based crew** → verify matching handles allowed
11031103-4. **Mixed access** → verify explicit + pattern both work
11041104-5. **Handle resolution failure** → verify fallback to DID-only matching
232232+1. **Public Records**: Crew records are public (anyone can see who has access to a hold)
233233+2. **Service Tokens**: Hold trusts user's PDS to issue valid service tokens
234234+3. **DID-Based**: Crew membership is DID-based (permanent), not handle-based
235235+4. **Captain Control**: Only captain can modify crew records (via OAuth to hold's PDS)
110523611061106-### Performance Tests
237237+## Future Improvements
110723811081108-1. **Large crew list** (1000+ records) → measure query time
11091109-2. **Complex patterns** → measure pattern matching time
11101110-3. **Handle cache** → verify cache hit rate
11111111-4. **Concurrent requests** → verify no race conditions
239239+1. **Crew management UI** - Web interface for adding/removing crew members
240240+2. **Pattern-based matching** - Implement `memberPattern` field
241241+3. **Barred list** - Implement access revocation
242242+4. **Role-based permissions** - Fine-grained permissions beyond read/write
243243+5. **Temporary access** - Time-limited crew membership (`expiresAt` field)
244244+6. **Audit logging** - Track access grants/denials
11122451113246## References
1114247248248+- [EMBEDDED_PDS.md](./EMBEDDED_PDS.md) - Embedded PDS architecture details
249249+- [BYOS.md](./BYOS.md) - BYOS deployment and usage
1115250- [ATProto Lexicon Spec](https://atproto.com/specs/lexicon)
11161116-- [Bluesky Block Lists](https://bsky.app/profile/bsky.app/post/3l7wzyc6i622o) (analogous public records)
11171117-- [Go Glob Matching](https://pkg.go.dev/path/filepath#Match)
11181118-- [OAuth Scopes](https://atproto.com/specs/oauth#scopes) (for crew management permissions)
11191119-11201120-## Appendix: Lexicon Definitions
11211121-11221122-### lexicons/io/atcr/hold/crew.json (Updated)
11231123-11241124-```json
11251125-{
11261126- "lexicon": 1,
11271127- "id": "io.atcr.hold.crew",
11281128- "defs": {
11291129- "main": {
11301130- "type": "record",
11311131- "description": "Crew membership for a storage hold. Stored in the hold owner's PDS to maintain control over write access. Supports explicit DIDs (with backlinks), wildcard access, and handle patterns.",
11321132- "key": "any",
11331133- "record": {
11341134- "type": "object",
11351135- "required": ["hold", "role", "createdAt"],
11361136- "properties": {
11371137- "hold": {
11381138- "type": "string",
11391139- "format": "at-uri",
11401140- "description": "AT-URI of the hold record (e.g., 'at://did:plc:owner/io.atcr.hold/hold1')"
11411141- },
11421142- "member": {
11431143- "type": "string",
11441144- "format": "did",
11451145- "description": "DID of crew member (for individual access with backlinks). Exactly one of 'member' or 'memberPattern' must be set."
11461146- },
11471147- "memberPattern": {
11481148- "type": "string",
11491149- "description": "Pattern for matching multiple users. Supports wildcards: '*' (all users), '*.domain.com' (handle glob). Exactly one of 'member' or 'memberPattern' must be set."
11501150- },
11511151- "role": {
11521152- "type": "string",
11531153- "description": "Member's role/permissions. 'owner' = hold owner, 'write' = can push blobs.",
11541154- "knownValues": ["owner", "write"]
11551155- },
11561156- "expiresAt": {
11571157- "type": "string",
11581158- "format": "datetime",
11591159- "description": "Optional expiration for this membership"
11601160- },
11611161- "createdAt": {
11621162- "type": "string",
11631163- "format": "datetime",
11641164- "description": "Membership creation timestamp"
11651165- }
11661166- }
11671167- }
11681168- }
11691169- }
11701170-}
11711171-```
11721172-11731173-### lexicons/io/atcr/hold/crew/barred.json (New)
11741174-11751175-```json
11761176-{
11771177- "lexicon": 1,
11781178- "id": "io.atcr.hold.crew.barred",
11791179- "defs": {
11801180- "main": {
11811181- "type": "record",
11821182- "description": "Barred (banned) list for a storage hold. Users/patterns in this list are denied write access, overriding crew membership. Stored in the hold owner's PDS.",
11831183- "key": "any",
11841184- "record": {
11851185- "type": "object",
11861186- "required": ["hold", "barredAt"],
11871187- "properties": {
11881188- "hold": {
11891189- "type": "string",
11901190- "format": "at-uri",
11911191- "description": "AT-URI of the hold record"
11921192- },
11931193- "member": {
11941194- "type": "string",
11951195- "format": "did",
11961196- "description": "DID of user to bar. Exactly one of 'member' or 'memberPattern' must be set."
11971197- },
11981198- "memberPattern": {
11991199- "type": "string",
12001200- "description": "Pattern for barring multiple users. Supports wildcards: '*.spam.com', 'bot*', etc. Exactly one of 'member' or 'memberPattern' must be set."
12011201- },
12021202- "reason": {
12031203- "type": "string",
12041204- "maxLength": 300,
12051205- "description": "Optional human-readable reason for barring (e.g., 'spam', 'abuse', 'policy violation')"
12061206- },
12071207- "barredAt": {
12081208- "type": "string",
12091209- "format": "datetime",
12101210- "description": "When the user/pattern was barred"
12111211- }
12121212- }
12131213- }
12141214- }
12151215- }
12161216-}
12171217-```
12181218-12191219-## Summary
12201220-12211221-This design enables scalable, flexible access control for ATCR holds while:
12221222-12231223-- **Preserving ATProto semantics** (DID backlinks, public records)
12241224-- **Supporting massive scale** (one record for thousands of users)
12251225-- **Enabling selective revocation** (barred list)
12261226-- **Maintaining backward compatibility** (existing records work unchanged)
12271227-- **Planning for future enhancements** (PDS-based filtering when possible)
12281228-12291229----
12301230-12311231-**Note on terminology:** "Barred" is an ironic reversal of the idiom "no holds barred" (meaning "without restrictions"). In wrestling, when all holds are allowed, it's unrestricted. In ATCR, being "barred from a hold" means you're restricted from access. The pun works in reverse! 🥁
+220-799
docs/EMBEDDED_PDS.md
···11# Embedded PDS Architecture for Hold Services
2233-This document explores the evolution of ATCR's hold service architecture toward becoming an embedded ATProto PDS (Personal Data Server).
33+This document describes ATCR's hold service architecture, which uses an embedded ATProto PDS (Personal Data Server) for access control and federation.
4455## Motivation
6677-### Comparison to Other ATProto Projects
77+### The Fragmentation Problem
8899Several ATProto projects face similar challenges with large data storage:
10101111-| Project | Large Data | Metadata | Current Solution |
1212-|---------|-----------|----------|------------------|
1111+| Project | Large Data | Metadata | Solution |
1212+|---------|-----------|----------|----------|
1313| **tangled.org** | Git objects | Issues, PRs, comments | External knot storage |
1414| **stream.place** | Video segments | Stream info, chat | Embedded "static PDS" |
1515-| **ATCR** | Container blobs | Manifests, comments, builds | External hold service |
1616-1717-**Common problem:** Large binary data can't realistically live in user PDSs, but interaction metadata gets fragmented across different users' PDSs.
1818-1919-**Emerging pattern:** Application-specific storage services with embedded minimal PDS implementations.
2020-2121-### The Fragmentation Problem
2222-2323-#### Tangled.org Example
2424-```
2525-user/myproject repository
2626-├── Git data → Knot (external storage)
2727-├── Issues → Created by @alice → Lives in alice's PDS
2828-├── PRs → Created by @bob → Lives in bob's PDS
2929-└── Comments → Created by @charlie → Lives in charlie's PDS
3030-```
3131-3232-**Problems:**
3333-- Repo owner can't export all issues/PRs easily
3434-- No single source of truth for repo metadata
3535-- Interaction history fragmented across PDSs
3636-- Can't encrypt repo data while maintaining collaboration
3737-3838-#### ATCR's Similar Challenge
3939-```
4040-atcr.io/alice/myapp
4141-├── Manifests → alice's PDS
4242-├── Blobs → Hold service (external)
4343-└── Future: Comments, builds, attestations → Where?
4444-```
4545-4646-### Stream.place's Approach
4747-4848-Stream.place built a **minimal "static PDS"** embedded in their application with just the XRPC endpoints they need:
4949-- `com.atproto.repo.describeRepo`
5050-- `com.atproto.sync.subscribeRepos`
5151-- Minimal read methods
5252-5353-**Why:** Avoid rate-limiting Bluesky's infrastructure with video segments while staying ATProto-native.
5454-5555-## Current Hold Service Architecture
5656-5757-The current hold service is intentionally minimal:
5858-5959-```
6060-Hold Service =
6161- - OAuth token validation (call user's PDS)
6262- - Generate presigned S3 URLs
6363- - Return HTTP redirects
6464- - Optional crew membership checks
6565-```
6666-6767-**Endpoints:**
6868-- `POST /get-presigned-url` → S3 download URL
6969-- `POST /put-presigned-url` → S3 upload URL
7070-- `GET /blobs/{digest}` → Proxy fallback
7171-- `PUT /blobs/{digest}` → Proxy fallback
7272-- `GET /health` → Health check
7373-7474-**Resource footprint:**
7575-- Single Go binary (~20MB)
7676-- No database (stateless)
7777-- No PDS (validates against user's PDS)
7878-- Minimal memory/CPU (just signing URLs)
7979-- S3 does all the heavy lifting
8080-8181-This is already **as cheap as possible** for what it does - just an OAuth validation + URL signing service.
8282-8383-## Why Not Force Blobs into User PDSs?
8484-8585-### Size Considerations
1515+| **ATCR** | Container blobs | Manifests, comments, builds | Embedded PDS in hold service |
86168787-**PDS blob limits:** Default ~50MB (Bluesky may be lower)
1717+**Common problem:** Large binary data can't realistically live in user PDSs, but application metadata needs a federated home.
88188989-**Container layer sizes:**
9090-- Alpine base: ~5MB ✓
9191-- Config blobs: ~1-5KB ✓
9292-- Small Go binaries: 10-30MB ✓
9393-- Node.js base: 100-200MB ✗
9494-- Python base: 50-100MB ✗
9595-- ML models: 500MB - 10GB ✗
9696-- Large datasets: huge ✗
1919+**ATCR's approach:** Each hold service is a full ATProto actor with its own embedded PDS for **shared data** (captain + crew records, not user-specific data). This PDS stores access control and metadata about the hold itself.
97209898-**Reality:** Many/most layers exceed 50MB. A split-brain approach would be the norm, not the exception.
2121+## Current Architecture
9922100100-### Split-Brain Complexity
2323+### Hold Service Components
10124102102-```go
103103-func (s *SplitBlobStore) Create(ctx context.Context, options ...) {
104104- // Challenges:
105105- // 1. Monolithic uploads: Size known upfront ✓
106106- // 2. Chunked uploads: Size unknown until complete ✗
107107- // 3. Resumable uploads: State management across PDS/hold ✗
108108- // 4. Mount/cross-repo: Which backend to check? ✗
109109-}
11025```
111111-112112-Detection works for simple cases but breaks down with:
113113-- Multipart/chunked uploads (no size until complete)
114114-- Resumable uploads (stateful across boundaries)
115115-- Cross-repository blob mounts (which backend?)
116116-117117-### Pragmatic Decision
118118-119119-**Accept the trade-off:**
120120-- Blobs in holds (practical for large data)
121121-- Manifests in user's PDS (ownership of metadata)
122122-- Focus on making holds easy to deploy and migrate
123123-124124-Users still own the **important part** - the manifest is the source of truth for what the image is.
125125-126126-## Embedded PDS Vision
127127-128128-### Key Insight: Hold is the PDS
129129-130130-Because blobs are **content-addressed** and **deduplicated globally**, there isn't a singular owner of blob data. Multiple images share the same base layer blobs.
131131-132132-**Therefore:** The **hold itself** is the PDS (with identity `did:web:hold1.example.com`), not individual image repositories.
133133-134134-### Proposed Architecture
135135-136136-```
137137-Hold Service = Minimal PDS (did:web:hold1.example.com)
138138-├── Standard ATProto blob endpoints:
139139-│ ├── com.atproto.sync.uploadBlob
140140-│ ├── com.atproto.sync.getBlob
141141-│ └── Blob storage → S3 (like normal PDS)
142142-├── Custom XRPC methods:
143143-│ ├── io.atcr.hold.delegateAccess (IAM)
144144-│ ├── io.atcr.hold.getUploadUrl (optimization)
145145-│ ├── io.atcr.hold.getDownloadUrl (optimization)
146146-│ ├── io.atcr.hold.exportImage (data portability)
147147-│ └── io.atcr.hold.getStats (metadata)
148148-└── Records (hold's own PDS):
149149- ├── io.atcr.hold.captain (single record: ownership & metadata)
150150- ├── io.atcr.hold.crew/* (crew membership & permissions)
151151- └── io.atcr.hold.config (hold configuration)
152152-```
153153-154154-### Benefits
155155-156156-1. **ATProto-native**: Uses standard XRPC, not custom REST API
157157-2. **Discoverable**: Hold's DID document advertises capabilities
158158-3. **Portable**: Users can export images via XRPC
159159-4. **Standardized**: Blob operations use ATProto conventions
160160-5. **Future-proof**: Can add more XRPC methods as needed
161161-6. **Interoperable**: Works with ATProto tooling
162162-163163-## Implementation Details
164164-165165-### 1. SHA256 to CID Mapping
166166-167167-ATProto uses CIDs (Content Identifiers) for blobs, while OCI uses SHA256 digests. However, CIDs support SHA256 as the hash function.
168168-169169-**Key insight:** We can construct CIDs directly from SHA256 digests with no additional storage needed!
170170-171171-```go
172172-// pkg/hold/cid.go
173173-func DigestToCID(digest string) (cid.Cid, error) {
174174- // sha256:abc123... → raw bytes
175175- hash := parseDigest(digest)
176176-177177- // Construct CIDv1 with sha256 codec
178178- return cid.NewCidV1(
179179- cid.Raw, // codec
180180- multihash.SHA2_256, // hash function
181181- hash, // hash bytes
182182- )
183183-}
184184-185185-func CIDToDigest(c cid.Cid) string {
186186- // Decode multihash → sha256:abc...
187187- mh := c.Hash()
188188- return fmt.Sprintf("sha256:%x", mh)
189189-}
190190-```
191191-192192-**Mapping:**
193193-```
194194-OCI digest: sha256:abc123...
195195-ATProto CID: bafybei... (CIDv1 with sha256, base32 encoded)
196196-Storage path: s3://bucket/blobs/sha256/ab/abc123...
197197-```
198198-199199-Blobs stay in distribution's layout, we just compute CID on-the-fly. **No mapping records needed.**
200200-201201-### 2. Storage: Distribution Layout with PDS Interface
202202-203203-The hold's blob storage uses distribution's driver directly - no encoding or transformation:
204204-205205-```go
206206-type HoldBlobStore struct {
207207- storageDriver storagedriver.StorageDriver // S3, filesystem, etc
208208-}
209209-210210-// Implements ATProto blob interface
211211-func (h *HoldBlobStore) UploadBlob(ctx context.Context, data io.Reader) (cid.Cid, error) {
212212- // 1. Compute sha256 while reading
213213- digest, size := computeDigest(data)
214214-215215- // 2. Store at distribution's path: blobs/sha256/ab/abc123...
216216- path := h.blobPath(digest)
217217- h.storageDriver.PutContent(ctx, path, data)
218218-219219- // 3. Return CID (computed from sha256)
220220- return DigestToCID(digest), nil
221221-}
222222-223223-func (h *HoldBlobStore) GetBlob(ctx context.Context, c cid.Cid) (io.Reader, error) {
224224- // 1. Convert CID → sha256 digest
225225- digest := CIDToDigest(c)
226226-227227- // 2. Fetch from distribution's path
228228- path := h.blobPath(digest)
229229- return h.storageDriver.Reader(ctx, path, 0)
230230-}
231231-```
232232-233233-Storage continues to use distribution's existing S3 layout. The PDS interface is just a wrapper.
234234-235235-### 3. Authentication & IAM
236236-237237-**Challenge:** ATProto operations are authenticated AS the account owner. For hold operations, we need actions to be performed AS the hold (not individual users), but authorized BY crew members.
238238-239239-**Important context:** AppView manages the user's OAuth session. When users authenticate via the credential helper, they actually authenticate through AppView's web interface. AppView obtains and stores the user's OAuth token and DPoP key. The credential helper only receives a registry JWT.
240240-241241-**Proposed: DPoP Proof Delegation (Standard ATProto Federation)**
242242-243243-```
244244-1. User authenticates via AppView (OAuth flow)
245245- - AppView obtains: OAuth token, refresh token, DPoP key, DID
246246- - AppView stores these in its token storage
247247- - Credential helper receives: Registry JWT only
248248-249249-2. When AppView needs blob access, it calls hold:
250250- POST /xrpc/io.atcr.hold.delegateAccess
251251- Headers: Authorization: DPoP <user-oauth-token>
252252- DPoP: <proof-signed-with-user-dpop-key>
253253- Body: {
254254- "userDid": "did:plc:alice123",
255255- "purpose": "blob-upload",
256256- "duration": 900
257257- }
258258-259259-3. Hold validates (standard ATProto token validation):
260260- - Verify DPoP proof signature matches token's bound key
261261- - Call user's PDS: com.atproto.server.getSession (validates token)
262262- - Extract user's DID from validated session
263263- - Check user's DID in hold's crew records
264264- - If authorized, issue temporary token for blob operations
265265-266266-4. AppView uses delegated token for blob operations:
267267- POST /xrpc/com.atproto.sync.uploadBlob
268268- Headers: Authorization: DPoP <hold-token>
269269- DPoP: <proof>
2626+Hold Service (did:web:hold01.atcr.io)
2727+├── Embedded PDS (SQLite carstore) - Shared data only
2828+│ ├── Captain record (ownership metadata)
2929+│ ├── Crew records (access control)
3030+│ └── ATProto sync/repo endpoints
3131+├── OCI multipart upload (XRPC)
3232+│ ├── io.atcr.hold.initiateUpload
3333+│ ├── io.atcr.hold.getPartUploadUrl
3434+│ ├── io.atcr.hold.uploadPart
3535+│ ├── io.atcr.hold.completeUpload
3636+│ └── io.atcr.hold.abortUpload
3737+└── Storage driver (S3, filesystem, etc.)
27038```
27139272272-**This is standard ATProto federation** - services pass OAuth tokens with DPoP proofs between each other. Hold independently validates tokens against the user's PDS, so there's no trust relationship required.
4040+**Important distinction:**
4141+- **Hold's embedded PDS** = Shared data (crew members, hold configuration)
4242+- **User's PDS** = User-specific data (manifests, sailor profile, personal records)
4343+- Hold's PDS does NOT store user-specific container data (that stays in user's own PDS)
27344274274-**Records stored in hold's PDS:**
4545+### Records Structure
275464747+**Captain record** (hold ownership, single record at `io.atcr.hold.captain/self`):
27648```json
277277-// io.atcr.hold.captain (single record - hold metadata)
27849{
27950 "$type": "io.atcr.hold.captain",
28051 "owner": "did:plc:alice123",
···28354 "region": "iad",
28455 "provider": "fly.io"
28556}
5757+```
28658287287-// io.atcr.hold.crew/* (access control records)
5959+**Crew records** (access control, one per member at `io.atcr.hold.crew/{rkey}`):
6060+```json
28861{
28962 "$type": "io.atcr.hold.crew",
290290- "member": "did:plc:alice123",
6363+ "member": "did:plc:bob456",
29164 "role": "admin",
292292- "permissions": ["blob:read", "blob:write", "crew:manage"],
6565+ "permissions": ["blob:read", "blob:write"],
29366 "addedAt": "2025-10-14T..."
29467}
29568```
29669297297-**Semantic separation:**
298298-- **Captain record** = Hold ownership and metadata (who owns it, where it's deployed)
299299-- **Crew records** = Access control (who can use it, what permissions they have)
7070+### ATProto PDS Endpoints
30071301301-**Security considerations:**
302302-- User's OAuth token is exposed to hold during delegation
303303-- However, hold independently validates it (can't be forged)
304304-- Tokens are short-lived (15min typical)
305305-- Hold only accepts tokens for crew members
306306-- Hold validates DPoP binding (requires private key)
307307-- Standard ATProto security model
7272+Standard ATProto sync endpoints:
7373+- `GET /xrpc/com.atproto.sync.getRepo` - Download repository as CAR file
7474+- `GET /xrpc/com.atproto.sync.getBlob` - Get blob or presigned download URL
7575+- `GET /xrpc/com.atproto.sync.subscribeRepos` - Real-time crew changes
7676+- `GET /xrpc/com.atproto.sync.listRepos` - List repositories
30877309309-### 4. Presigned URLs for Optimized Egress
7878+Repository management:
7979+- `GET /xrpc/com.atproto.repo.describeRepo` - Repository metadata
8080+- `GET /xrpc/com.atproto.repo.getRecord` - Get specific record (captain/crew)
8181+- `GET /xrpc/com.atproto.repo.listRecords` - List crew members
8282+- `POST /xrpc/io.atcr.hold.requestCrew` - Request crew membership
31083311311-While standard ATProto blob endpoints work, direct S3 access is more efficient. Hold can expose custom XRPC methods:
8484+DID resolution:
8585+- `GET /.well-known/did.json` - DID document (did:web resolution)
8686+- `GET /.well-known/atproto-did` - DID for handle resolution
31287313313-```go
314314-// io.atcr.hold.getUploadUrl - Get presigned upload URL
315315-type GetUploadUrlRequest struct {
316316- Digest string // sha256:abc...
317317- Size int64
318318-}
8888+### OCI Multipart Upload Flow
31989320320-type GetUploadUrlResponse struct {
321321- UploadURL string // Presigned S3 URL
322322- ExpiresAt time.Time
323323-}
9090+```
9191+1. AppView gets service token from user's PDS:
9292+ GET /xrpc/com.atproto.server.getServiceAuth?aud={holdDID}
9393+ Response: { "token": "eyJ..." }
32494325325-// io.atcr.hold.getDownloadUrl - Get presigned download URL
326326-type GetDownloadUrlRequest struct {
327327- Digest string
328328-}
9595+2. AppView initiates multipart upload:
9696+ POST /xrpc/io.atcr.hold.initiateUpload
9797+ Authorization: Bearer {serviceToken}
9898+ Body: { "digest": "sha256:abc..." }
9999+ Response: { "uploadId": "xyz" }
329100330330-type GetDownloadUrlResponse struct {
331331- DownloadURL string // Presigned S3 URL
332332- ExpiresAt time.Time
333333-}
334334-```
101101+3. For each part:
102102+ POST /xrpc/io.atcr.hold.getPartUploadUrl
103103+ Body: { "uploadId": "xyz", "partNumber": 1 }
104104+ Response: { "url": "https://s3.../presigned" }
335105336336-**AppView uses optimized path:**
337337-```go
338338-func (a *ATProtoBlobStore) ServeBlob(ctx, w, r, dgst) error {
339339- // Try optimized presigned URL endpoint
340340- resp, err := a.client.GetDownloadUrl(ctx, dgst)
341341- if err == nil {
342342- // Redirect directly to S3
343343- http.Redirect(w, r, resp.DownloadURL, http.StatusTemporaryRedirect)
344344- return nil
345345- }
106106+4. Upload part to S3 presigned URL:
107107+ PUT {presignedURL}
108108+ Body: [part data]
346109347347- // Fallback: Standard ATProto blob endpoint (proxied)
348348- reader, _ := a.client.GetBlob(ctx, holdDID, cid)
349349- io.Copy(w, reader)
350350-}
110110+5. Complete upload:
111111+ POST /xrpc/io.atcr.hold.completeUpload
112112+ Body: { "uploadId": "xyz", "digest": "sha256:abc...", "parts": [...] }
351113```
352114353353-**Best of both worlds:** Standard ATProto interface + S3 optimization for bandwidth efficiency.
354354-355355-### 5. Image Export for Portability
115115+## Implementation Details
356116357357-Custom XRPC method enables users to export entire images:
117117+### Storage: Indigo Carstore with SQLite
358118359119```go
360360-// io.atcr.hold.exportImage - Export all blobs for an image
361361-type ExportImageRequest struct {
362362- Manifest *oci.Manifest // User provides manifest
363363-}
364364-365365-type ExportImageResponse struct {
366366- ArchiveURL string // Presigned S3 URL to tar.gz
367367- ExpiresAt time.Time
120120+type HoldPDS struct {
121121+ did string
122122+ carstore carstore.CarStore
123123+ session *carstore.DeltaSession // Provides blockstore interface
124124+ repo *repo.Repo
125125+ dbPath string
126126+ uid models.Uid // User ID for carstore (fixed: 1)
368127}
369369-370370-// Implementation:
371371-// 1. Extract all blob digests from manifest (config + layers)
372372-// 2. Create tar.gz with all blobs
373373-// 3. Upload to S3 temp location
374374-// 4. Return presigned download URL (15min expiry)
375128```
376129377377-Users can request all blobs for their images and migrate to different holds.
378378-379379-## Changes Required
380380-381381-### AppView Changes
130130+**Storage location:** Single SQLite file (`/var/lib/atcr-hold/hold.db`)
131131+- Contains MST nodes, records, commits in carstore tables
132132+- Handles compaction/cleanup automatically
133133+- Migration path to Postgres if needed (same carstore API)
382134383383-**Current:**
384384-```go
385385-type ProxyBlobStore struct {
386386- holdURL string // HTTP endpoint
387387-}
135135+### Key Implementation Lessons
388136389389-func (p *ProxyBlobStore) ServeBlob(...) {
390390- // POST /put-presigned-url
391391- // Return redirect
392392-}
393393-```
137137+#### 1. Custom Record Types Need Manual CBOR Decoding
394138395395-**New:**
396139```go
397397-type ATProtoBlobStore struct {
398398- holdDID string // did:web:hold1.example.com
399399- holdURL string // Resolved from DID document
400400- client *atproto.Client // XRPC client
401401- delegatedToken string // From io.atcr.hold.delegateAccess
402402-}
140140+// ❌ WRONG - Fails with "unrecognized lexicon type"
141141+record, err := repo.GetRecord(ctx, path, &CrewRecord{})
403142404404-func (a *ATProtoBlobStore) ServeBlob(ctx, w, r, dgst) error {
405405- // Try optimized: io.atcr.hold.getDownloadUrl
406406- // Fallback: com.atproto.sync.getBlob
407407-}
143143+// ✅ CORRECT - Manual CBOR decoding
144144+recordCID, recBytes, err := repo.GetRecordBytes(ctx, path)
145145+var crewRecord CrewRecord
146146+err = crewRecord.UnmarshalCBOR(bytes.NewReader(*recBytes))
408147```
409148410410-### Hold Service Changes
149149+Indigo's lexicon system doesn't know about custom types like `io.atcr.hold.crew`.
411150412412-Transform from simple HTTP server to minimal PDS:
151151+#### 2. JSON and CBOR Struct Tags Must Match
413152414153```go
415415-// cmd/hold/main.go
416416-func main() {
417417- // Storage driver (unchanged)
418418- storageDriver := buildStorageDriver()
419419-420420- // NEW: Embedded PDS
421421- pds := hold.NewEmbeddedPDS(hold.Config{
422422- DID: "did:web:hold1.example.com",
423423- BlobStore: storageDriver,
424424- Collections: []string{
425425- "io.atcr.hold.crew",
426426- "io.atcr.hold.config",
427427- },
428428- })
429429-430430- // Serve XRPC endpoints
431431- mux.Handle("/xrpc/", pds.Handler())
432432-433433- // Legacy endpoints (optional for backwards compat)
434434- // mux.Handle("/get-presigned-url", legacyHandler)
154154+// ✅ CORRECT - JSON tags match CBOR tags
155155+type CrewRecord struct {
156156+ Type string `json:"$type" cborgen:"$type"`
157157+ Member string `json:"member" cborgen:"member"`
158158+ Role string `json:"role" cborgen:"role"`
159159+ Permissions []string `json:"permissions" cborgen:"permissions"`
160160+ AddedAt string `json:"addedAt" cborgen:"addedAt"`
435161}
436162```
437163438438-## Open Questions
439439-440440-### 1. Docker Hub Size Limits
441441-442442-**Research findings:** Docker Hub has soft limits around 10-20GB per layer, with practical issues beyond that. No hard-coded enforcement.
443443-444444-**For ATCR:** Hold services can theoretically support larger blobs if S3 and network infrastructure allows. May want configurable limits to prevent abuse.
445445-446446-### 2. Token Delegation Security Model
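164164+CID verification requires a record to encode to identical bytes regardless of whether it was decoded from JSON or CBOR, so the field names in the `json` and `cborgen` tags must match.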
447165448448-**Recommended approach:** DPoP proof delegation (standard ATProto federation pattern)
166166+#### 3. MST ForEach Returns Full Paths
449167450450-Open questions:
451451-- How long should delegated tokens last? (15min like presigned URLs?)
452452-- Should delegation be per-operation or session-based?
453453-- Do we need audit logs for delegated operations?
454454-- Can AppView cache delegated tokens across requests?
455455-- Should we implement token refresh for long-running operations?
456456-457457-### 3. Migration Path
458458-459459-- Do we support both HTTP and XRPC APIs during transition?
460460-- How do existing manifests with `holdEndpoint: "https://..."` migrate to `holdDid: "did:web:..."`?
461461-- Can AppView auto-detect if hold supports XRPC vs legacy?
462462-463463-### 4. PDS Implementation Scope
464464-465465-**Minimal endpoints needed:**
466466-- `com.atproto.sync.uploadBlob`
467467-- `com.atproto.sync.getBlob`
468468-- `com.atproto.repo.describeRepo` (discovery)
469469-- Custom XRPC methods (delegation, presigned URLs, export)
470470-471471-**Not needed:**
472472-- `com.atproto.repo.*` (no user repos)
473473-- `com.atproto.server.*` (no user sessions)
474474-- Most sync/admin endpoints
475475-476476-Can we build a reusable "static PDS" library for apps like ATCR, tangled.org, stream.place?
477477-478478-### 5. Crew Management
479479-480480-- How are crew members added/removed?
481481-- UI in AppView? CLI tool? Direct XRPC calls?
482482-- Can crew members delegate to other crew members?
483483-- Role hierarchy (owner > admin > member)?
484484-485485-### 6. Hold Discovery & Registration
486486-487487-**Decision: No registration records needed in owner's PDS.**
488488-489489-Since holds are ATProto actors with did:web identity, they are self-describing:
490490-491491-**Hold's PDS contains everything:**
492492-```
493493-did:web:hold01.atcr.io
494494-├── io.atcr.hold.captain → { owner: "did:plc:alice123", ... }
495495-└── io.atcr.hold.crew/* → Access control records
496496-```
497497-498498-**DID Document with Multiple Services:**
499499-500500-Holds expose multiple service endpoints to distinguish themselves from generic PDSs:
501501-502502-```json
503503-{
504504- "@context": ["https://www.w3.org/ns/did/v1", ...],
505505- "id": "did:web:hold01.atcr.io",
506506- "service": [
507507- {
508508- "id": "#atproto_pds",
509509- "type": "AtprotoPersonalDataServer",
510510- "serviceEndpoint": "https://hold01.atcr.io"
511511- },
512512- {
513513- "id": "#atcr_hold",
514514- "type": "AtcrHoldService",
515515- "serviceEndpoint": "https://hold01.atcr.io"
516516- }
517517- ]
518518-}
519519-```
520520-521521-**Service semantics:**
522522-- **`#atproto_pds`** - Standard ATProto PDS operations (crew queries, record sync)
523523-- **`#atcr_hold`** - ATCR-specific operations (blob storage, presigned URLs)
524524-525525-**Discovery patterns:**
526526-527527-1. **Direct deployment** - Owner deploys hold, knows the DID
528528-2. **Sailor profiles** - Users reference holds by DID in their profile
529529-3. **DID resolution** - `did:web:hold01.atcr.io` → `https://hold01.atcr.io/.well-known/did.json`
530530-4. **Service lookup** - Check for `#atcr_hold` service to identify ATCR holds
531531-5. **Crew queries** - AppView queries hold's PDS directly via `#atproto_pds` endpoint
532532-533533-**AppView resolution flow:**
534168```go
535535-// 1. Get hold DID from sailor profile
536536-holdDID := profile.DefaultHold // "did:web:hold01.atcr.io"
537537-538538-// 2. Resolve DID document
539539-didDoc := resolveDidWeb(holdDID)
540540-541541-// 3. Extract service endpoints
542542-pdsEndpoint := didDoc.GetService("#atproto_pds") // XRPC operations
543543-holdEndpoint := didDoc.GetService("#atcr_hold") // Blob operations
544544-545545-// 4. Query crew list via PDS endpoint
546546-crew := xrpcClient.ListRecords(pdsEndpoint, "io.atcr.hold.crew")
547547-548548-// 5. Check if user has access
549549-hasAccess := crew.Contains(userDID)
550550-```
551551-552552-**No need for reverse lookup** (owner → holds). Users know their holds because they deployed them.
553553-554554-**Benefits:**
555555-- ✅ Single source of truth (hold's PDS)
556556-- ✅ No cross-PDS writes during registration
557557-- ✅ Self-describing ATProto actors
558558-- ✅ Standard DID resolution patterns
559559-- ✅ Clear service semantics (PDS vs ATCR-specific)
560560-- ✅ Discoverable via service type
561561-562562-**OAuth implications:**
563563-- ✅ OAuth registration removed completely (hold is self-describing)
564564-- Hold creates captain + crew records in its own embedded PDS
565565-- No cross-PDS writes or OAuth flows needed
566566-567567-### 7. Multi-Tenancy
568568-569569-Could one hold PDS serve multiple "logical holds" for different organizations?
570570-571571-```
572572-did:web:hold-provider.com/org1
573573-did:web:hold-provider.com/org2
169169+// ✅ CORRECT - Extract just the rkey
170170+err := repo.ForEach(ctx, "io.atcr.hold.crew", func(k string, v cid.Cid) error {
171171+ // k = "io.atcr.hold.crew/3m37dr2ddit22"
172172+ parts := strings.Split(k, "/")
173173+ rkey := parts[len(parts)-1] // "3m37dr2ddit22"
174174+ return nil
175175+})
574176```
575177576576-Or should each hold be a separate deployment?
577577-578578-### 8. Blob Deduplication
579579-580580-Current behavior: Global deduplication (same layer shared across all images).
581581-582582-With embedded PDS:
583583-- Does dedup stay global across all crew/users?
584584-- Or is it per-hold (isolated storage)?
585585-- How do we track blob references for garbage collection?
586586-587587-### 9. Cost Model
178178+#### 4. CAR Files Must Include Full MST Path
588179589589-- Who pays for S3 storage/egress?
590590-- Hold operator? Image owner? Per-pull?
591591-- How to implement metering/billing via XRPC?
180180+For `com.atproto.sync.getRecord`, return CAR with:
181181+1. **Commit block** - Repo head with signature
182182+2. **MST tree nodes** - Path from root to record
183183+3. **Record block** - The actual record data
592184593593-### 10. Disaster Recovery
185185+Use `util.NewLoggingBstore()` to capture all accessed blocks.
594186595595-- How to backup hold's PDS (crew records, config)?
596596-- Can holds replicate to other holds?
597597-- Image export handles blobs - what about metadata?
187187+## IAM Challenges
598188599599-## Implementation Plan
189189+### Current Implementation: Service Tokens
600190601601-### Phase 1: Basic PDS with Carstore ✅ COMPLETED
602602-603603-**Implementation: Using indigo's carstore with SQLite + DeltaSession**
191191+AppView uses `com.atproto.server.getServiceAuth` to get tokens for calling holds:
604192605193```go
606606-import (
607607- "github.com/bluesky-social/indigo/carstore"
608608- "github.com/bluesky-social/indigo/models"
609609- "github.com/bluesky-social/indigo/repo"
610610-)
611611-612612-type HoldPDS struct {
613613- did string
614614- carstore carstore.CarStore
615615- session *carstore.DeltaSession // Provides blockstore interface
616616- repo *repo.Repo
617617- dbPath string
618618- uid models.Uid // User ID for carstore (fixed: 1)
619619-}
620620-621621-func NewHoldPDS(ctx context.Context, did, dbPath string) (*HoldPDS, error) {
622622- // Create SQLite-backed carstore
623623- sqlStore, err := carstore.NewSqliteStore(dbPath)
624624- sqlStore.Open(dbPath)
625625- cs := sqlStore.CarStore()
626626-627627- // For single-hold use, fixed UID
628628- uid := models.Uid(1)
629629-630630- // Create DeltaSession (provides blockstore interface)
631631- session, err := cs.NewDeltaSession(ctx, uid, nil)
194194+// AppView requests service token from user's PDS
195195+GET /xrpc/com.atproto.server.getServiceAuth?aud={holdDID}&lxm=com.atproto.repo.getRecord
632196633633- // Create repo with session as blockstore
634634- r := repo.NewRepo(ctx, did, session)
197197+// PDS returns short-lived token (60 seconds)
198198+{ "token": "eyJ..." }
635199636636- return &HoldPDS{
637637- did: did,
638638- carstore: cs,
639639- session: session,
640640- repo: r,
641641- dbPath: dbPath,
642642- uid: uid,
643643- }, nil
644644-}
200200+// AppView uses token to authenticate to hold
201201+Authorization: Bearer eyJ...
645202```
646203647647-**Key learnings:**
648648-- ✅ Carstore provides blockstore via `DeltaSession` (not direct access)
649649-- ✅ `models.Uid` is the user ID type (we use fixed UID(1))
650650-- ✅ DeltaSession needs to be a pointer (`*carstore.DeltaSession`)
651651-- ✅ `repo.NewRepo()` accepts the session directly as blockstore
204204+### Known Issues
652205653653-**Storage:**
654654-- Single file: `/var/lib/atcr-hold/hold.db` (SQLite)
655655-- Contains MST nodes, records, commits in carstore tables
656656-- Proper indigo repo/MST implementation (production-tested)
206206+#### 1. RPC Permission Format with IP Addresses
657207658658-**Why SQLite carstore:**
659659-- ✅ Single file persistence (like appview's SQLite)
660660-- ✅ Official indigo storage backend
661661-- ✅ Handles compaction/cleanup automatically
662662-- ✅ Migration path to Postgres/Scylla if needed
663663-- ✅ Easy to replicate (Litestream, LiteFS, rsync)
664664-- ✅ CAR import/export support built-in
208208+**Problem:** Service token RPC permissions don't work with IP addresses in the audience (`aud`) field:
665209666666-**Scale considerations:**
667667-- SQLite carstore marked "experimental" but suitable for single-hold use
668668-- MST designed for massive scale (O(log n) operations)
669669-- 1000 crew records = ~1-2MB database (trivial)
670670-- Bluesky PDSs use carstore for millions of records
671671-- If needed: migrate to Postgres-backed carstore (same API)
672672-673673-### Hold as Proper ATProto User
674674-675675-**Decision:** Make holds full ATProto actors for discoverability and ecosystem integration.
676676-677677-**What this enables:**
678678-- Hold becomes discoverable via ATProto directory
679679-- Can have profile (`app.bsky.actor.profile`)
680680-- Can post status updates (`app.bsky.feed.post`)
681681-- Users can follow holds
682682-- Social proof/reputation via ATProto social graph
683683-684684-**MVP Scope:**
685685-We're building the minimal PDS needed for discoverability, not a full social client:
686686-- ✅ Signing keys (ES256K via `atproto/atcrypto`)
687687-- ✅ DID document (did:web at `/.well-known/did.json`)
688688-- ✅ Standard XRPC endpoints (`describeRepo`, `getRecord`, `listRecords`)
689689-- ✅ Profile record (`app.bsky.actor.profile`)
690690-- ⏸️ Posting functionality (later - other services can read our records)
691691-692692-**Key insight:** Other ATProto services will "just work" as long as they can retrieve records from the hold's PDS. We don't need to implement full social features for the hold to participate in the ecosystem.
693693-694694-### Crew Management: Captain + Individual Records
695695-696696-**Decision: Captain record (ownership) + Individual crew records (access control)**
697697-698698-```json
699699-// io.atcr.hold.captain (single record - hold metadata)
700700-{
701701- "$type": "io.atcr.hold.captain",
702702- "owner": "did:plc:alice123",
703703- "public": false,
704704- "deployedAt": "2025-10-14T...",
705705- "region": "iad",
706706- "provider": "fly.io"
707707-}
708708-709709-// io.atcr.hold.crew/{rkey} (access control)
710710-{
711711- "$type": "io.atcr.hold.crew",
712712- "member": "did:plc:alice123",
713713- "role": "admin", // or "member"
714714- "permissions": ["blob:read", "blob:write"],
715715- "addedAt": "2025-10-14T..."
716716-}
717717-718718-// io.atcr.hold.config/policy (optional)
719719-{
720720- "$type": "io.atcr.hold.config",
721721- "access": "public", // or "allowlist"
722722- "allowAny": true, // public: allow any authenticated user
723723- "requireAuth": true, // require authentication (no anonymous)
724724- "maxUsers": 1000 // optional limit
725725-}
726210```
727727-728728-**Semantic separation:**
729729-- **Captain record** = Who owns/deployed the hold (billing, deletion, migration rights)
730730-- **Crew records** = Who can use the hold (access control, permissions)
731731-- **Config record** = Hold-wide policies
732732-733733-**Authorization logic:**
734734-```go
735735-func (p *HoldPDS) CheckAccess(ctx context.Context, userDID string) (bool, error) {
736736- policy := p.GetPolicy(ctx)
737737-738738- if policy.Access == "public" && policy.AllowAny {
739739- // Public hold - any authenticated ATCR user allowed
740740- // No individual crew record needed
741741- return true, nil
742742- }
743743-744744- if policy.Access == "allowlist" {
745745- // Check explicit crew membership
746746- _, err := p.GetCrewMember(ctx, userDID)
747747- return err == nil, nil
748748- }
749749-750750- return false, nil
751751-}
211211+Error: RPC permission format invalid
212212+Permission: rpc:com.atproto.repo.getRecord?aud=172.28.0.3:8080#atcr_hold
213213+Issue: IP address with port not supported in aud field
752214```
753215754754-**Benefits of individual records:**
755755-- Auditability (track who has access)
756756-- Per-user permissions (admin vs member)
757757-- Explicit revocation capabilities
758758-- Analytics (usage tracking)
759759-- Rate limiting (per-user quotas)
760760-- subscribeRepos events on crew changes
216216+**Impact:** Local development with IP-based hold DIDs (e.g., `did:web:172.28.0.3:8080`) fails.
761217762762-**Use cases:**
763763-- **Public community hold:** `access: "public", allowAny: true` - no crew records needed
764764-- **Private team hold:** `access: "allowlist"` - explicit crew membership
765765-- **Hybrid:** Public access + explicit admin crew records for elevated permissions
218218+**Workaround:** Falls back to unauthenticated requests (works for public holds only) or use hostname-based DIDs.
766219767767-### Phase 2: XRPC Endpoints Implementation ✅ COMPLETED
220220+#### 2. Dynamic Hold Discovery Limitation
768221769769-**Critical Implementation Lessons Learned:**
222222+**Problem:** AppView's OAuth flow only covers the default hold configured in AppView, not holds discovered dynamically from sailor profiles.
770223771771-#### 1. Custom Record Types Require Manual CBOR Decoding
224224+**Current limitation:**
225225+- User sets `defaultHold = "did:web:alice-storage.fly.dev"` in sailor profile
226226+- AppView discovers hold DID when user pushes
227227+- AppView tries to get service token for alice's hold from user's PDS
228228+- BUT: User never OAuth'd through alice's hold, only through AppView's default hold
229229+- Result: No service token available, can't authenticate to alice's hold
772230773773-Indigo's `repo.GetRecord()` uses its lexicon decoder which only knows about built-in ATProto types. For custom types, you must use `GetRecordBytes()` and decode manually:
231231+**Why this matters:**
232232+- Users can't seamlessly use BYOS (Bring Your Own Storage)
233233+- Hold references in sailor profiles are non-functional
234234+- Limits portability and decentralization goals
774235775775-```go
776776-// ❌ WRONG - Fails with "unrecognized lexicon type"
777777-record, err := repo.GetRecord(ctx, path, &CrewRecord{})
236236+#### 3. Trust Model: "Trust but Verify"
778237779779-// ✅ CORRECT - Manual CBOR decoding
780780-recordCID, recBytes, err := repo.GetRecordBytes(ctx, path)
781781-var crewRecord CrewRecord
782782-err = crewRecord.UnmarshalCBOR(bytes.NewReader(*recBytes))
783783-```
238238+**Current approach:**
239239+1. User OAuth's to AppView (credential helper flow)
240240+2. Hold has crew member record for user (authorization)
241241+3. AppView requests service token from user's PDS (proof)
242242+4. Hold validates service token from user's PDS (verification)
784243785785-**Why:** Indigo's lexicon system doesn't know about `io.atcr.hold.crew` or other custom types.
244244+**Philosophy:** "Trust but verify"
245245+- IF the user OAuth'd to AppView AND the hold has a crew member record for that user → generally trust the request
246246+- BUT the hold should not have to take AppView's word for it → it needs proof from the user's PDS that the request really comes from that user
247247+- Service tokens provide this proof (user's PDS says "yes, I authorized this")
786248787787-#### 2. JSON Struct Tags Must Match CBOR Tags Exactly
249249+**Challenge:** Service tokens work for this model, but scope/permission format issues (see #1, #2) make it fragile in practice.
788250789789-For CID verification to work, JSON and CBOR encodings must produce identical bytes:
251251+### Potential Solutions
790252791791-```go
792792-// ❌ WRONG - JSON uses capital field names (Member, Role)
793793-type CrewRecord struct {
794794- Type string `cborgen:"$type"`
795795- Member string `cborgen:"member"`
796796- Role string `cborgen:"role"`
797797- Permissions []string `cborgen:"permissions"`
798798- AddedAt string `cborgen:"addedAt"`
799799-}
253253+#### Option A: Direct User-to-Hold Authentication
800254801801-// ✅ CORRECT - JSON tags match CBOR tags
802802-type CrewRecord struct {
803803- Type string `json:"$type" cborgen:"$type"`
804804- Member string `json:"member" cborgen:"member"`
805805- Role string `json:"role" cborgen:"role"`
806806- Permissions []string `json:"permissions" cborgen:"permissions"`
807807- AddedAt string `json:"addedAt" cborgen:"addedAt"`
808808-}
809809-```
255255+Users authenticate directly to holds (bypassing AppView service tokens).
810256811811-**Why:** Verification code CBOR-encodes the JSON record and compares the CID. Mismatched field names produce different bytes and thus different CIDs.
257257+**Pros:**
258258+- ✅ Clear trust model (user ↔ hold)
259259+- ✅ Works with any hold (BYOS friendly)
260260+- ✅ No OAuth scope issues
812261813813-#### 3. MST ForEach Returns Full Paths
262262+**Cons:**
263263+- ❌ Multiple OAuth flows (user's PDS + each hold)
264264+- ❌ Complex credential management
265265+- ❌ Poor UX (authenticate to each hold separately)
814266815815-The `repo.ForEach()` callback receives full collection paths, not just record keys:
267267+#### Option B: AppView as OAuth Client
816268817817-```go
818818-// ❌ WRONG - Prepends collection prefix again
819819-err := repo.ForEach(ctx, "io.atcr.hold.crew", func(k string, v cid.Cid) error {
820820- // k is already "io.atcr.hold.crew/3m37dr2ddit22"
821821- path := fmt.Sprintf("%s/%s", collection, k) // Double path!
822822- return nil
823823-})
824824-825825-// ✅ CORRECT - Extract just the rkey
826826-err := repo.ForEach(ctx, "io.atcr.hold.crew", func(k string, v cid.Cid) error {
827827- // k = "io.atcr.hold.crew/3m37dr2ddit22"
828828- parts := strings.Split(k, "/")
829829- rkey := parts[len(parts)-1] // "3m37dr2ddit22"
830830- return nil
831831-})
832832-```
833833-834834-#### 4. All Record Endpoints Must Return CIDs
269269+AppView pre-registers with holds and uses its own credentials (not user's).
835270836836-Per ATProto spec, `com.atproto.repo.getRecord` and `listRecords` must include the record's CID:
271271+**Pros:**
272272+- ✅ No OAuth scope issues
273273+- ✅ Single OAuth flow for user
274274+- ✅ Simpler credential management
837275838838-```go
839839-// ✅ CORRECT - Include CID in response
840840-response := map[string]any{
841841- "uri": fmt.Sprintf("at://%s/%s/%s", did, collection, rkey),
842842- "cid": recordCID.String(), // Required!
843843- "value": record,
844844-}
845845-```
276276+**Cons:**
277277+- ❌ Holds must trust AppView (centralization)
278278+- ❌ Doesn't work for unknown holds
279279+- ❌ Requires registration process
846280847847-**Why:** Clients need the CID to verify record integrity via `com.atproto.sync.getRecord`.
281281+#### Option C: Public Hold API
848282849849-#### 5. sync.getRecord CAR Files Must Include Full MST Path
283283+Simplify by making holds public for reads, auth only for writes.
850284851851-The `com.atproto.sync.getRecord` endpoint must return a CAR file with ALL blocks needed to verify the record:
285285+**Pros:**
286286+- ✅ No OAuth complexity for reads
287287+- ✅ Works offline (no PDS dependency)
852288853853-```go
854854-// ❌ WRONG - Only includes the record block
855855-blk, _ := repo.Blockstore().Get(ctx, recordCID)
856856-// Write single block to CAR
289289+**Cons:**
290290+- ❌ Private holds still need auth
291291+- ❌ Not standard ATProto pattern
857292858858-// ✅ CORRECT - Capture all accessed blocks
859859-loggingBS := util.NewLoggingBstore(session)
860860-tempRepo, _ := repo.OpenRepo(ctx, loggingBS, repoHead)
861861-_, _, _ = tempRepo.GetRecordBytes(ctx, path)
862862-blocks := loggingBS.GetLoggedBlocks() // Commit + MST nodes + record
863863-// Write all blocks to CAR
864864-```
293293+#### Option D: Hybrid Service Token + API Key
865294866866-**Components included:**
867867-1. **Commit block** - Repo head with signature, data root, version
868868-2. **MST tree nodes** - Path from root to record (log N depth)
869869-3. **Record block** - The actual record data
295295+Use service tokens when available, fall back to API keys for BYOS holds.
870296871871-**Why:** Clients need the full Merkle path to cryptographically verify the record against the repo head.
297297+**Pros:**
298298+- ✅ Optimal for default holds
299299+- ✅ BYOS works with API keys
300300+- ✅ Backward compatible
872301873873-#### 6. CAR Root Must Be Repo Head, Not Record CID
302302+**Cons:**
303303+- ❌ Two auth mechanisms
304304+- ❌ Not pure ATProto
874305875875-The CAR file's root CID must be the repo head (commit), not the record:
306306+### Recommended Approach
876307877877-```go
878878-// ❌ WRONG - Uses record CID as root
879879-header := &car.CarHeader{
880880- Roots: []cid.Cid{recordCID},
881881- Version: 1,
882882-}
308308+**Short-term (MVP):**
309309+1. Public holds (no auth needed for reads)
310310+2. Default hold with service tokens (AppView-managed)
311311+3. Document BYOS limitation
883312884884-// ✅ CORRECT - Uses repo head as root
885885-repoHead, _ := carstore.GetUserRepoHead(ctx, uid)
886886-header := &car.CarHeader{
887887- Roots: []cid.Cid{repoHead}, // Commit CID
888888- Version: 1,
889889-}
890890-```
313313+**Medium-term:**
314314+1. Hybrid approach (service tokens + API key fallback)
315315+2. Clear security model for hold operators
891316892892-**Why:** The CAR represents a slice of the repo from head to record, not just the record itself.
317317+**Long-term:**
318318+1. Explore direct user-to-hold OAuth
319319+2. Credential helper manages multiple hold sessions
320320+3. Auto-discover and authenticate to new holds
893321894894-#### 7. Empty Collections Should Return Empty Arrays
322322+### Understanding getServiceAuth
895323896896-Handle empty collections gracefully instead of returning errors:
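324324+**Purpose:** `com.atproto.server.getServiceAuth` asks the user's PDS to issue a short-lived JWT, signed on the user's behalf, that can be presented to another service (the `aud`) and optionally limited to a single method (`lxm`). It's a **temporary grant to a service outside of what you OAuth'd to**.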
897325898898-```go
899899-// ✅ CORRECT - Return empty array for missing collection
900900-err := repo.ForEach(ctx, collection, func(k string, v cid.Cid) error {
901901- // ...
902902-})
903903-if err != nil {
904904- if err.Error() == "mst: not found" {
905905- return []*CrewMemberWithKey{}, nil // Empty collection
906906- }
907907- return nil, err // Real error
908908-}
909909-```
326326+**How ATCR uses it:**
327327+- User OAuth's to AppView (gets broad access to their account)
328328+- AppView needs to prove to hold that user authorized it
329329+- AppView calls user's PDS: "give me a token scoped for this hold"
330330+- User's PDS issues service token with narrow scope (e.g., `rpc:com.atproto.repo.getRecord?aud={holdDID}`)
331331+- AppView presents this token to hold as proof
910332911911-**Why:** ATProto expects empty arrays for non-existent collections, not 404 errors.
333333+**Industry usage:**
334334+- `getServiceAuth` appears to be the intended pattern for inter-service auth
335335+- Not widely used yet (ATProto ecosystem is young)
336336+- Most apps use `transition:generic` scope for everything (too broad, not ideal)
337337+- RPC permission scopes are finicky and not well documented
912338913913-### Next Steps
339339+### Open Questions
914340915915-1. ~~**Add indigo dependencies**~~ ✅
916916-2. ~~**Implement HoldPDS with carstore**~~ ✅
917917-3. ~~**Add crew management**~~ ✅
918918-4. ~~**Implement standard PDS endpoints**~~ ✅
919919-5. ~~**Add DID document**~~ ✅
920920-6. **Custom XRPC methods** - getUploadUrl, getDownloadUrl (presigned URLs)
921921-7. **Wire up in cmd/hold** - Serve XRPC alongside existing HTTP
922922-8. **Test basic operations** - Add/list crew, policy checks
923923-9. **Design delegation/IAM** - Token exchange for authenticated operations
924924-10. **Implement AppView XRPC client** - Support PDS-based holds
341341+1. **RPC permission format:** Can the `aud` field in RPC permissions support IP addresses? Is this a spec limitation or implementation bug?
342342+2. **Scope granularity:** What's the right balance between `transition:generic` (too broad) and fine-grained RPC scopes (finicky)?
343343+3. **Dynamic discovery + auth:** How should AppView authenticate to arbitrary holds discovered from sailor profiles without pre-registration?
344344+4. **Service token caching:** Should service tokens be cached across multiple requests? Current: 50 second cache, is this optimal?
925345926346## References
927347928348- **Stream.place embedded PDS:** https://streamplace.leaflet.pub/3lut7mgni5s2k/l-quote/6_318-6_554#6
929349- **ATProto OAuth spec:** https://atproto.com/specs/oauth
930350- **ATProto XRPC spec:** https://atproto.com/specs/xrpc
351351+- **ATProto Service Auth:** https://docs.bsky.app/docs/api/com-atproto-server-get-service-auth
931352- **CID spec:** https://github.com/multiformats/cid
932353- **OCI Distribution Spec:** https://github.com/opencontainers/distribution-spec
+733
docs/IMAGE_SIGNING.md
···11+# Image Signing with ATProto
22+33+ATCR can support cryptographic signing of container images to ensure authenticity and integrity. This document explores different approaches and recommends a design based on Notary v2's plugin architecture adapted for ATProto.
44+55+## Background: Why Not Cosign?
66+77+[Sigstore Cosign](https://github.com/sigstore/cosign) is the most popular OCI image signing tool, but has several incompatibilities with ATProto:
88+99+### 1. Key Format Mismatch
1010+1111+**ATProto PDS keys:**
1212+- Format: secp256k1 (K256) for signing
1313+- Purpose: ATProto record signatures, DID authentication
1414+- Access: Private keys never leave the PDS server
1515+- Standard: ATProto specification
1616+1717+**Cosign expected keys:**
1818+- Format: ECDSA P-256, RSA, or Ed25519
1919+- Purpose: Image signing (not ATProto records)
2020+- Access: User-controlled private keys
2121+- Standard: Sigstore/PKIX
2222+2323+**Problem:** Can't use PDS keys directly for Cosign signing - wrong curve, wrong access model, wrong security boundary.
2424+2525+### 2. No Direct PDS Key Access
2626+2727+**Security model:**
2828+- PDS private keys are server-side secrets
2929+- Never exposed to clients (even authenticated users)
3030+- Used only by PDS for ATProto operations
3131+- Exposing them would compromise entire account security
3232+3333+**Cosign requirement:**
3434+- Needs access to private key for signing operations
3535+- Expects user-controlled keys or KMS integration
3636+3737+**Problem:** Can't sign images client-side with PDS keys without fundamentally breaking the ATProto security model.
3838+3939+### 3. Keyless Signing Complexity
4040+4141+Cosign supports "keyless" signing via OIDC + Fulcio CA:
4242+4343+**What it requires:**
4444+- OIDC identity provider (Google, GitHub, etc.)
4545+- Fulcio certificate authority (issues short-lived certs)
4646+- Rekor transparency log (immutable signature log)
4747+- All infrastructure managed by Sigstore
4848+4949+**ATProto adaptation would need:**
5050+- **OIDC bridge**: Make ATProto DIDs look like OIDC identities
5151+ - Map `did:plc:alice123` → OIDC claims
5252+ - PDS as OIDC provider? (not in spec)
5353+ - Requires custom OIDC server wrapping ATProto auth
5454+- **Fulcio adaptation**: Issue certs based on ATProto identities
5555+ - Deploy and manage CA infrastructure
5656+ - Handle DID resolution in cert issuance
5757+ - Trust anchor distribution
5858+- **Rekor instance**: Public transparency log for signatures
5959+ - High availability requirements
6060+ - Storage and indexing at scale
6161+ - Replication and backup
6262+6363+**Problem:** Too much infrastructure for ATCR to host and manage. Defeats the purpose of a decentralized architecture.
6464+6565+### 4. Signature Storage
6666+6767+**Cosign storage:**
6868+- OCI registry artifacts (signatures as ORAS manifests)
6969+- Stored alongside images in registry
7070+7171+**ATCR ideal:**
7272+- Signatures in ATProto records (user's PDS)
7373+- Discoverable via ATProto queries
7474+- Integrated with ATProto's existing signature/verification model
7575+7676+**Problem:** Would need to patch Cosign or run dual storage (OCI + ATProto) which creates consistency issues.
7777+7878+### Conclusion: Cosign Doesn't Fit
7979+8080+While Cosign is excellent for traditional registries, forcing it into ATProto would require:
8181+- Breaking ATProto security model (exposing PDS keys), OR
8282+- Building massive OIDC/Fulcio/Rekor infrastructure, OR
8383+- Running parallel storage systems with consistency problems
8484+8585+**Better approach:** Use a more flexible signing framework designed for extensibility.
8686+8787+## Notary v2: Plugin-Based Architecture
8888+8989+[Notary v2](https://notaryproject.dev/) (also called "Notation" or "Notary Project") is a CNCF signature specification with a plugin architecture that fits ATProto better.
9090+9191+### Why Notary v2?
9292+9393+**Flexible plugin system:**
9494+- **Trust store plugins**: Custom key resolution (e.g., from ATProto records)
9595+- **Signature plugins**: Custom signature storage (e.g., in PDS)
9696+- **Verification plugins**: Custom verification logic
9797+- Plugins written in any language, communicate via stdio
9898+9999+**Multiple key types supported:**
100100+- ECDSA, RSA, Ed25519 out of the box
101101+- Can support custom key types via plugins
102102+- Signature envelope format is extensible
103103+104104+**Designed for extensibility:**
105105+- Not tied to specific PKI (unlike Cosign/Sigstore)
106106+- Trust policies are configurable
107107+- Storage backend is pluggable
108108+- Works with custom identity systems
109109+110110+**Standard CLI:**
111111+- `notation sign` / `notation verify` commands
112112+- Users don't need to learn new tools
113113+- Integration with Docker/containerd
114114+115115+### Notary v2 Architecture
116116+117117+```
118118+┌─────────────────────┐
119119+│ notation CLI │ User signs/verifies images
120120+└──────────┬──────────┘
121121+ │
122122+ ├─────────────────────────────────────┐
123123+ │ │
124124+┌──────────▼─────────┐ ┌───────────▼──────────┐
125125+│ Signing Plugin │ │ Trust Store Plugin │
126126+│ │ │ │
127127+│ - Read private key │ │ - Resolve DID → PDS │
128128+│ - Generate sig │ │ - Fetch public keys │
129129+│ - Store in PDS │ │ - Verify trust │
130130+└────────────────────┘ └──────────────────────┘
131131+ │ │
132132+ ▼ ▼
133133+┌─────────────────────────────────────────────────────────┐
134134+│ User's PDS (ATProto) │
135135+│ │
136136+│ io.atcr.signing.key (public keys) │
137137+│ io.atcr.signature (signatures) │
138138+└─────────────────────────────────────────────────────────┘
139139+```
140140+141141+## Proposed Design: ATProto Signing
142142+143143+### Key Management
144144+145145+**Separate signing keys from PDS keys:**
146146+147147+1. **User generates signing key pair locally:**
148148+ ```bash
149149+ notation key generate --id alice-signing-key --type ecdsa
150150+ # Or: --type ed25519, --type rsa
151151+ ```
152152+153153+2. **Public key published to ATProto:**
154154+ ```json
155155+ {
156156+ "$type": "io.atcr.signing.key",
157157+ "keyId": "alice-signing-key",
158158+ "keyType": "ecdsa-p256",
159159+ "publicKey": "-----BEGIN PUBLIC KEY-----\nMFkw...",
160160+ "validFrom": "2025-10-20T12:00:00Z",
161161+ "expiresAt": "2026-10-20T12:00:00Z",
162162+ "revoked": false,
163163+ "createdAt": "2025-10-20T12:00:00Z"
164164+ }
165165+ ```
166166+167167+3. **Private key stored locally:**
168168+ - Docker credential store
169169+ - OS keychain (macOS Keychain, Windows Credential Manager)
170170+ - File with restrictive permissions
171171+ - Hardware security module (future)
172172+173173+**Why separate keys?**
174174+- ✅ No need to access PDS private keys
175175+- ✅ Standard key formats (ECDSA, Ed25519, RSA)
176176+- ✅ User controls key lifecycle
177177+- ✅ Can use hardware tokens (YubiKey, etc.)
178178+- ✅ Security boundary separation (signing ≠ identity)
179179+- ✅ Key rotation without changing DID
180180+181181+### Signing Flow
182182+183183+```
184184+1. User: notation sign atcr.io/alice/myapp:latest --key alice-signing-key
185185+186186+2. notation-atproto plugin:
187187+ a. Resolve image → manifest digest
188188+ b. Read private key from local keystore
189189+ c. Generate signature over manifest digest
190190+ d. Get OAuth token for alice's PDS
191191+ e. Create signature record in alice's PDS
192192+193193+3. Signature stored in alice's PDS:
194194+ {
195195+ "$type": "io.atcr.signature",
196196+ "repository": "alice/myapp",
197197+ "digest": "sha256:abc123...",
198198+ "signature": "MEUCIQDx...", // base64 signature bytes
199199+ "keyId": "alice-signing-key",
200200+ "signatureAlgorithm": "ecdsa-p256-sha256",
201201+ "signedAt": "2025-10-20T12:34:56Z"
202202+ }
203203+204204+4. Record key: sha256 of (digest + keyId) for deduplication
205205+```
206206+207207+### Verification Flow
208208+209209+```
210210+1. User: notation verify atcr.io/alice/myapp:latest
211211+212212+2. notation-atproto plugin:
213213+ a. Resolve "alice" → did:plc:alice123 → pds.alice.com
214214+ b. Fetch manifest digest: sha256:abc123
215215+ c. Query alice's PDS for signatures:
216216+ GET /xrpc/com.atproto.repo.listRecords?
217217+ repo=did:plc:alice123&
218218+ collection=io.atcr.signature
219219+ d. Filter records matching digest: sha256:abc123
220220+ e. For each signature:
221221+ - Fetch public key from io.atcr.signing.key record
222222+ - Check key not revoked, not expired
223223+ - Verify signature bytes over digest
224224+ - Check trust policy (is this key trusted?)
225225+226226+3. Trust policy evaluation:
227227+ - Signature valid cryptographically? ✓
228228+ - Key belongs to image owner (alice)? ✓
229229+ - Key not revoked? ✓
230230+ - Key not expired? ✓
231231+ - Trust policy satisfied? ✓
232232+233233+4. Output: Verification succeeded ✓
234234+```
235235+236236+### Trust Policies
237237+238238+Notary v2 uses trust policies to define what signatures are required:
239239+240240+```json
241241+{
242242+ "version": "1.0",
243243+ "trustPolicies": [
244244+ {
245245+ "name": "atcr-images",
246246+ "registryScopes": ["atcr.io/*/*"],
247247+ "signatureVerification": {
248248+ "level": "strict"
249249+ },
250250+ "trustStores": ["atproto:default"],
251251+ "trustedIdentities": [
252252+ "did:plc:*" // Trust any did:plc identity (does not match did:web DIDs)
253253+ ]
254254+ }
255255+ ]
256256+}
257257+```
258258+259259+**Policy options:**
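260260+- `level: strict` - Signature required; every validation must pass
261261+- `level: permissive` - Signature required; integrity and authenticity are enforced, while expiry and timestamp failures are only logged
262262+- `level: audit` - Signature integrity is enforced; all other validation failures are logged but don't block
263263+- `level: skip` - No signature verification is performed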
264264+265265+**Trust store resolution:**
266266+- `atproto:default` - Use ATProto plugin to resolve keys
267267+- Plugin queries user's PDS for `io.atcr.signing.key` records
268268+- Verifies key is owned by the image owner (DID match)
269269+270270+### ATProto Records
271271+272272+**io.atcr.signing.key** - Public signing keys
273273+274274+```json
275275+{
276276+ "$type": "io.atcr.signing.key",
277277+ "keyId": "alice-signing-key",
278278+ "keyType": "ecdsa-p256",
279279+ "publicKey": "-----BEGIN PUBLIC KEY-----\nMFkwEwYHKoZI...",
280280+ "validFrom": "2025-10-20T12:00:00Z",
281281+ "expiresAt": "2026-10-20T12:00:00Z",
282282+ "revoked": false,
283283+ "purpose": ["image-signing"],
284284+ "createdAt": "2025-10-20T12:00:00Z"
285285+}
286286+```
287287+288288+**Record key:** `keyId` (user-chosen identifier)
289289+290290+**Fields:**
291291+- `keyId`: Unique identifier for this key
292292+- `keyType`: Algorithm (ecdsa-p256, ed25519, rsa-2048, rsa-4096)
293293+- `publicKey`: PEM-encoded public key
294294+- `validFrom`: Key becomes valid at this time
295295+- `expiresAt`: Key expires at this time (null = no expiry)
296296+- `revoked`: Key has been revoked (true/false)
297297+- `purpose`: Array of purposes (image-signing, sbom-signing, etc.)
298298+299299+**io.atcr.signature** - Image signatures
300300+301301+```json
302302+{
303303+ "$type": "io.atcr.signature",
304304+ "repository": "alice/myapp",
305305+ "digest": "sha256:abc123...",
306306+ "signature": "MEUCIQDxH7...",
307307+ "keyId": "alice-signing-key",
308308+ "signatureAlgorithm": "ecdsa-p256-sha256",
309309+ "signedAt": "2025-10-20T12:34:56Z",
310310+ "createdAt": "2025-10-20T12:34:56Z"
311311+}
312312+```
313313+314314+**Record key:** SHA256 hash of `(digest || keyId)` for deduplication
315315+316316+**Fields:**
317317+- `repository`: Image repository (alice/myapp)
318318+- `digest`: Manifest digest being signed
319319+- `signature`: Base64-encoded signature bytes
320320+- `keyId`: Reference to signing key record
321321+- `signatureAlgorithm`: Algorithm used for signing
322322+- `signedAt`: When signature was created
323323+324324+### Plugin Implementation
325325+326326+**notation-atproto** - Notary v2 plugin for ATProto
327327+328328+**Trust store plugin:**
329329+```go
330330+// Implements: notation trust store plugin spec
331331+// https://notaryproject.dev/docs/user-guides/how-to/plugin-management/
332332+333333+type ATProtoTrustStore struct {
334334+ resolver *atproto.Resolver
335335+ client *atproto.Client
336336+}
337337+338338+// GetKeys resolves public keys for a given identity (DID)
339339+func (t *ATProtoTrustStore) GetKeys(did string) ([]PublicKey, error) {
340340+ // 1. Resolve DID → PDS endpoint
341341+ pds, err := t.resolver.ResolvePDS(did)
342342+343343+ // 2. Query PDS for io.atcr.signing.key records
344344+ records, err := t.client.ListRecords(pds, did, "io.atcr.signing.key")
345345+346346+ // 3. Filter active keys (not revoked, not expired)
347347+ keys := []PublicKey{}
348348+ for _, record := range records {
349349+ if !record.Revoked && !record.Expired() {
350350+ keys = append(keys, ParsePublicKey(record.PublicKey))
351351+ }
352352+ }
353353+354354+ return keys, nil
355355+}
356356+```
357357+358358+**Signature store plugin:**
359359+```go
360360+// Store signature in user's PDS
361361+func (s *ATProtoSignatureStore) StoreSignature(sig Signature) error {
362362+ // 1. Get OAuth token for user's PDS
363363+ token, err := s.oauthClient.GetToken()
364364+365365+ // 2. Create signature record
366366+ record := SignatureRecord{
367367+ Type: "io.atcr.signature",
368368+ Repository: sig.Repository,
369369+ Digest: sig.Digest,
370370+ Signature: base64.Encode(sig.Bytes),
371371+ KeyId: sig.KeyId,
372372+ SignatureAlgorithm: sig.Algorithm,
373373+ SignedAt: time.Now(),
374374+ }
375375+376376+ // 3. Generate record key (hash of digest + keyId)
377377+ rkey := sha256.Sum256([]byte(sig.Digest + sig.KeyId))
378378+379379+ // 4. Write to PDS
380380+ err = s.client.PutRecord(pds, did, "io.atcr.signature", hex.Encode(rkey), record)
381381+382382+ return err
383383+}
384384+385385+// Retrieve signatures for a digest
386386+func (s *ATProtoSignatureStore) GetSignatures(did, digest string) ([]Signature, error) {
387387+ // Query PDS for matching signatures
388388+ records, err := s.client.ListRecords(pds, did, "io.atcr.signature")
389389+390390+ // Filter by digest
391391+ sigs := []Signature{}
392392+ for _, record := range records {
393393+ if record.Digest == digest {
394394+ sigs = append(sigs, ParseSignature(record))
395395+ }
396396+ }
397397+398398+ return sigs, nil
399399+}
400400+```
401401+402402+**Plugin installation:**
403403+```bash
404404+# Install notation CLI
405405+brew install notation
406406+407407+# Install ATProto plugin
408408+notation plugin install notation-atproto --version v1.0.0
409409+410410+# Configure trust policy
411411+cat > ~/.config/notation/trustpolicy.json <<EOF
412412+{
413413+ "version": "1.0",
414414+ "trustPolicies": [
415415+ {
416416+ "name": "atcr-images",
417417+ "registryScopes": ["atcr.io/*/*"],
418418+ "signatureVerification": {"level": "strict"},
419419+ "trustStores": ["atproto:default"],
420420+ "trustedIdentities": ["did:plc:*"]
421421+ }
422422+ ]
423423+}
424424+EOF
425425+```
426426+427427+## User Workflows
428428+429429+### Initial Setup
430430+431431+```bash
432432+# 1. Generate signing key pair
433433+notation key generate --id alice-signing-key --type ecdsa
434434+435435+# Private key stored in: ~/.config/notation/keys/
436436+# Public key extracted by plugin
437437+438438+# 2. Publish public key to PDS
439439+notation-atproto key publish alice-signing-key
440440+441441+# Plugin uploads io.atcr.signing.key record to alice's PDS
442442+# Requires OAuth authentication to alice's PDS
443443+444444+# 3. Verify key is published
445445+notation-atproto key list
446446+447447+# Output:
448448+# alice-signing-key (ecdsa-p256) - Active
449449+# Published: 2025-10-20T12:00:00Z
450450+# Expires: 2026-10-20T12:00:00Z
451451+# DID: did:plc:alice123
452452+```
453453+454454+### Signing Images
455455+456456+```bash
457457+# Sign an image after pushing
458458+docker push atcr.io/alice/myapp:latest
459459+460460+notation sign atcr.io/alice/myapp:latest \
461461+ --key alice-signing-key \
462462+ --plugin atproto
463463+464464+# Plugin:
465465+# 1. Reads private key from ~/.config/notation/keys/
466466+# 2. Signs manifest digest
467467+# 3. Uploads signature to alice's PDS (io.atcr.signature record)
468468+# 4. Returns success
469469+470470+# Output:
471471+# Successfully signed atcr.io/alice/myapp:latest
472472+# Signature stored in PDS: did:plc:alice123
473473+```
474474+475475+### Verifying Images
476476+477477+```bash
478478+# Verify before running
479479+notation verify atcr.io/alice/myapp:latest
480480+481481+# Plugin:
482482+# 1. Resolves "alice" → did:plc:alice123 → pds.alice.com
483483+# 2. Fetches manifest digest
484484+# 3. Queries alice's PDS for signatures
485485+# 4. Fetches public key from io.atcr.signing.key
486486+# 5. Verifies signature cryptographically
487487+# 6. Checks trust policy
488488+489489+# Output:
490490+# ✓ Signature verification succeeded
491491+#
492492+# Signed by: did:plc:alice123
493493+# Key ID: alice-signing-key
494494+# Signed at: 2025-10-20T12:34:56Z
495495+# Algorithm: ecdsa-p256-sha256
496496+```
497497+498498+### Key Rotation
499499+500500+```bash
501501+# Generate new key
502502+notation key generate --id alice-signing-key-2 --type ecdsa
503503+504504+# Publish new key
505505+notation-atproto key publish alice-signing-key-2
506506+507507+# Re-sign images with new key
508508+notation sign atcr.io/alice/myapp:latest --key alice-signing-key-2
509509+510510+# Revoke old key
511511+notation-atproto key revoke alice-signing-key
512512+513513+# Plugin updates io.atcr.signing.key record:
514514+# { ..., "revoked": true, "revokedAt": "2025-11-01T..." }
515515+516516+# Old signatures still exist but verification will fail
517517+# (revoked key = untrusted)
518518+```
519519+520520+### Key Expiration
521521+522522+```bash
523523+# Generate key with expiration
524524+notation key generate \
525525+ --id alice-signing-key \
526526+ --type ecdsa \
527527+ --expires 365d # 1 year
528528+529529+# Publish with expiration
530530+notation-atproto key publish alice-signing-key
531531+532532+# PDS record:
533533+# {
534534+# "validFrom": "2025-10-20T12:00:00Z",
535535+# "expiresAt": "2026-10-20T12:00:00Z"
536536+# }
537537+538538+# After expiration, verification fails:
539539+notation verify atcr.io/alice/myapp:latest
540540+# ✗ Signature verification failed
541541+# Signing key expired on 2026-10-20T12:00:00Z
542542+```
543543+544544+## Security Considerations
545545+546546+### Key Storage
547547+548548+**Private keys must be protected:**
549549+- File permissions: `0600` (owner read/write only)
550550+- Use OS keychain when possible (macOS Keychain, Windows Credential Manager)
551551+- Consider hardware tokens (YubiKey, TPM) for production
552552+- Never commit private keys to git
553553+554554+**Public keys are public:**
555555+- Stored in user's PDS (publicly readable)
556556+- Anyone can verify signatures
557557+- Revocation is public and immediate
558558+559559+### Trust Model
560560+561561+**What signatures prove:**
562562+- ✅ Image manifest hasn't been tampered with since signing
563563+- ✅ Signer had access to private key at signing time
564564+- ✅ Signer's DID matches image owner (alice signed alice/myapp)
565565+566566+**What signatures don't prove:**
567567+- ❌ Image is free of vulnerabilities
568568+- ❌ Image contents are safe to run
569569+- ❌ Signer's identity is verified (depends on DID trust)
570570+571571+**Trust anchors:**
572572+- Trust PDS to correctly serve signing key records
573573+- Trust DID resolution (PLC directory, did:web DNS)
574574+- Trust signature algorithms (ECDSA, Ed25519, RSA)
575575+- Trust user to protect their private keys
576576+577577+### Key Compromise
578578+579579+If a private signing key is compromised:
580580+581581+```bash
582582+# 1. Immediately revoke the key
583583+notation-atproto key revoke alice-signing-key --reason "Key compromised"
584584+585585+# 2. Generate new key
586586+notation key generate --id alice-signing-key-new --type ecdsa
587587+588588+# 3. Publish new key
589589+notation-atproto key publish alice-signing-key-new
590590+591591+# 4. Re-sign all images with new key
592592+for image in $(docker images --format "{{.Repository}}:{{.Tag}}"); do
593593+ notation sign $image --key alice-signing-key-new
594594+done
595595+596596+# 5. Alert users to only trust new key
597597+# (Old signatures will fail verification due to revocation)
598598+```
599599+600600+**Revocation is immediate:**
601601+- PDS record updated with `"revoked": true`
602602+- All verification attempts fail instantly
603603+- No need to update certificate revocation lists (CRLs)
604604+- ATProto record queries are always fresh
605605+606606+### Multiple Signatures
607607+608608+Images can have multiple signatures:
609609+610610+```bash
611611+# Alice signs with her key
612612+notation sign atcr.io/alice/myapp:latest --key alice-signing-key
613613+614614+# CI/CD system signs with separate key
615615+notation sign atcr.io/alice/myapp:latest --key ci-signing-key
616616+617617+# Both signatures stored in alice's PDS
618618+# Verification requires both (configurable in trust policy)
619619+```
620620+621621+**Trust policy:**
622622+```json
623623+{
624624+ "trustPolicies": [{
625625+ "name": "require-dual-signature",
626626+ "registryScopes": ["atcr.io/alice/*"],
627627+ "signatureVerification": {
628628+ "level": "strict",
629629+ "verifyTimestamp": true,
630630+ "override": {
631631+ "all": ["alice-signing-key", "ci-signing-key"]
632632+ }
633633+ }
634634+ }]
635635+}
636636+```
637637+638638+## Implementation Roadmap
639639+640640+### Phase 1: Core Plugin (4-6 weeks)
641641+642642+**Week 1-2: Trust store plugin**
643643+- Implement DID resolution
644644+- Query `io.atcr.signing.key` records
645645+- Parse and validate public keys
646646+- Handle revocation and expiration
647647+648648+**Week 3-4: Signature store plugin**
649649+- OAuth integration for PDS writes
650650+- Create `io.atcr.signature` records
651651+- Query signatures for verification
652652+- Handle record key generation
653653+654654+**Week 5-6: Integration testing**
655655+- End-to-end sign/verify workflows
656656+- Key rotation scenarios
657657+- Revocation handling
658658+- Multi-signature support
659659+660660+### Phase 2: Tooling (2-3 weeks)
661661+662662+**CLI commands:**
663663+```bash
664664+notation-atproto key generate
665665+notation-atproto key publish
666666+notation-atproto key list
667667+notation-atproto key revoke
668668+notation-atproto signature list <image>
669669+notation-atproto signature inspect <image>
670670+```
671671+672672+**Helper utilities:**
673673+- Bulk re-signing for key rotation
674674+- Signature audit logs
675675+- Trust policy generators
676676+- Key lifecycle management
677677+678678+### Phase 3: AppView Integration (2-3 weeks)
679679+680680+**Web UI features:**
681681+- Display signature status on repository pages
682682+- Show signing keys for users
683683+- Signature verification badges
684684+- Key management interface
685685+686686+**API endpoints:**
687687+- `GET /v2/alice/myapp/signatures` - List signatures for image
688688+- `GET /v2/alice/keys` - List user's signing keys
689689+- `POST /v2/alice/keys/revoke` - Revoke key via web UI
690690+691691+### Phase 4: Advanced Features (ongoing)
692692+693693+**Hardware token support:**
694694+- YubiKey integration
695695+- TPM-backed keys
696696+- Hardware-backed keystores
697697+698698+**Timestamp verification:**
699699+- Trusted timestamp authorities
700700+- Prove signature was created at specific time
701701+- Long-term signature validity
702702+703703+**SBOM signing:**
704704+- Sign SBOMs with same keys
705705+- Link SBOM signatures to image signatures
706706+- Unified verification workflow
707707+708708+## Comparison: Cosign vs Notary v2 for ATCR
709709+710710+| Feature | Cosign | Notary v2 | Winner |
711711+|---------|--------|-----------|--------|
712712+| **ATProto integration** | Requires OIDC bridge | Plugin system | ✅ Notary |
713713+| **Key format flexibility** | Limited | Extensible | ✅ Notary |
714714+| **Custom storage** | OCI only | Pluggable | ✅ Notary |
715715+| **Infrastructure needs** | Fulcio + Rekor | None | ✅ Notary |
716716+| **Keyless signing** | Yes (complex) | No | ⚠️ Cosign* |
717717+| **Ecosystem maturity** | High | Medium | ⚠️ Cosign* |
718718+| **CLI simplicity** | Very simple | Simple | ⚠️ Cosign* |
719719+| **Plugin development** | N/A | Required | ⚠️ Mixed |
720720+721721+*Cosign advantages don't outweigh ATProto incompatibilities
722722+723723+**Recommendation: Notary v2 with ATProto plugin**
724724+725725+## References
726726+727727+- [Notary v2 Specification](https://notaryproject.dev/)
728728+- [Notation CLI](https://github.com/notaryproject/notation)
729729+- [Notary Plugin Specification](https://notaryproject.dev/docs/user-guides/how-to/plugin-management/)
730730+- [Sigstore Cosign](https://github.com/sigstore/cosign) (for comparison)
731731+- [ATProto Specification](https://atproto.com/)
732732+- [OCI Artifacts](https://github.com/opencontainers/artifacts)
733733+- [RFC 7515 - JSON Web Signature](https://datatracker.ietf.org/doc/html/rfc7515) (signature formats)
+365-72
docs/SAILOR.md
···11-User approved Claude's plan:
22- ⎿ Implement Sailor Profile & Hold Discovery System
11+# Sailor Profile System
22+33+## Overview
44+55+The sailor profile system allows users to choose which hold (storage service) to use for their container images. This enables:
66+- **Personal holds** - Use your own S3/Storj/Minio storage
77+- **Shared holds** - Join a team or community hold
88+- **Default holds** - Use AppView's default storage (free tier)
99+- **Transparent infrastructure** - Hold choice doesn't affect image URL
1010+1111+## Concepts
1212+1313+**Sailor Profile** (`io.atcr.sailor.profile`):
1414+- Record stored in user's PDS
1515+- Contains `defaultHold` preference (DID or URL)
1616+- Created automatically on first authentication
1717+- Managed via web UI or ATProto client
1818+1919+**Hold Discovery Priority**:
2020+1. User's sailor profile `defaultHold` (if set)
2121+2. User's own hold records (`io.atcr.hold`) - legacy
2222+3. AppView's `default_hold_did` configuration
2323+2424+## Sailor Profile Record
2525+2626+```json
2727+{
2828+ "$type": "io.atcr.sailor.profile",
2929+ "defaultHold": "did:web:hold.example.com",
3030+ "createdAt": "2025-10-02T12:00:00Z",
3131+ "updatedAt": "2025-10-02T12:00:00Z"
3232+}
3333+```
3434+3535+**Fields:**
3636+- `defaultHold` (string, optional) - Hold DID or URL (auto-normalized to DID)
3737+- `createdAt` (datetime, required) - Profile creation timestamp
3838+- `updatedAt` (datetime, required) - Last update timestamp
3939+4040+**Record key:** Always `"self"` (only one profile per user)
4141+4242+**Collection:** `io.atcr.sailor.profile`
4343+4444+## Profile Management
4545+4646+### Automatic Creation
4747+4848+Profiles are created automatically on first authentication:
4949+5050+```go
5151+// During OAuth login or Basic Auth token exchange
5252+func (h *Handler) HandleCallback(w http.ResponseWriter, r *http.Request) {
5353+ // ... OAuth flow ...
5454+5555+ // Create ATProto client with user's OAuth session
5656+ client := atproto.NewClientWithIndigoClient(pdsEndpoint, did, apiClient)
5757+5858+ // Ensure profile exists (creates with AppView's default if not)
5959+ err := atproto.EnsureProfile(ctx, client, appViewDefaultHoldDID)
6060+}
6161+```
6262+6363+**Behavior:**
6464+- If profile exists → no-op
6565+- If profile doesn't exist → creates with `defaultHold` set to AppView's default
6666+- If AppView has no default configured → creates with empty `defaultHold`
6767+6868+### Web UI Management
6969+7070+Users can update their profile via the settings page (`/settings`):
7171+7272+**View current profile:**
7373+```
7474+GET /settings
7575+→ Shows current defaultHold value
7676+```
7777+7878+**Update defaultHold:**
7979+```
8080+POST /api/settings/update-hold
8181+Form data: hold_endpoint=did:web:team-hold.fly.dev
8282+8383+→ Updates sailor profile in user's PDS
8484+→ Returns success confirmation
8585+```
8686+8787+**Implementation** (`pkg/appview/handlers/settings.go`):
8888+- Requires OAuth session (user must be logged in)
8989+- Fetches existing profile or creates new one
9090+- Normalizes URLs to DIDs automatically
9191+- Updates `updatedAt` timestamp
9292+9393+### ATProto Client Management
9494+9595+Users can also manage their profile using standard ATProto tools:
9696+9797+**Get profile:**
9898+```bash
9999+atproto get-record \
100100+ --collection io.atcr.sailor.profile \
101101+ --rkey self
102102+```
103103+104104+**Update profile:**
105105+```bash
106106+atproto put-record \
107107+ --collection io.atcr.sailor.profile \
108108+ --rkey self \
109109+ --value '{
110110+ "$type": "io.atcr.sailor.profile",
111111+ "defaultHold": "did:web:my-hold.example.com",
112112+ "createdAt": "2025-10-02T12:00:00Z", "updatedAt": "2025-10-20T12:00:00Z"
113113+ }'
114114+```
115115+116116+**Clear default hold** (opt out):
117117+```bash
118118+atproto put-record \
119119+ --collection io.atcr.sailor.profile \
120120+ --rkey self \
121121+ --value '{
122122+ "$type": "io.atcr.sailor.profile",
123123+ "defaultHold": "",
124124+ "createdAt": "2025-10-02T12:00:00Z", "updatedAt": "2025-10-20T12:00:00Z"
125125+ }'
126126+```
127127+128128+## URL-to-DID Migration
129129+130130+The system automatically migrates old URL-based `defaultHold` values to DID format for consistency:
131131+132132+**Old format (deprecated):**
133133+```json
134134+{
135135+ "defaultHold": "https://hold.example.com"
136136+}
137137+```
138138+139139+**New format (preferred):**
140140+```json
141141+{
142142+ "defaultHold": "did:web:hold.example.com"
143143+}
144144+```
145145+146146+**Migration behavior:**
147147+- `GetProfile()` detects URL format automatically
148148+- Converts URL → DID transparently (strips protocol, converts to `did:web:`)
149149+- Persists migration to PDS in background goroutine
150150+- Uses locks to prevent duplicate migrations
151151+- Completely transparent to user
152152+153153+**Why DIDs?**
154154+- **Portable**: DIDs work offline, URLs require DNS
155155+- **Canonical**: One DID per hold, multiple URLs possible
156156+- **Standard**: ATProto uses DIDs for identity
157157+158158+## Hold Discovery Flow
159159+160160+When a user pushes an image, AppView discovers which hold to use:
161161+162162+```
163163+1. User: docker push atcr.io/alice/myapp:latest
164164+165165+2. AppView resolves alice → did:plc:alice123
166166+167167+3. AppView calls findHoldDID(did, pdsEndpoint):
168168+ a. Query alice's PDS for io.atcr.sailor.profile/self
169169+ b. If profile.defaultHold is set → use it
170170+ c. Else check alice's io.atcr.hold records (legacy)
171171+ d. Else use AppView's default_hold_did
172172+173173+4. Found: alice.profile.defaultHold = "did:web:team-hold.fly.dev"
174174+175175+5. AppView uses team-hold.fly.dev for blob storage
176176+177177+6. Manifest stored in alice's PDS includes:
178178+ - holdDid: "did:web:team-hold.fly.dev" (for future pulls)
179179+ - holdEndpoint: "https://team-hold.fly.dev" (backward compat)
180180+```
181181+182182+**Implementation** (`pkg/appview/middleware/registry.go:findHoldDID()`):
183183+184184+```go
185185+func (nr *NamespaceResolver) findHoldDID(ctx context.Context, did, pdsEndpoint string) string {
186186+ client := atproto.NewClient(pdsEndpoint, did, "")
187187+188188+ // 1. Check sailor profile
189189+ profile, err := atproto.GetProfile(ctx, client)
190190+ if profile != nil && profile.DefaultHold != "" {
191191+ return profile.DefaultHold // DID or URL (auto-normalized)
192192+ }
193193+194194+ // 2. Check own hold records (legacy)
195195+ records, _ := client.ListRecords(ctx, "io.atcr.hold", 10)
196196+ for _, record := range records {
197197+ // Return first hold's endpoint
198198+ if holdRecord.Endpoint != "" {
199199+ return atproto.ResolveHoldDIDFromURL(holdRecord.Endpoint)
200200+ }
201201+ }
202202+203203+ // 3. Use AppView default
204204+ return nr.defaultHoldDID
205205+}
206206+```
207207+208208+## Use Cases
209209+210210+### 1. Default Hold (Free Tier)
211211+212212+User doesn't need to do anything:
213213+214214+```
215215+1. User authenticates to atcr.io
216216+2. Profile created with defaultHold = AppView's default
217217+3. User pushes images → blobs go to default hold
218218+```
219219+220220+**Profile:**
221221+```json
222222+{
223223+ "defaultHold": "did:web:hold01.atcr.io"
224224+}
225225+```
322644- Summary
227227+### 2. Join Team Hold
522866- Add io.atcr.sailor.profile record type to manage user's default hold preference, and update manifest to store historical hold endpoint reference. This enables transparent hold
77- routing while preserving image ownership semantics.
229229+User joins a shared team hold:
823099- Changes Required
231231+```
232232+1. Team admin deploys hold service (did:web:team-hold.fly.dev)
233233+2. Team admin adds user to crew (via hold's PDS)
234234+3. User updates profile:
235235+ - Via web UI: /settings → set hold to "did:web:team-hold.fly.dev"
236236+ - Or via ATProto client: put-record
237237+4. User pushes images → blobs go to team hold
238238+```
102391111- 1. Create Sailor Profile Lexicon
240240+**Profile:**
241241+```json
242242+{
243243+ "defaultHold": "did:web:team-hold.fly.dev"
244244+}
245245+```
122461313- File: lexicons/io/atcr/sailor/profile.json
1414- - New record type: io.atcr.sailor.profile
1515- - Fields: defaultHold (string, nullable), createdAt, updatedAt
247247+**Benefits:**
248248+- Team pays for storage (not individual users)
249249+- Centralized access control
250250+- Shared bandwidth limits
162511717- 2. Update Manifest Lexicon
252252+### 3. Personal Hold (BYOS)
182531919- File: lexicons/io/atcr/manifest.json
2020- - Add holdEndpoint field (string, required)
2121- - This is historical reference (immutable per manifest)
254254+User deploys their own hold:
222552323- 3. Update Go Types
256256+```
257257+1. User deploys hold service to Fly.io (did:web:alice-hold.fly.dev)
258258+2. Hold auto-creates captain + crew records on first run
259259+3. User updates profile to use their hold
260260+4. User pushes images → blobs go to personal hold
261261+```
242622525- File: pkg/atproto/lexicon.go
2626- - Add SailorProfileCollection = "io.atcr.sailor.profile"
2727- - Add SailorProfileRecord struct
2828- - Add NewSailorProfileRecord() constructor
2929- - Update ManifestRecord struct to include HoldEndpoint field
263263+**Profile:**
264264+```json
265265+{
266266+ "defaultHold": "did:web:alice-hold.fly.dev"
267267+}
268268+```
302693131- 4. Create Profile Management
270270+**Benefits:**
271271+- Full control over storage
272272+- Choose storage provider (S3, Storj, Minio, etc.)
273273+- No quotas/limits (except what you pay for)
322743333- File: pkg/atproto/profile.go (new file)
3434- - EnsureProfile(ctx, client, defaultHoldDID) function
3535- - Logic: check if profile exists, create with default if not
275275+### 4. Opt Out of Defaults
362763737- 5. Update Auth Handlers
277277+User wants to use only their own hold records (legacy model):
382783939- Files: pkg/auth/exchange/handler.go and pkg/auth/token/service.go
4040- - Call EnsureProfile() after token validation
4141- - Use authenticated client (has write access to user's PDS)
4242- - Pass AppView's default_hold_did config (format: "did:web:hold01.atcr.io")
279279+```json
280280+{
281281+ "defaultHold": ""
282282+}
283283+```
432844444- 6. Update Hold Resolution
285285+**Behavior:**
286286+- Profile's `defaultHold` is empty (or null), so hold discovery skips it
287287+- Falls back to `io.atcr.hold` records in user's PDS
288288+- If no hold records found → uses AppView default
452894646- File: pkg/middleware/registry.go
4747- - Update findStorageEndpoint() priority:
4848- a. Check io.atcr.sailor.profile.defaultHold
4949- b. If null (opted out): check user's io.atcr.hold, then AppView default
5050- c. If no profile: check user's io.atcr.hold, then AppView default
290290+## Architecture Notes
512915252- 7. Store Hold in Manifest
292292+### Why Sailor Profile?
532935454- File: pkg/atproto/manifest_store.go
5555- - When creating manifest, include resolved holdEndpoint
5656- - Pass hold endpoint through context or parameter
294294+**Problem solved:**
295295+- Users can be crew members of multiple holds
296296+- Need explicit way to choose which hold to use
297297+- Want to support both personal and shared holds
572985858- 8. Update Pull to Use Manifest Hold
299299+**Without sailor profile:**
300300+```
301301+Alice is crew of:
302302+- team-hold.fly.dev (team storage)
303303+- community-hold.fly.dev (community storage)
593046060- File: pkg/atproto/manifest_store.go and pkg/storage/routing_repository.go
6161- - On pull, extract holdEndpoint from manifest
6262- - Route blob requests to that hold (not via discovery)
305305+Which one should AppView use? 🤔
306306+```
633076464- 9. Update Documentation
308308+**With sailor profile:**
309309+```
310310+Alice sets profile.defaultHold = "did:web:team-hold.fly.dev"
311311+→ AppView knows to use team hold
312312+→ Alice can change anytime via settings
313313+```
314314+315315+### Image Ownership vs Hold Choice
316316+317317+**Key insight:** Image ownership stays with the user, hold is just infrastructure.
318318+319319+**URL structure:** `atcr.io/<owner>/<image>:<tag>`
320320+- Owner = Alice (clear ownership)
321321+- Hold = Team storage (infrastructure detail)
322322+323323+**Analogy:** Like choosing an S3 region
324324+- Your files, your ownership
325325+- Region is just where bits live
326326+- Can move regions without changing ownership
327327+328328+### Historical Hold References
329329+330330+Manifests store `holdDid` for immutable blob location tracking:
331331+332332+```json
333333+{
334334+ "digest": "sha256:abc123",
335335+ "holdDid": "did:web:team-hold.fly.dev",
336336+ "holdEndpoint": "https://team-hold.fly.dev",
337337+ "layers": [...]
338338+}
339339+```
340340+341341+**Why store hold in manifest?**
342342+- Pull uses historical reference (not re-discovered)
343343+- Image stays pullable even if user changes defaultHold
344344+- Blobs fetched from where they were originally pushed
345345+- Immutable references (manifests don't change)
346346+347347+**Hold cache:**
348348+- In-memory cache: `(userDID, repository) → holdDid`
349349+- TTL: 10 minutes (covers typical pull operation)
350350+- Avoids re-querying PDS for every blob
351351+352352+## Configuration
353353+354354+### AppView Configuration
653556666- Files: CLAUDE.md, docs/BYOS.md, .env.example
6767- - Document sailor profile concept
6868- - Explain hold resolution priority
6969- - Update examples for shared holds
7070- - Document how crew members configure profile
356356+```bash
357357+# Default hold for new users
358358+ATCR_DEFAULT_HOLD_DID=did:web:hold01.atcr.io
359359+360360+# Test mode: fallback to default if user's hold unreachable
361361+ATCR_TEST_MODE=false
362362+```
713637272- Benefits
364364+**Test mode behavior:**
365365+- Checks if user's defaultHold is reachable (HTTP/HTTPS)
366366+- Falls back to AppView default if unreachable
367367+- Useful for local development (prevents errors from unreachable holds)
733687474- - ✅ URL structure remains atcr.io/<owner>/<image> (ownership clear)
7575- - ✅ Hold is transparent infrastructure (like S3 region)
7676- - ✅ Supports personal, shared, and public holds
7777- - ✅ Historical reference in manifest (pull works even if defaults change)
7878- - ✅ User can opt-out (set defaultHold to null)
7979- - ✅ Future: UI for self-service profile management
369369+### Legacy Support
80370371371+**Old hold registration model** (`io.atcr.hold` records in user's PDS):
372372+- Still supported for backward compatibility
373373+- Checked if profile.defaultHold is empty
374374+- New deployments should use sailor profiles instead
813758282-Progress Summary
376376+**Migration path:**
377377+- Existing holds continue to work
378378+- Users with `io.atcr.hold` records can set profile.defaultHold
379379+- Profile takes priority over hold records
833808484- ✅ Completed:
8585- 1. Created io.atcr.sailor.profile lexicon
8686- 2. Updated io.atcr.manifest lexicon with holdEndpoint field
8787- 3. Updated Go types in pkg/atproto/lexicon.go
8888- 4. Created profile management in pkg/atproto/profile.go
8989- 5. Updated /auth/exchange handler to manage profile
381381+## Future Improvements
903829191- ⏳ In Progress:
9292- - Need to update /auth/token handler similarly (add defaultHoldDID parameter and profile management)
9393- - Fix compilation error in extractDefaultHoldDID() - should use configuration.Middleware type not any
383383+1. **Multi-hold support** - Set different holds for different repositories
384384+2. **Hold suggestions** - Recommend holds based on geography/cost
385385+3. **Hold migration tools** - Move blobs between holds
386386+4. **Profile templates** - Pre-configured profiles for teams
387387+5. **Hold analytics** - Show storage usage per hold in UI
943889595- 🔜 Remaining:
9696- - Update findStorageEndpoint() for new priority logic (check profile → own hold → default)
9797- - Update manifest_store.go to include holdEndpoint when creating manifests
9898- - Update pull flow to use manifest holdEndpoint
9999- - Update documentation
389389+## References
100390101101- The architecture is solid - just need to finish the token handler update and fix the type issue in the config extraction. Would you like me to continue?
391391+- [BYOS.md](./BYOS.md) - BYOS deployment and hold management
392392+- [EMBEDDED_PDS.md](./EMBEDDED_PDS.md) - Hold's embedded PDS architecture
393393+- [CREW_ACCESS_CONTROL.md](./CREW_ACCESS_CONTROL.md) - Crew membership and permissions
394394+- [ATProto Lexicon Spec](https://atproto.com/specs/lexicon)
+568
docs/SBOM_SCANNING.md
···11+# SBOM Scanning
22+33+ATCR supports optional Software Bill of Materials (SBOM) generation for container images stored in holds. This feature enables automated security scanning and vulnerability analysis while maintaining the decentralized architecture.
44+55+## Overview
66+77+When enabled, holds automatically generate SBOMs for uploaded container images in the background. The scanning process:
88+99+- **Async execution**: Scanning happens after upload completes (non-blocking)
1010+- **ORAS artifacts**: SBOMs stored as OCI Registry as Storage (ORAS) artifacts
1111+- **ATProto integration**: Scan results stored as `io.atcr.manifest` records in hold's embedded PDS
1212+- **Tool agnostic**: Results accessible via XRPC, ATProto queries, and direct blob URLs
1313+- **Opt-in**: Disabled by default, enabled per-hold via configuration
1414+1515+### Default Scanner: Syft
1616+1717+ATCR uses [Anchore Syft](https://github.com/anchore/syft) for SBOM generation:
1818+- Industry-standard SBOM generator
1919+- Supports SPDX and CycloneDX formats
2020+- Comprehensive package detection (OS packages, language libraries, etc.)
2121+- Active maintenance and CVE database updates
2222+2323+Future enhancements may include [Grype](https://github.com/anchore/grype) for vulnerability scanning and [Trivy](https://github.com/aquasecurity/trivy) for comprehensive security analysis.
2424+2525+## Trust Model
2626+2727+### Same Trust as Docker Hub
2828+2929+SBOM scanning follows the same trust model as Docker Hub or other centralized registries:
3030+3131+**Docker Hub model:**
3232+- Docker Hub scans your image on their infrastructure
3333+- Results stored in their database
3434+- You trust Docker Hub's scanner version and scan integrity
3535+3636+**ATCR hold model:**
3737+- Hold scans image on their infrastructure
3838+- Results stored in hold's embedded PDS
3939+- You trust hold operator's scanner version and scan integrity
4040+4141+The security comes from **reproducibility** and **transparency**, not storage location:
4242+- Anyone can re-scan the same digest and verify results
4343+- Multiple holds scanning the same image provide independent verification
4444+- Scanner version and scan timestamp are recorded in ATProto records
4545+4646+### Why Hold's PDS?
4747+4848+Scan results are stored in the **hold's embedded PDS** rather than the user's PDS:
4949+5050+**Advantages:**
5151+1. **No OAuth expiry issues**: Hold owns its PDS, no service tokens needed
5252+2. **Hold-scoped metadata**: Scanner version, scan time, hold configuration
5353+3. **Multiple perspectives**: Different holds can scan the same image independently
5454+4. **Simpler auth**: Hold writes directly to its own PDS
5555+5. **Keeps user PDS lean**: Potentially large SBOM data doesn't bloat user's repo
5656+5757+**Security properties:**
5858+- Same trust level as trusting hold to serve correct blobs
5959+- DID signatures prove which hold generated the SBOM
6060+- Reproducible scans enable independent verification
6161+- Multiple holds scanning same digest → compare results for tampering detection
6262+6363+## ORAS Manifest Format
6464+6565+SBOMs are stored as ORAS artifacts that reference their subject image using the OCI referrers specification.
6666+6767+### Example Manifest Record
6868+6969+```json
7070+{
7171+ "$type": "io.atcr.manifest",
7272+ "repository": "alice/myapp",
7373+ "digest": "sha256:4a5e...",
7474+ "holdDid": "did:web:hold01.atcr.io",
7575+ "holdEndpoint": "https://hold01.atcr.io",
7676+ "schemaVersion": 2,
7777+ "mediaType": "application/vnd.oci.image.manifest.v1+json",
7878+ "artifactType": "application/spdx+json",
7979+ "subject": {
8080+ "mediaType": "application/vnd.oci.image.manifest.v1+json",
8181+ "digest": "sha256:abc123...",
8282+ "size": 1234
8383+ },
8484+ "config": {
8585+ "mediaType": "application/vnd.oci.empty.v1+json",
8686+ "digest": "sha256:44136f...",
8787+ "size": 2
8888+ },
8989+ "layers": [
9090+ {
9191+ "mediaType": "application/spdx+json",
9292+ "digest": "sha256:def456...",
9393+ "size": 5678,
9494+ "annotations": {
9595+ "org.opencontainers.image.title": "sbom.spdx.json"
9696+ }
9797+ }
9898+ ],
9999+ "manifestBlob": {
100100+ "$type": "blob",
101101+ "ref": { "$link": "bafyrei..." },
102102+ "mimeType": "application/vnd.oci.image.manifest.v1+json",
103103+ "size": 789
104104+ },
105105+ "ownerDid": "did:plc:alice123",
106106+ "scannedAt": "2025-10-20T12:34:56.789Z",
107107+ "scannerVersion": "syft-v1.0.0",
108108+ "createdAt": "2025-10-20T12:34:56.789Z"
109109+}
110110+```
111111+112112+### Key Fields
113113+114114+- `artifactType`: Distinguishes SBOM artifact from regular image manifest
115115+ - `application/spdx+json` for SPDX format
116116+ - `application/vnd.cyclonedx+json` for CycloneDX format
117117+- `subject`: Reference to the original image manifest
118118+- `ownerDid`: DID of the image owner (for multi-tenant holds)
119119+- `scannedAt`: ISO 8601 timestamp of when the scan completed
120120+- `scannerVersion`: Tool version for reproducibility tracking
121121+122122+### SBOM Blob
123123+124124+The actual SBOM document is stored as a blob in the hold's storage backend and referenced in the manifest's `layers` array. The blob contains the full SPDX or CycloneDX JSON document.
125125+126126+## Configuration
127127+128128+SBOM scanning is configured via environment variables on the hold service.
129129+130130+### Environment Variables
131131+132132+```bash
133133+# Enable SBOM scanning (opt-in)
134134+HOLD_SBOM_ENABLED=true
135135+136136+# Number of concurrent scan workers (default: 2)
137137+# Higher values = faster scanning, more CPU/memory usage
138138+HOLD_SBOM_WORKERS=4
139139+140140+# SBOM output format (default: spdx-json)
141141+# Options: spdx-json, cyclonedx-json
142142+HOLD_SBOM_FORMAT=spdx-json
143143+144144+# Future: Enable vulnerability scanning with Grype
145145+# HOLD_VULN_ENABLED=true
146146+```
147147+148148+### Example Configuration
149149+150150+```bash
151151+# .env.hold
152152+HOLD_PUBLIC_URL=https://hold01.atcr.io
153153+STORAGE_DRIVER=s3
154154+S3_BUCKET=my-hold-blobs
155155+HOLD_OWNER=did:plc:xyz123
156156+HOLD_DATABASE_PATH=/var/lib/atcr/hold.db
157157+158158+# Enable SBOM scanning
159159+HOLD_SBOM_ENABLED=true
160160+HOLD_SBOM_WORKERS=2
161161+HOLD_SBOM_FORMAT=spdx-json
162162+```
163163+164164+## Scanning Workflow
165165+166166+### 1. Upload Completes
167167+168168+When a container image is successfully pushed to a hold:
169169+170170+```
171171+1. Client: docker push atcr.io/alice/myapp:latest
172172+2. AppView routes blobs to hold service
173173+3. Hold receives multipart upload via XRPC
174174+4. Hold completes upload and stores blobs
175175+5. Hold checks: HOLD_SBOM_ENABLED=true?
176176+6. If yes: enqueue scan job (non-blocking)
177177+7. Upload completes immediately
178178+```
179179+180180+### 2. Background Scanning
181181+182182+Scan workers process jobs from the queue:
183183+184184+```
185185+1. Worker pulls job from queue
186186+2. Extracts image layers from storage
187187+3. Runs Syft on extracted filesystem
188188+4. Generates SBOM in configured format
189189+5. Uploads SBOM blob to storage
190190+6. Creates ORAS manifest record in hold's PDS
191191+7. Job complete
192192+```
193193+194194+### 3. Result Storage
195195+196196+SBOM results are stored in two places:
197197+198198+1. **SBOM blob**: Full JSON document in hold's blob storage
199199+2. **ORAS manifest**: Metadata record in hold's embedded PDS
200200+ - Collection: `io.atcr.manifest`
201201+ - Record key: SBOM manifest digest
202202+ - Contains reference to subject image
203203+204204+## Accessing SBOMs
205205+206206+Multiple methods for discovering and retrieving SBOM data.
207207+208208+### 1. XRPC Query Endpoint
209209+210210+Query for SBOMs by image digest:
211211+212212+```bash
213213+# Get SBOM for a specific image
214214+curl "https://hold01.atcr.io/xrpc/io.atcr.hold.getSBOM?\
215215+ digest=sha256:abc123&\
216216+ ownerDid=did:plc:alice123&\
217217+ repository=alice/myapp"
218218+219219+# Response: ORAS manifest JSON
220220+{
221221+ "manifest": {
222222+ "schemaVersion": 2,
223223+ "mediaType": "application/vnd.oci.image.manifest.v1+json",
224224+ "artifactType": "application/spdx+json",
225225+ "subject": { "digest": "sha256:abc123...", ... },
226226+ "layers": [ { "digest": "sha256:def456...", ... } ]
227227+ },
228228+ "scannedAt": "2025-10-20T12:34:56.789Z",
229229+ "scannerVersion": "syft-v1.0.0"
230230+}
231231+```
232232+233233+### 2. ATProto Repository Queries
234234+235235+Use standard ATProto XRPC to list all SBOMs:
236236+237237+```bash
238238+# List all SBOM manifests in hold's PDS
239239+curl "https://hold01.atcr.io/xrpc/com.atproto.repo.listRecords?\
240240+ repo=did:web:hold01.atcr.io&\
241241+ collection=io.atcr.manifest"
242242+243243+# Filter by artifactType (requires AppView indexing)
244244+# Returns all SBOM artifacts
245245+```
246246+247247+### 3. Direct SBOM Blob Download
248248+249249+Download the full SBOM JSON file:
250250+251251+```bash
252252+# Get SBOM blob digest from manifest layers[0].digest (passed below as the `cid` parameter)
253253+SBOM_DIGEST="sha256:def456..."
254254+255255+# Request presigned download URL
256256+curl "https://hold01.atcr.io/xrpc/com.atproto.sync.getBlob?\
257257+ did=did:web:hold01.atcr.io&\
258258+ cid=$SBOM_DIGEST"
259259+260260+# Response: presigned S3 URL or direct blob
261261+{
262262+ "url": "https://s3.amazonaws.com/bucket/blob?signature=...",
263263+ "expiresAt": "2025-10-20T12:49:56Z"
264264+}
266266+# Download SBOM JSON ($URL is the "url" field from the response above)
267267+curl "$URL" > sbom.spdx.json
268268+```
269269+270270+### 4. ORAS CLI Integration
271271+272272+Use the ORAS CLI to discover and pull SBOMs:
273273+274274+```bash
275275+# Discover referrers (SBOMs) for an image
276276+oras discover atcr.io/alice/myapp:latest
277277+278278+# Output shows SBOM artifacts:
279279+# digest: sha256:abc123...
280280+# referrers:
281281+# - artifactType: application/spdx+json
282282+# digest: sha256:4a5e...
283283+284284+# Pull SBOM artifact
285285+oras pull atcr.io/alice/myapp@sha256:4a5e...
286286+287287+# Downloads sbom.spdx.json to current directory
288288+```
289289+290290+### 5. AppView Web UI (Future)
291291+292292+Future enhancement: AppView web interface will display SBOM information on repository pages:
293293+294294+- Link to SBOM JSON download
295295+- Vulnerability count (if Grype enabled)
296296+- Scanner version and scan timestamp
297297+- Comparison across multiple holds
298298+299299+## Tool Integration
300300+301301+### SPDX/CycloneDX Tools
302302+303303+Any tool that understands SPDX or CycloneDX formats can consume the SBOMs:
304304+305305+**Example tools:**
306306+- [OSV Scanner](https://github.com/google/osv-scanner) - Vulnerability scanning
307307+- [Grype](https://github.com/anchore/grype) - Vulnerability scanning
308308+- [Dependency-Track](https://dependencytrack.org/) - Software composition analysis
309309+- [SBOM Quality Score](https://github.com/eBay/sbom-scorecard) - SBOM completeness
310310+311311+**Usage:**
312312+```bash
313313+# Download SBOM
314314+curl "https://hold01.atcr.io/xrpc/io.atcr.hold.getSBOM?..." | \
315315+ jq -r '.manifest.layers[0].digest' | \
316316+ # ... fetch blob ... > sbom.spdx.json
317317+318318+# Scan with OSV
319319+osv-scanner --sbom sbom.spdx.json
320320+321321+# Scan with Grype
322322+grype sbom:./sbom.spdx.json
323323+```
324324+325325+### OCI Registry API
326326+327327+ORAS manifests are fully OCI-compliant and discoverable via standard registry APIs:
328328+329329+```bash
330330+# Discover referrers for an image
331331+curl -H "Accept: application/vnd.oci.image.index.v1+json" \
332332+ "https://atcr.io/v2/alice/myapp/referrers/sha256:abc123"
333333+334334+# Returns referrers index with SBOM manifests
335335+{
336336+ "schemaVersion": 2,
337337+ "mediaType": "application/vnd.oci.image.index.v1+json",
338338+ "manifests": [
339339+ {
340340+ "mediaType": "application/vnd.oci.image.manifest.v1+json",
341341+ "digest": "sha256:4a5e...",
342342+ "artifactType": "application/spdx+json"
343343+ }
344344+ ]
345345+}
346346+```
347347+348348+### Programmatic Access
349349+350350+Use the ATProto SDK to query SBOMs:
351351+352352+```go
353353+import "github.com/bluesky-social/indigo/atproto"
354354+355355+// List all SBOMs for a hold
356356+records, err := client.RepoListRecords(ctx,
357357+ "did:web:hold01.atcr.io",
358358+ "io.atcr.manifest",
359359+ 100, // limit
360360+ "", // cursor
361361+)
362362+363363+// Filter for SBOM artifacts
364364+for _, record := range records.Records {
365365+ manifest := record.Value.(ManifestRecord)
366366+ if manifest.ArtifactType == "application/spdx+json" {
367367+ // Process SBOM manifest
368368+ }
369369+}
370370+```
371371+372372+## Future Enhancements
373373+374374+### Vulnerability Scanning (Grype)
375375+376376+Add vulnerability scanning to SBOM generation:
377377+378378+```bash
379379+# Configuration
380380+HOLD_VULN_ENABLED=true
381381+HOLD_VULN_DB_UPDATE_INTERVAL=24h
382382+383383+# Extended manifest with vulnerability count
384384+{
385385+ "artifactType": "application/spdx+json",
386386+ "annotations": {
387387+ "io.atcr.vuln.critical": "2",
388388+ "io.atcr.vuln.high": "15",
389389+ "io.atcr.vuln.medium": "42",
390390+ "io.atcr.vuln.low": "8",
391391+ "io.atcr.vuln.scannedWith": "grype-v0.74.0",
392392+ "io.atcr.vuln.dbVersion": "2025-10-20"
393393+ }
394394+}
395395+```
396396+397397+### Multi-Scanner Support (Trivy)
398398+399399+Support multiple scanner backends:
400400+401401+```bash
402402+HOLD_SBOM_SCANNER=trivy # syft (default), trivy, grype
403403+HOLD_TRIVY_SCAN_TYPE=os,library,config,secret
404404+```
405405+406406+### Multi-Hold Verification
407407+408408+Compare SBOMs from different holds for the same image:
409409+410410+```bash
411411+# Alice pushes to hold1 and hold2
412412+docker push atcr.io/alice/myapp:latest
413413+414414+# Both holds scan independently
415415+# Compare results:
416416+atcr-cli compare-sboms \
417417+ --image atcr.io/alice/myapp:latest \
418418+ --holds hold1.atcr.io,hold2.atcr.io
419419+420420+# Output: Package count differences, version mismatches, etc.
421421+```
422422+423423+### Signature Verification (Cosign)
424424+425425+Sign SBOMs with Sigstore Cosign:
426426+427427+```bash
428428+HOLD_SBOM_SIGN=true
429429+HOLD_COSIGN_KEY_PATH=/var/lib/atcr/cosign.key
430430+431431+# SBOM artifacts get signed
432432+# Verification:
433433+cosign verify --key cosign.pub atcr.io/alice/myapp@sha256:4a5e...
434434+```
435435+436436+## Security Considerations
437437+438438+### Reproducibility
439439+440440+SBOMs should be reproducible for the same image digest:
441441+442442+**Best practices:**
443443+- Pin scanner versions in production holds
444444+- Record scanner version in the manifest record's `scannerVersion` field
445445+- Document vulnerability database versions
446446+- Re-scan periodically to catch new CVEs
447447+448448+**Validation:**
449449+```bash
450450+# Compare SBOMs from different holds
451451+diff <(curl hold1/sbom.json | jq -S) \
452452+ <(curl hold2/sbom.json | jq -S)
453453+454454+# Differences indicate:
455455+# - Different scanner versions
456456+# - Different scan times (new CVEs discovered)
457457+# - Potential tampering (investigate)
458458+```
459459+460460+### Multiple Hold Verification
461461+462462+Running multiple holds provides defense in depth:
463463+464464+1. User pushes to hold1 (uses hold1 by default)
465465+2. User also pushes to hold2 (backup/verification)
466466+3. Both holds scan independently
467467+4. Compare SBOM results:
468468+ - Similar results = confidence in accuracy
469469+ - Divergent results = investigate discrepancy
470470+471471+### Transparency
472472+473473+Hold operators should publish scanning policies:
474474+475475+- Scanner version and update schedule
476476+- Vulnerability database update frequency
477477+- SBOM format and schema version
478478+- Data retention policies
479479+480480+### Trust Anchors
481481+482482+Users can verify scanner integrity:
483483+484484+1. **Scanner version**: Check `scannerVersion` field matches expected version
485485+2. **DID signature**: ATProto record signed by hold's DID
486486+3. **Timestamp**: Check `scannedAt` for stale scans
487487+4. **Reproducibility**: Re-scan locally and compare results
488488+489489+## Example Workflows
490490+491491+### Enable Scanning on Your Hold
492492+493493+```bash
494494+# 1. Configure hold with SBOM enabled
495495+cat > .env.hold <<EOF
496496+HOLD_PUBLIC_URL=https://myhold.example.com
497497+STORAGE_DRIVER=s3
498498+S3_BUCKET=my-blobs
499499+HOLD_OWNER=did:plc:myid
500500+501501+# Enable SBOM scanning
502502+HOLD_SBOM_ENABLED=true
503503+HOLD_SBOM_WORKERS=2
504504+HOLD_SBOM_FORMAT=spdx-json
505505+EOF
506506+507507+# 2. Start hold service
508508+./bin/atcr-hold
509509+510510+# 3. Push an image
511511+docker push atcr.io/alice/myapp:latest
512512+513513+# 4. Wait for background scan (check logs)
514514+# 2025-10-20T12:34:56Z INFO Scanning image sha256:abc123...
515515+# 2025-10-20T12:35:12Z INFO SBOM generated sha256:def456...
516516+517517+# 5. Query for SBOM
518518+curl "https://myhold.example.com/xrpc/io.atcr.hold.getSBOM?..."
519519+```
520520+521521+### Consume SBOMs in CI/CD
522522+523523+```yaml
524524+# .github/workflows/security-scan.yml
525525+name: Security Scan
526526+on: push
527527+528528+jobs:
529529+ scan:
530530+ runs-on: ubuntu-latest
531531+ steps:
532532+ - name: Pull image
533533+ run: docker pull atcr.io/alice/myapp:latest
534534+535535+ - name: Get SBOM from hold
536536+ run: |
537537+ IMAGE_DIGEST=$(docker inspect atcr.io/alice/myapp:latest \
538538+ --format='{{index .RepoDigests 0}}' | cut -d@ -f2)
539539+540540+ curl "https://hold01.atcr.io/xrpc/io.atcr.hold.getSBOM?\
541541+ digest=$IMAGE_DIGEST&\
542542+ ownerDid=did:plc:alice123&\
543543+ repository=alice/myapp" \
544544+ -o sbom-manifest.json
545545+546546+ SBOM_DIGEST=$(jq -r '.manifest.layers[0].digest' sbom-manifest.json)
547547+548548+ curl "https://hold01.atcr.io/xrpc/com.atproto.sync.getBlob?\
549549+ did=did:web:hold01.atcr.io&\
550550+ cid=$SBOM_DIGEST" \
551551+ | jq -r '.url' | xargs curl -o sbom.spdx.json
552552+553553+ - name: Scan with Grype
554554+ uses: anchore/scan-action@v3
555555+ with:
556556+ sbom: sbom.spdx.json
557557+ fail-build: true
558558+ severity-cutoff: high
559559+```
560560+561561+## References
562562+563563+- [ORAS Specification](https://oras.land/)
564564+- [OCI Artifacts](https://github.com/opencontainers/artifacts)
565565+- [SPDX Specification](https://spdx.dev/)
566566+- [CycloneDX Specification](https://cyclonedx.org/)
567567+- [Syft Documentation](https://github.com/anchore/syft)
568568+- [ATProto Specification](https://atproto.com/)
-821
docs/XRPC_BLOB_MIGRATION.md
···11-# XRPC Blob Upload Migration
22-33-This document describes how to migrate from separate legacy multipart upload endpoints to a unified `com.atproto.repo.uploadBlob` endpoint that supports both standard single-blob uploads and OCI container layer multipart uploads.
44-55-## Current State
66-77-### Legacy HTTP Endpoints (cmd/hold/main.go)
88-99-```go
1010-// Unified presigned URL endpoint (handles upload AND download)
1111-mux.HandleFunc("/presigned-url", service.HandlePresignedURL)
1212-1313-// Internal move operation (used by multipart complete)
1414-mux.HandleFunc("/move", service.HandleMove)
1515-1616-// Multipart upload endpoints
1717-mux.HandleFunc("/start-multipart", service.HandleStartMultipart)
1818-mux.HandleFunc("/part-presigned-url", service.HandleGetPartURL)
1919-mux.HandleFunc("/complete-multipart", service.HandleCompleteMultipart)
2020-mux.HandleFunc("/abort-multipart", service.HandleAbortMultipart)
2121-2222-// Buffered part upload (when presigned URLs unavailable)
2323-mux.HandleFunc("/multipart-parts/", func(w http.ResponseWriter, r *http.Request) {
2424- // Parse URL: /multipart-parts/{uploadID}/{partNumber}
2525- // ...
2626- service.HandleMultipartPartUpload(w, r, uploadID, partNumber, did, service.MultipartMgr)
2727-})
2828-```
2929-3030-### Existing XRPC Endpoint (pkg/hold/pds/xrpc.go)
3131-3232-```go
3333-// Current implementation - redirects to presigned URL
3434-func (h *XRPCHandler) HandleUploadBlob(w http.ResponseWriter, r *http.Request) {
3535- digest := r.URL.Query().Get("digest")
3636- uploadURL, err := h.blobStore.GetPresignedUploadURL(digest)
3737- http.Redirect(w, r, uploadURL, http.StatusFound)
3838-}
3939-```
4040-4141-### Supporting Code
4242-4343-**pkg/hold/multipart.go:**
4444-- `MultipartManager` - Tracks upload sessions
4545-- `MultipartSession` - State for each upload (parts, mode, etc.)
4646-- Modes: `S3Native` (presigned URLs), `Buffered` (proxy uploads)
4747-4848-**pkg/hold/blobstore_adapter.go:**
4949-- `HoldServiceBlobStore` - Adapter wrapping HoldService for XRPC handlers
5050-- Implements presigned URL generation
5151-- Currently not used by XRPC handlers
5252-5353-**pkg/hold/handlers.go:**
5454-- `HandlePresignedURL()` - Unified endpoint for GET/HEAD/PUT presigned URLs
5555-- `HandleMove()` - Moves blob from temp to final location (internal operation)
5656-- `HandleStartMultipart()` - Starts upload, returns uploadID
5757-- `HandleGetPartURL()` - Returns presigned URL for part
5858-- `HandleCompleteMultipart()` - Finalizes upload, assembles parts (calls Move internally)
5959-- `HandleAbortMultipart()` - Cancels upload
6060-- `HandleMultipartPartUpload()` - Buffered part upload fallback
6161-6262-## Legacy Endpoint Mapping
6363-6464-### `/presigned-url` → Multiple XRPC Operations
6565-6666-The legacy `/presigned-url` endpoint is a **unified endpoint** that handles both upload and download operations based on the `operation` field in the JSON body:
6767-6868-**Legacy format:**
6969-```
7070-POST /presigned-url
7171-Content-Type: application/json
7272-7373-{
7474- "operation": "GET", // or "HEAD" or "PUT"
7575- "did": "did:plc:alice123",
7676- "digest": "sha256:abc123...",
7777- "size": 1234567890 // Only for PUT operations
7878-}
7979-8080-Response:
8181-{
8282- "url": "https://s3.amazonaws.com/...",
8383- "expires_at": "2025-10-16T..."
8484-}
8585-```
8686-8787-**XRPC mapping:**
8888-- `operation: "GET"` → `GET /xrpc/com.atproto.sync.getBlob?did=...&cid=sha256:abc...`
8989-- `operation: "HEAD"` → `HEAD /xrpc/com.atproto.sync.getBlob?did=...&cid=sha256:abc...`
9090-- `operation: "PUT"` → `com.atproto.repo.uploadBlob` (single upload via presigned URL)
9191-9292-**Note:** For GET/HEAD operations, AppView passes OCI digest directly as `cid` parameter. Hold detects `sha256:` prefix and uses digest directly (no CID conversion needed).
9393-9494-### `/move` → Internal to Multipart Complete
9595-9696-The legacy `/move` endpoint moves a blob from temporary location to final digest-based location:
9797-9898-**Legacy format:**
9999-```
100100-POST /move?from=uploads/temp-123&to=sha256:abc123...&did=did:plc:alice123
101101-102102-Response: 200 OK
103103-```
104104-105105-**Purpose:** Server-side S3 copy after multipart assembly. Used in this flow:
106106-107107-1. Multipart parts uploaded → `uploads/temp-{uploadID}/part-1`, `part-2`, etc.
108108-2. Complete multipart → S3 assembles parts at `uploads/temp-{uploadID}`
109109-3. **Move operation** → S3 copy from `uploads/temp-{uploadID}` → `blobs/sha256/ab/abc123...`
110110-111111-**XRPC mapping:**
112112-- **Not a separate endpoint** - becomes internal operation in `uploadBlob?action=complete`
113113-- The `complete` action automatically handles the move after multipart assembly
114114-- AppView doesn't need to call move explicitly in XRPC flow
115115-116116-## New Unified Design
117117-118118-### Single Endpoint: `com.atproto.repo.uploadBlob`
119119-120120-Content-Type discrimination determines operation:
121121-- `application/octet-stream` → Standard blob upload (profile images, small media)
122122-- `application/json` → Multipart operations (large OCI layers)
123123-124124-### Complementary Endpoint: `com.atproto.sync.getBlob`
125125-126126-For blob downloads (maps from legacy `/presigned-url` with operation=GET/HEAD):
127127-128128-**Standard ATProto blobs (CID):**
129129-```
130130-GET /xrpc/com.atproto.sync.getBlob?did={holdDID}&cid=bafyreib...
131131-132132-Response: 307 Temporary Redirect
133133-Location: https://s3.amazonaws.com/bucket/...?presigned-params
134134-```
135135-136136-**OCI container layers (digest):**
137137-```
138138-GET /xrpc/com.atproto.sync.getBlob?did={holdDID}&cid=sha256:abc123...
139139-140140-Response: 307 Temporary Redirect
141141-Location: https://s3.amazonaws.com/bucket/...?presigned-params
142142-```
143143-144144-**Implementation - Flexible CID parameter:**
145145-```go
146146-func (h *XRPCHandler) HandleGetBlob(w http.ResponseWriter, r *http.Request) {
147147- cidOrDigest := r.URL.Query().Get("cid")
148148-149149- var digest string
150150- if strings.HasPrefix(cidOrDigest, "sha256:") {
151151- // OCI digest - use directly (no conversion needed)
152152- digest = cidOrDigest
153153- } else {
154154- // Standard CID - convert to digest
155155- c, _ := cid.Decode(cidOrDigest)
156156- digest = cidToDigest(c) // bafyreib... → sha256:abc...
157157- }
158158-159159- // Generate presigned URL for S3
160160- url := h.blobStore.GetPresignedDownloadURL(digest)
161161- http.Redirect(w, r, url, http.StatusTemporaryRedirect)
162162-}
163163-```
164164-165165-**Key insight:** The `cid` parameter accepts both formats. Hold service checks prefix and handles accordingly. This keeps the endpoint spec-compliant (GET with query params) while supporting OCI digests natively.
166166-167167-### API Specification
168168-169169-#### Standard Single Upload (ATProto Spec Compliant)
170170-171171-```
172172-POST /xrpc/com.atproto.repo.uploadBlob
173173-Content-Type: application/octet-stream
174174-175175-[raw blob bytes]
176176-177177-Response (200 OK):
178178-{
179179- "blob": {
180180- "$type": "blob",
181181- "ref": {
182182- "$link": "bafyreib..." // CID
183183- },
184184- "mimeType": "application/octet-stream",
185185- "size": 12345
186186- }
187187-}
188188-```
189189-190190-**Use case:** Profile images, small media (< 10MB), standard ATProto blobs
191191-192192-#### Multipart Start (ATCR Extension)
193193-194194-```
195195-POST /xrpc/com.atproto.repo.uploadBlob
196196-Content-Type: application/json
197197-198198-{
199199- "action": "start",
200200- "digest": "sha256:abc123...",
201201- "size": 1234567890 // Optional hint for storage allocation
202202-}
203203-204204-Response (200 OK):
205205-{
206206- "uploadId": "upload-1634567890",
207207- "expiresAt": "2025-10-16T12:00:00Z",
208208- "mode": "s3-native" // or "buffered"
209209-}
210210-```
211211-212212-**Implementation:**
213213-- Calls `service.StartMultipartUploadWithManager(ctx, digest, multipartMgr)`
214214-- Returns uploadID and mode from MultipartSession
215215-216216-#### Multipart Get Part URL (ATCR Extension)
217217-218218-```
219219-POST /xrpc/com.atproto.repo.uploadBlob
220220-Content-Type: application/json
221221-222222-{
223223- "action": "part",
224224- "uploadId": "upload-1634567890",
225225- "partNumber": 1,
226226- "digest": "sha256:abc123..."
227227-}
228228-229229-Response (200 OK):
230230-{
231231- "url": "https://s3.amazonaws.com/bucket/...?X-Amz-...",
232232- "expiresAt": "2025-10-16T12:15:00Z",
233233- "method": "PUT"
234234-}
235235-236236-// OR for buffered mode:
237237-{
238238- "url": "https://hold01.atcr.io/xrpc/com.atproto.repo.uploadBlob",
239239- "method": "PUT",
240240- "headers": {
241241- "X-Upload-Id": "upload-1634567890",
242242- "X-Part-Number": "1"
243243- },
244244- "expiresAt": "2025-10-16T12:15:00Z"
245245-}
246246-```
247247-248248-**Implementation:**
249249-- Retrieve session: `multipartMgr.GetSession(uploadID)`
250250-- S3Native mode: Call `service.GetPartUploadURL(ctx, session, partNumber, did)`
251251-- Buffered mode: Return self-referential URL with headers
252252-253253-#### Multipart Upload Part (Buffered Mode)
254254-255255-```
256256-PUT /xrpc/com.atproto.repo.uploadBlob
257257-Content-Type: application/octet-stream
258258-X-Upload-Id: upload-1634567890
259259-X-Part-Number: 1
260260-261261-[part data bytes]
262262-263263-Response (200 OK):
264264-{
265265- "etag": "abc123def456",
266266- "partNumber": 1
267267-}
268268-```
269269-270270-**Implementation:**
271271-- Extract headers: `X-Upload-Id`, `X-Part-Number`
272272-- Call `service.HandleMultipartPartUpload(w, r, uploadID, partNumber, did, multipartMgr)`
273273-- Return ETag for completion
274274-275275-#### Multipart Complete (ATCR Extension)
276276-277277-```
278278-POST /xrpc/com.atproto.repo.uploadBlob
279279-Content-Type: application/json
280280-281281-{
282282- "action": "complete",
283283- "uploadId": "upload-1634567890",
284284- "digest": "sha256:abc123...",
285285- "parts": [
286286- { "partNumber": 1, "etag": "abc123" },
287287- { "partNumber": 2, "etag": "def456" }
288288- ]
289289-}
290290-291291-Response (200 OK):
292292-{
293293- "status": "completed",
294294- "blob": {
295295- "$type": "blob",
296296- "ref": {
297297- "$link": "bafyreib..." // CID computed from digest
298298- },
299299- "mimeType": "application/octet-stream",
300300- "size": 1234567890
301301- }
302302-}
303303-```
304304-305305-**Implementation:**
306306-- Retrieve session: `multipartMgr.GetSession(uploadID)`
307307-- For S3Native: Record parts via `session.RecordS3Part()`
308308-- Call `service.CompleteMultipartUploadWithManager(ctx, session, multipartMgr)`
309309- - This internally calls S3 CompleteMultipartUpload to assemble parts
310310- - Then performs server-side S3 copy from temp location to final digest location
311311- - Equivalent to legacy `/move` endpoint operation
312312-- Convert digest to CID for response
313313-314314-#### Multipart Abort (ATCR Extension)
315315-316316-```
317317-POST /xrpc/com.atproto.repo.uploadBlob
318318-Content-Type: application/json
319319-320320-{
321321- "action": "abort",
322322- "uploadId": "upload-1634567890",
323323- "digest": "sha256:abc123..."
324324-}
325325-326326-Response (200 OK):
327327-{
328328- "status": "aborted"
329329-}
330330-```
331331-332332-**Implementation:**
333333-- Retrieve session: `multipartMgr.GetSession(uploadID)`
334334-- Call `service.AbortMultipartUploadWithManager(ctx, session, multipartMgr)`
335335-336336-## Implementation Strategy
337337-338338-### Phase 1: Add Unified Handler (Keep Legacy Endpoints)
339339-340340-**File:** `pkg/hold/pds/xrpc.go`
341341-342342-```go
343343-// HandleUploadBlob unified handler supporting both single and multipart uploads
344344-func (h *XRPCHandler) HandleUploadBlob(w http.ResponseWriter, r *http.Request) {
345345- if r.Method != http.MethodPost && r.Method != http.MethodPut {
346346- http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
347347- return
348348- }
349349-350350- contentType := r.Header.Get("Content-Type")
351351-352352- // Buffered multipart part upload (PUT with headers)
353353- if r.Method == http.MethodPut && r.Header.Get("X-Upload-Id") != "" {
354354- h.handleBufferedPartUpload(w, r)
355355- return
356356- }
357357-358358- // Multipart operations (JSON body)
359359- if strings.Contains(contentType, "application/json") {
360360- h.handleMultipartOperation(w, r)
361361- return
362362- }
363363-364364- // Standard single blob upload (raw bytes)
365365- h.handleSingleBlobUpload(w, r)
366366-}
367367-368368-func (h *XRPCHandler) handleMultipartOperation(w http.ResponseWriter, r *http.Request) {
369369- var req struct {
370370- Action string `json:"action"`
371371- Digest string `json:"digest,omitempty"`
372372- Size int64 `json:"size,omitempty"`
373373- UploadID string `json:"uploadId,omitempty"`
374374- PartNumber int `json:"partNumber,omitempty"`
375375- Parts []struct {
376376- PartNumber int `json:"partNumber"`
377377- ETag string `json:"etag"`
378378- } `json:"parts,omitempty"`
379379- }
380380-381381- if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
382382- http.Error(w, fmt.Sprintf("invalid JSON: %v", err), http.StatusBadRequest)
383383- return
384384- }
385385-386386- // TODO: Add authentication check
387387- // user, err := ValidateDPoPRequest(r)
388388-389389- ctx := r.Context()
390390-391391- switch req.Action {
392392- case "start":
393393- h.handleMultipartStart(w, r, req.Digest, req.Size)
394394- case "part":
395395- h.handleMultipartPart(w, r, req.UploadID, req.PartNumber, req.Digest)
396396- case "complete":
397397- h.handleMultipartComplete(w, r, req.UploadID, req.Digest, req.Parts)
398398- case "abort":
399399- h.handleMultipartAbort(w, r, req.UploadID, req.Digest)
400400- default:
401401- http.Error(w, "invalid action", http.StatusBadRequest)
402402- }
403403-}
404404-405405-func (h *XRPCHandler) handleMultipartStart(w http.ResponseWriter, r *http.Request, digest string, size int64) {
406406- ctx := r.Context()
407407-408408- // Use HoldService multipart manager
409409- // Note: h.blobStore is HoldServiceBlobStore which wraps the service
410410- uploadID, mode, err := h.blobStore.StartMultipart(ctx, digest, size)
411411- if err != nil {
412412- http.Error(w, fmt.Sprintf("failed to start upload: %v", err), http.StatusInternalServerError)
413413- return
414414- }
415415-416416- response := map[string]any{
417417- "uploadId": uploadID,
418418- "expiresAt": time.Now().Add(24 * time.Hour),
419419- "mode": mode, // "s3-native" or "buffered"
420420- }
421421-422422- w.Header().Set("Content-Type", "application/json")
423423- json.NewEncoder(w).Encode(response)
424424-}
425425-426426-func (h *XRPCHandler) handleMultipartPart(w http.ResponseWriter, r *http.Request, uploadID string, partNumber int, digest string) {
427427- ctx := r.Context()
428428-429429- // Get part upload URL (presigned S3 or buffered endpoint)
430430- partURL, err := h.blobStore.GetPartUploadURL(ctx, uploadID, partNumber, digest)
431431- if err != nil {
432432- http.Error(w, fmt.Sprintf("failed to get part URL: %v", err), http.StatusInternalServerError)
433433- return
434434- }
435435-436436- response := map[string]any{
437437- "url": partURL,
438438- "expiresAt": time.Now().Add(15 * time.Minute),
439439- "method": "PUT",
440440- }
441441-442442- w.Header().Set("Content-Type", "application/json")
443443- json.NewEncoder(w).Encode(response)
444444-}
445445-446446-func (h *XRPCHandler) handleMultipartComplete(w http.ResponseWriter, r *http.Request, uploadID string, digest string, parts []struct{ PartNumber int; ETag string }) {
447447- ctx := r.Context()
448448-449449- // Convert parts format
450450- completedParts := make([]hold.CompletedPart, len(parts))
451451- for i, p := range parts {
452452- completedParts[i] = hold.CompletedPart{
453453- PartNumber: p.PartNumber,
454454- ETag: p.ETag,
455455- }
456456- }
457457-458458- // Complete upload
459459- if err := h.blobStore.CompleteMultipart(ctx, uploadID, digest, completedParts); err != nil {
460460- http.Error(w, fmt.Sprintf("failed to complete upload: %v", err), http.StatusInternalServerError)
461461- return
462462- }
463463-464464- // Convert digest to CID for ATProto response format
465465- cid, err := digestToCID(digest)
466466- if err != nil {
467467- http.Error(w, fmt.Sprintf("failed to generate CID: %v", err), http.StatusInternalServerError)
468468- return
469469- }
470470-471471- response := map[string]any{
472472- "status": "completed",
473473- "blob": map[string]any{
474474- "$type": "blob",
475475- "ref": map[string]any{
476476- "$link": cid.String(),
477477- },
478478- "mimeType": "application/octet-stream",
479479- // Size would need to be tracked in session
480480- },
481481- }
482482-483483- w.Header().Set("Content-Type", "application/json")
484484- json.NewEncoder(w).Encode(response)
485485-}
486486-487487-func (h *XRPCHandler) handleMultipartAbort(w http.ResponseWriter, r *http.Request, uploadID string, digest string) {
488488- ctx := r.Context()
489489-490490- if err := h.blobStore.AbortMultipart(ctx, uploadID, digest); err != nil {
491491- http.Error(w, fmt.Sprintf("failed to abort upload: %v", err), http.StatusInternalServerError)
492492- return
493493- }
494494-495495- response := map[string]any{
496496- "status": "aborted",
497497- }
498498-499499- w.Header().Set("Content-Type", "application/json")
500500- json.NewEncoder(w).Encode(response)
501501-}
502502-503503-func (h *XRPCHandler) handleBufferedPartUpload(w http.ResponseWriter, r *http.Request) {
504504- uploadID := r.Header.Get("X-Upload-Id")
505505- partNumberStr := r.Header.Get("X-Part-Number")
506506-507507- partNumber, err := strconv.Atoi(partNumberStr)
508508- if err != nil {
509509- http.Error(w, "invalid part number", http.StatusBadRequest)
510510- return
511511- }
512512-513513- // Stream part data to storage
514514- etag, err := h.blobStore.UploadPart(r.Context(), uploadID, partNumber, r.Body)
515515- if err != nil {
516516- http.Error(w, fmt.Sprintf("failed to upload part: %v", err), http.StatusInternalServerError)
517517- return
518518- }
519519-520520- response := map[string]any{
521521- "etag": etag,
522522- "partNumber": partNumber,
523523- }
524524-525525- w.Header().Set("Content-Type", "application/json")
526526- json.NewEncoder(w).Encode(response)
527527-}
528528-529529-func (h *XRPCHandler) handleSingleBlobUpload(w http.ResponseWriter, r *http.Request) {
530530- // Standard ATProto uploadBlob behavior
531531- // Read blob data
532532- data, err := io.ReadAll(r.Body)
533533- if err != nil {
534534- http.Error(w, "failed to read blob", http.StatusInternalServerError)
535535- return
536536- }
537537-538538- // Upload to storage (single operation)
539539- cid, size, err := h.blobStore.UploadBlob(r.Context(), bytes.NewReader(data))
540540- if err != nil {
541541- http.Error(w, fmt.Sprintf("failed to upload blob: %v", err), http.StatusInternalServerError)
542542- return
543543- }
544544-545545- // Standard ATProto blob response format
546546- response := map[string]any{
547547- "blob": map[string]any{
548548- "$type": "blob",
549549- "ref": map[string]any{
550550- "$link": cid.String(),
551551- },
552552- "mimeType": "application/octet-stream",
553553- "size": size,
554554- },
555555- }
556556-557557- w.Header().Set("Content-Type", "application/json")
558558- json.NewEncoder(w).Encode(response)
559559-}
560560-561561-// digestToCID converts OCI digest (sha256:abc...) to ATProto CID
562562-func digestToCID(digest string) (cid.Cid, error) {
563563- // Implementation in pkg/hold/cid.go or similar
564564- // Strip "sha256:" prefix, decode hex, construct CIDv1 with sha256 multihash
565565- return cid.Undef, fmt.Errorf("not implemented")
566566-}
567567-```
568568-569569-### Phase 2: Extend HoldServiceBlobStore (pkg/hold/blobstore_adapter.go)
570570-571571-The `HoldServiceBlobStore` currently wraps HoldService for presigned URLs. Extend it to support multipart operations:
572572-573573-```go
574574-// Add multipart methods to HoldServiceBlobStore
575575-576576-func (h *HoldServiceBlobStore) StartMultipart(ctx context.Context, digest string, size int64) (uploadID string, mode string, err error) {
577577- uploadID, uploadMode, err := h.service.StartMultipartUploadWithManager(ctx, digest, h.service.MultipartMgr)
578578- if err != nil {
579579- return "", "", err
580580- }
581581-582582- modeStr := "s3-native"
583583- if uploadMode == hold.Buffered {
584584- modeStr = "buffered"
585585- }
586586-587587- return uploadID, modeStr, nil
588588-}
589589-590590-func (h *HoldServiceBlobStore) GetPartUploadURL(ctx context.Context, uploadID string, partNumber int, digest string) (string, error) {
591591- session, err := h.service.MultipartMgr.GetSession(uploadID)
592592- if err != nil {
593593- return "", err
594594- }
595595-596596- // For S3Native: return presigned URL
597597- // For Buffered: return self-referential URL with upload instructions
598598- if session.Mode == hold.S3Native {
599599- return h.service.GetPartUploadURL(ctx, session, partNumber, h.holdDID)
600600- }
601601-602602- // Buffered mode: client will PUT to uploadBlob with headers
603603- return fmt.Sprintf("%s/xrpc/com.atproto.repo.uploadBlob", h.publicURL), nil
604604-}
605605-606606-func (h *HoldServiceBlobStore) UploadPart(ctx context.Context, uploadID string, partNumber int, data io.Reader) (string, error) {
607607- // Buffered part upload - streams data to storage
608608- // Used when client PUTs to uploadBlob with X-Upload-Id header
609609- session, err := h.service.MultipartMgr.GetSession(uploadID)
610610- if err != nil {
611611- return "", err
612612- }
613613-614614- // Stream to storage, return ETag
615615- // This wraps HandleMultipartPartUpload logic
616616- etag, err := h.service.UploadPartBuffered(ctx, session, partNumber, data)
617617- return etag, err
618618-}
619619-620620-func (h *HoldServiceBlobStore) CompleteMultipart(ctx context.Context, uploadID string, digest string, parts []hold.CompletedPart) error {
621621- session, err := h.service.MultipartMgr.GetSession(uploadID)
622622- if err != nil {
623623- return err
624624- }
625625-626626- // For S3Native: record parts ETags
627627- if session.Mode == hold.S3Native {
628628- for _, p := range parts {
629629- session.RecordS3Part(p.PartNumber, p.ETag, 0)
630630- }
631631- }
632632-633633- return h.service.CompleteMultipartUploadWithManager(ctx, session, h.service.MultipartMgr)
634634-}
635635-636636-func (h *HoldServiceBlobStore) AbortMultipart(ctx context.Context, uploadID string, digest string) error {
637637- session, err := h.service.MultipartMgr.GetSession(uploadID)
638638- if err != nil {
639639- return err
640640- }
641641-642642- return h.service.AbortMultipartUploadWithManager(ctx, session, h.service.MultipartMgr)
643643-}
644644-645645-func (h *HoldServiceBlobStore) UploadBlob(ctx context.Context, data io.Reader) (cid.Cid, int64, error) {
646646- // Single blob upload for standard ATProto use case
647647- // Compute digest, store via service driver
648648- // Return CID and size
649649- // Implementation TBD
650650- return cid.Undef, 0, fmt.Errorf("not implemented")
651651-}
652652-```
653653-654654-### Phase 3: Update AppView Client (pkg/appview/storage/)
655655-656656-Create a new XRPC client or update ProxyBlobStore to use the unified endpoint:
657657-658658-**Download (GET/HEAD):**
659659-```go
660660-func (p *ProxyBlobStore) ServeBlob(ctx context.Context, w http.ResponseWriter, r *http.Request, dgst digest.Digest) error {
661661- // Pass digest directly as cid parameter (no conversion)
662662- url := fmt.Sprintf("%s/xrpc/com.atproto.sync.getBlob?did=%s&cid=%s",
663663- p.storageEndpoint, p.holdDID, dgst.String()) // cid=sha256:abc...
664664-665665- http.Redirect(w, r, url, http.StatusTemporaryRedirect)
666666- return nil
667667-}
668668-```
669669-670670-**Multipart Upload:**
671671-```go
672672-func (p *ProxyBlobStore) startMultipartUpload(ctx context.Context, digest string) (string, error) {
673673- reqBody := map[string]any{
674674- "action": "start",
675675- "digest": digest,
676676- }
677677-678678- body, _ := json.Marshal(reqBody)
679679- url := fmt.Sprintf("%s/xrpc/com.atproto.repo.uploadBlob", p.storageEndpoint)
680680- req, _ := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
681681- req.Header.Set("Content-Type", "application/json")
682682-683683- resp, err := p.httpClient.Do(req)
684684- // ... parse response, return uploadID
685685-}
686686-687687-func (p *ProxyBlobStore) getPartPresignedURL(ctx context.Context, digest, uploadID string, partNumber int) (string, error) {
688688- reqBody := map[string]any{
689689- "action": "part",
690690- "uploadId": uploadID,
691691- "partNumber": partNumber,
692692- "digest": digest,
693693- }
694694-695695- body, _ := json.Marshal(reqBody)
696696- url := fmt.Sprintf("%s/xrpc/com.atproto.repo.uploadBlob", p.storageEndpoint)
697697- req, _ := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
698698- req.Header.Set("Content-Type", "application/json")
699699-700700- resp, err := p.httpClient.Do(req)
701701- // ... parse response, return presigned URL
702702-}
703703-704704-// Similar for complete, abort
705705-```
706706-707707-### Phase 4: Testing Period
708708-709709-**During transition:**
710710-- Both legacy HTTP endpoints AND new XRPC endpoint active
711711-- AppView can use either based on configuration/feature flag
712712-- New deployments use XRPC
713713-- Old deployments continue with legacy
714714-715715-**Detection logic:**
716716-```go
717717-func (r *RoutingRepository) Blobs(ctx context.Context) distribution.BlobStore {
718718- // Try XRPC first (check for /.well-known/did.json)
719719- if supportsXRPC(storageEndpoint) {
720720- return NewXRPCBlobStore(storageEndpoint, ...)
721721- }
722722- // Fallback to legacy
723723- return NewProxyBlobStore(storageEndpoint, ...)
724724-}
725725-```
726726-727727-### Phase 5: Remove Legacy Endpoints
728728-729729-Once all holds migrated and tested:
730730-731731-**cmd/hold/main.go - Remove:**
732732-```go
733733-// DELETE these lines
734734-mux.HandleFunc("/presigned-url", service.HandlePresignedURL)
735735-mux.HandleFunc("/move", service.HandleMove)
736736-mux.HandleFunc("/start-multipart", service.HandleStartMultipart)
737737-mux.HandleFunc("/part-presigned-url", service.HandleGetPartURL)
738738-mux.HandleFunc("/complete-multipart", service.HandleCompleteMultipart)
739739-mux.HandleFunc("/abort-multipart", service.HandleAbortMultipart)
740740-mux.HandleFunc("/multipart-parts/", ...)
741741-```
742742-743743-**pkg/hold/handlers.go - Remove HTTP handler wrappers:**
744744-```go
745745-// DELETE these functions:
746746-// - HandlePresignedURL() - replaced by uploadBlob + getBlob XRPC endpoints
747747-// - HandleMove() - now internal operation in CompleteMultipartUploadWithManager()
748748-// - HandleStartMultipart() - replaced by uploadBlob with JSON body {"action": "start"}
749749-// - HandleGetPartURL() - replaced by uploadBlob with JSON body {"action": "part"}
750750-// - HandleCompleteMultipart() - replaced by uploadBlob with JSON body {"action": "complete"}
751751-// - HandleAbortMultipart() - replaced by uploadBlob with JSON body {"action": "abort"}
752752-// - HandleMultipartPartUpload() - replaced by uploadBlob PUT with headers
753753-754754-// KEEP internal service methods:
755755-// - s.getPresignedURL() - still used by blobstore_adapter
756756-// - s.driver.Move() - still used for temp→final move
757757-// - s.StartMultipartUploadWithManager() - core multipart logic
758758-// - s.GetPartUploadURL() - presigned URL generation
759759-// - s.CompleteMultipartUploadWithManager() - includes move operation
760760-// - s.AbortMultipartUploadWithManager() - cleanup logic
761761-```
762762-763763-## Key Design Decisions
764764-765765-1. **Content-Type discrimination**: Natural way to distinguish single vs multipart uploads
766766-2. **JSON bodies for all parameters**: Follows XRPC conventions (like putRecord, deleteRecord)
767767- - **No query parameters** - all operation details in request body
768768- - Makes requests more inspectable and debuggable
769769- - Easier to extend with new fields
770770-3. **Preserve standard uploadBlob**: Raw bytes still work for profile images, small media
771771-4. **Reuse existing code**: HoldService multipart logic unchanged, just new HTTP layer
772772-5. **Backward compatibility**: Both endpoints active during transition
773773-6. **Action-based routing**: Clear, extensible JSON structure
774774-7. **Move is internal**: `/move` endpoint logic absorbed into multipart complete operation
775775- - No separate XRPC endpoint needed
776776- - Simplifies AppView client code
777777-8. **Unified presigned URL handling**: Single `uploadBlob`/`getBlob` pair replaces operation-based routing
778778-9. **Flexible CID parameter**: `getBlob` accepts both standard CIDs and OCI digests via prefix detection
779779- - Keeps endpoint spec-compliant (GET with query params)
780780- - No conversion overhead on AppView side
781781- - Hold does simple prefix check: `sha256:` → use directly, else → convert CID
782782-783783-## Benefits
784784-785785-- ✅ Single endpoint for all blob operations
786786-- ✅ Standard ATProto uploadBlob preserved
787787-- ✅ XRPC-like JSON request/response
788788-- ✅ Reuses existing multipart.go logic
789789-- ✅ Gradual migration path
790790-- ✅ Fewer endpoints to maintain
791791-- ✅ Cleaner AppView client code
792792-793793-## Testing Checklist
794794-795795-- [ ] Single blob upload (< 10MB, raw bytes)
796796-- [ ] Multipart start → part → complete flow
797797-- [ ] S3Native mode (presigned URLs)
798798-- [ ] Buffered mode (proxy uploads)
799799-- [ ] Multipart abort
800800-- [ ] Large blob upload (> 5GB, many parts)
801801-- [ ] Concurrent uploads
802802-- [ ] Upload resume after network failure
803803-- [ ] Legacy endpoint backward compatibility
804804-- [ ] AppView XRPC client integration
805805-- [ ] Performance comparison (XRPC vs legacy)
806806-807807-## Migration Timeline
808808-809809-1. **Week 1**: Implement unified uploadBlob handler (Phase 1-2)
810810-2. **Week 2**: Update AppView client, feature flag (Phase 3)
811811-3. **Week 3**: Deploy to dev/staging, test both paths (Phase 4)
812812-4. **Week 4**: Roll out to production (gradual)
813813-5. **Week 5-6**: Monitor, verify all holds migrated
814814-6. **Week 7**: Remove legacy endpoints (Phase 5)
815815-816816-## References
817817-818818-- ATProto uploadBlob spec: https://docs.bsky.app/docs/api/com-atproto-repo-upload-blob
819819-- XRPC conventions: https://atproto.com/specs/xrpc
820820-- Existing multipart implementation: pkg/hold/multipart.go
821821-- Blob store adapter: pkg/hold/blobstore_adapter.go
···11+description: Normalize hold_endpoint column to store DIDs instead of URLs
22+query: |
33+ -- Convert any URL-formatted hold_endpoint values to DID format
44+ -- This ensures all hold identifiers are stored consistently as did:web:hostname
55+66+ -- Convert HTTPS URLs to did:web: format
77+ -- https://hold.example.com → did:web:hold.example.com
88+ UPDATE manifests
99+ SET hold_endpoint = 'did:web:' || substr(hold_endpoint, 9)
1010+ WHERE hold_endpoint LIKE 'https://%';
1111+1212+ -- Convert HTTP URLs to did:web: format
1313+ -- http://172.28.0.3:8080 → did:web:172.28.0.3:8080
1414+ UPDATE manifests
1515+ SET hold_endpoint = 'did:web:' || substr(hold_endpoint, 8)
1616+ WHERE hold_endpoint LIKE 'http://%';
1717+1818+ -- Entries already in did:web: format are left unchanged
1919+ -- did:web:hold.example.com → did:web:hold.example.com (no change)
+15-11
pkg/appview/db/models.go
···65656666// Push represents a combined tag and manifest for the recent pushes view
6767type Push struct {
6868- DID string
6969- Handle string
7070- Repository string
7171- Tag string
7272- Digest string
7373- Title string
7474- Description string
7575- IconURL string
7676- StarCount int
7777- PullCount int
7878- CreatedAt time.Time
6868+ DID string
6969+ Handle string
7070+ Repository string
7171+ Tag string
7272+ Digest string
7373+ Title string
7474+ Description string
7575+ IconURL string
7676+ StarCount int
7777+ PullCount int
7878+ CreatedAt time.Time
7979+ HoldEndpoint string // Hold endpoint for health checking
8080+ Reachable bool // Whether the hold endpoint is reachable
7981}
80828183// Repository represents an aggregated view of a user's repository
···156158 Platforms []PlatformInfo
157159 PlatformCount int
158160 IsManifestList bool
161161+ Reachable bool // Whether the hold endpoint is reachable
162162+ Pending bool // Whether health check is still in progress
159163}
+6-4
pkg/appview/db/queries.go
···4444 COALESCE(m.icon_url, ''),
4545 COALESCE(rs.pull_count, 0),
4646 COALESCE((SELECT COUNT(*) FROM stars WHERE owner_did = u.did AND repository = t.repository), 0),
4747- t.created_at
4747+ t.created_at,
4848+ m.hold_endpoint
4849 FROM tags t
4950 JOIN users u ON t.did = u.did
5051 JOIN manifests m ON t.did = m.did AND t.repository = m.repository AND t.digest = m.digest
···7071 var pushes []Push
7172 for rows.Next() {
7273 var p Push
7373- if err := rows.Scan(&p.DID, &p.Handle, &p.Repository, &p.Tag, &p.Digest, &p.Title, &p.Description, &p.IconURL, &p.PullCount, &p.StarCount, &p.CreatedAt); err != nil {
7474+ if err := rows.Scan(&p.DID, &p.Handle, &p.Repository, &p.Tag, &p.Digest, &p.Title, &p.Description, &p.IconURL, &p.PullCount, &p.StarCount, &p.CreatedAt, &p.HoldEndpoint); err != nil {
7475 return nil, 0, err
7576 }
7677 pushes = append(pushes, p)
···113114 COALESCE(m.icon_url, ''),
114115 COALESCE(rs.pull_count, 0),
115116 COALESCE((SELECT COUNT(*) FROM stars WHERE owner_did = u.did AND repository = t.repository), 0),
116116- t.created_at
117117+ t.created_at,
118118+ m.hold_endpoint
117119 FROM tags t
118120 JOIN users u ON t.did = u.did
119121 JOIN manifests m ON t.did = m.did AND t.repository = m.repository AND t.digest = m.digest
···136138 var pushes []Push
137139 for rows.Next() {
138140 var p Push
139139- if err := rows.Scan(&p.DID, &p.Handle, &p.Repository, &p.Tag, &p.Digest, &p.Title, &p.Description, &p.IconURL, &p.PullCount, &p.StarCount, &p.CreatedAt); err != nil {
141141+ if err := rows.Scan(&p.DID, &p.Handle, &p.Repository, &p.Tag, &p.Digest, &p.Title, &p.Description, &p.IconURL, &p.PullCount, &p.StarCount, &p.CreatedAt, &p.HoldEndpoint); err != nil {
140142 return nil, 0, err
141143 }
142144 pushes = append(pushes, p)
+1-1
pkg/appview/db/schema.go
···3838 did TEXT NOT NULL,
3939 repository TEXT NOT NULL,
4040 digest TEXT NOT NULL,
4141- hold_endpoint TEXT NOT NULL,
4141+ hold_endpoint TEXT NOT NULL, -- Stored as DID (e.g., did:web:hold.example.com)
4242 schema_version INTEGER NOT NULL,
4343 media_type TEXT NOT NULL,
4444 config_digest TEXT,
+34-3
pkg/appview/handlers/home.go
···77 "strconv"
8899 "atcr.io/pkg/appview/db"
1010+ "atcr.io/pkg/appview/holdhealth"
1011)
11121213// HomeHandler handles the home page
···54555556// RecentPushesHandler handles the HTMX request for recent pushes
5657type RecentPushesHandler struct {
5757- DB *sql.DB
5858- Templates *template.Template
5959- RegistryURL string
5858+ DB *sql.DB
5959+ Templates *template.Template
6060+ RegistryURL string
6161+ HealthChecker *holdhealth.Checker
6062}
61636264func (h *RecentPushesHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
···7678 if err != nil {
7779 http.Error(w, err.Error(), http.StatusInternalServerError)
7880 return
8181+ }
8383+ // Check health status and filter out unreachable pushes for home page
8484+ // Use GetCachedStatus only (no blocking) - background worker keeps cache fresh
8585+ if h.HealthChecker != nil {
8686+ reachablePushes := []db.Push{}
8787+ for i := range pushes {
8888+ if pushes[i].HoldEndpoint != "" {
8989+ // Use cached status only - don't block on health checks
9090+ cached := h.HealthChecker.GetCachedStatus(pushes[i].HoldEndpoint)
9191+ if cached != nil {
9292+ pushes[i].Reachable = cached.Reachable
9393+ // Only show reachable pushes on home page
9494+ if cached.Reachable {
9595+ reachablePushes = append(reachablePushes, pushes[i])
9696+ }
9797+ } else {
9898+ // No cached status - optimistically show it (background worker will check)
9999+ pushes[i].Reachable = true
100100+ reachablePushes = append(reachablePushes, pushes[i])
101101+ }
102102+ }
103103+ }
104104+ pushes = reachablePushes
105105+ } else {
106106+ // If no health checker, assume all are reachable (backward compatibility)
107107+ for i := range pushes {
108108+ pushes[i].Reachable = true
109109+ }
79110 }
8011181112 data := struct {
+76
pkg/appview/handlers/manifest_health.go
···11+package handlers
22+33+import (
44+ "context"
55+ "net/http"
66+ "net/url"
77+ "time"
88+99+ "atcr.io/pkg/appview/holdhealth"
1010+)
1111+1212+// ManifestHealthHandler handles HTMX polling for manifest health status
1313+type ManifestHealthHandler struct {
1414+ HealthChecker *holdhealth.Checker
1515+}
1616+1717+func (h *ManifestHealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
1818+ // Get endpoint from query parameter
1919+ endpoint := r.URL.Query().Get("endpoint")
2020+ if endpoint == "" {
2121+ http.Error(w, "endpoint parameter required", http.StatusBadRequest)
2222+ return
2323+ }
2424+2525+ // Decode URL-encoded endpoint
2626+ endpoint, err := url.QueryUnescape(endpoint)
2727+ if err != nil {
2828+ http.Error(w, "invalid endpoint parameter", http.StatusBadRequest)
2929+ return
3030+ }
3131+3232+ // Try to get cached status first (instant if background worker has checked it)
3333+ cached := h.HealthChecker.GetCachedStatus(endpoint)
3434+ if cached != nil {
3535+ // Cache hit - return final status
3636+ h.renderBadge(w, endpoint, cached.Reachable, false)
3737+ return
3838+ }
3939+4040+ // Cache miss - perform quick check with 2 second timeout
4141+ ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
4242+ defer cancel()
4343+4444+ reachable, err := h.HealthChecker.CheckHealth(ctx, endpoint)
4545+4646+ if ctx.Err() == context.DeadlineExceeded {
4747+ // Still pending - render "Checking..." badge with HTMX retry
4848+ h.renderBadge(w, endpoint, false, true)
4949+ } else if err != nil {
5050+ // Error - mark as unreachable
5151+ h.renderBadge(w, endpoint, false, false)
5252+ } else {
5353+ // Success
5454+ h.renderBadge(w, endpoint, reachable, false)
5555+ }
5656+}
5757+5858+// renderBadge renders the appropriate badge HTML snippet
5959+func (h *ManifestHealthHandler) renderBadge(w http.ResponseWriter, endpoint string, reachable, pending bool) {
6060+ w.Header().Set("Content-Type", "text/html")
6161+6262+ if pending {
6363+ // Still checking - render badge with HTMX retry after 3 seconds
6464+ retryURL := "/api/manifest-health?endpoint=" + url.QueryEscape(endpoint)
6565+ w.Write([]byte(`<span class="checking-badge"
6666+ hx-get="` + retryURL + `"
6767+ hx-trigger="load delay:3s"
6868+ hx-swap="outerHTML">🔄 Checking...</span>`))
6969+ } else if !reachable {
7070+ // Unreachable - render offline badge
7171+ w.Write([]byte(`<span class="offline-badge">⚠️ Offline</span>`))
7272+ } else {
7373+ // Reachable - no badge (empty response)
7474+ w.Write([]byte(``))
7575+ }
7676+}
+80-12
pkg/appview/handlers/repository.go
···11package handlers
2233import (
44+ "context"
45 "database/sql"
56 "html/template"
67 "log"
78 "net/http"
99+ "sync"
1010+ "time"
811912 "atcr.io/pkg/appview/db"
1313+ "atcr.io/pkg/appview/holdhealth"
1014 "atcr.io/pkg/appview/middleware"
1115 "atcr.io/pkg/atproto"
1216 "atcr.io/pkg/auth/oauth"
···16201721// RepositoryPageHandler handles the public repository page
1822type RepositoryPageHandler struct {
1919- DB *sql.DB
2020- Templates *template.Template
2121- RegistryURL string
2222- Directory identity.Directory
2323- Refresher *oauth.Refresher
2323+ DB *sql.DB
2424+ Templates *template.Template
2525+ RegistryURL string
2626+ Directory identity.Directory
2727+ Refresher *oauth.Refresher
2828+ HealthChecker *holdhealth.Checker
2429}
25302631func (h *RepositoryPageHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
···5459 return
5560 }
56616262+ // Check health status for each manifest's hold endpoint (concurrent with 1s timeout)
6363+ if h.HealthChecker != nil {
6464+ // Create context with 1 second deadline for fast-fail
6565+ ctx, cancel := context.WithTimeout(r.Context(), 1*time.Second)
6666+ defer cancel()
6767+6868+ var wg sync.WaitGroup
6969+ var mu sync.Mutex
7070+7171+ for i := range manifests {
7272+ if manifests[i].HoldEndpoint == "" {
7373+ // No hold endpoint, mark as unreachable
7474+ manifests[i].Reachable = false
7575+ manifests[i].Pending = false
7676+ continue
7777+ }
7878+7979+ wg.Add(1)
8080+ go func(idx int) {
8181+ defer wg.Done()
8282+8383+ endpoint := manifests[idx].HoldEndpoint
8484+8585+ // Try to get cached status first (instant)
8686+ if cached := h.HealthChecker.GetCachedStatus(endpoint); cached != nil {
8787+ mu.Lock()
8888+ manifests[idx].Reachable = cached.Reachable
8989+ manifests[idx].Pending = false
9090+ mu.Unlock()
9191+ return
9292+ }
9393+9494+ // Perform health check with timeout context
9595+ reachable, err := h.HealthChecker.CheckHealth(ctx, endpoint)
9696+9797+ mu.Lock()
9898+ if ctx.Err() == context.DeadlineExceeded {
9999+ // Timeout - mark as pending for HTMX polling
100100+ manifests[idx].Reachable = false
101101+ manifests[idx].Pending = true
102102+ } else if err != nil {
103103+ // Error - mark as unreachable
104104+ manifests[idx].Reachable = false
105105+ manifests[idx].Pending = false
106106+ } else {
107107+ // Success
108108+ manifests[idx].Reachable = reachable
109109+ manifests[idx].Pending = false
110110+ }
111111+ mu.Unlock()
112112+ }(i)
113113+ }
114114+115115+ // Wait for all checks to complete or timeout
116116+ wg.Wait()
117117+ } else {
118118+ // If no health checker, assume all are reachable (backward compatibility)
119119+ for i := range manifests {
120120+ manifests[i].Reachable = true
121121+ manifests[i].Pending = false
122122+ }
123123+ }
124124+57125 if len(tagsWithPlatforms) == 0 && len(manifests) == 0 {
58126 http.Error(w, "Repository not found", http.StatusNotFound)
59127 return
···100168101169 data := struct {
102170 PageData
103103- Owner *db.User // Repository owner
104104- Repository *db.Repository // Repository summary
105105- Tags []db.TagWithPlatforms // Tags with platform info
106106- Manifests []db.ManifestWithMetadata // Top-level manifests only
107107- StarCount int
108108- IsStarred bool
109109- IsOwner bool // Whether current user owns this repository
171171+ Owner *db.User // Repository owner
172172+ Repository *db.Repository // Repository summary
173173+ Tags []db.TagWithPlatforms // Tags with platform info
174174+ Manifests []db.ManifestWithMetadata // Top-level manifests only
175175+ StarCount int
176176+ IsStarred bool
177177+ IsOwner bool // Whether current user owns this repository
110178 }{
111179 PageData: NewPageData(r, h.RegistryURL),
112180 Owner: owner,
+179
pkg/appview/holdhealth/checker.go
···11+package holdhealth
22+33+import (
44+ "context"
55+ "fmt"
66+ "net/http"
77+ "sync"
88+ "time"
99+1010+ "atcr.io/pkg/appview"
1111+)
1212+1313+// HealthStatus represents the health status of a hold endpoint
1414+type HealthStatus struct {
1515+ Reachable bool
1616+ LastChecked time.Time
1717+ LastError error
1818+}
1919+2020+// Checker manages health checking for hold endpoints
2121+type Checker struct {
2222+ client *http.Client
2323+ cache map[string]*HealthStatus
2424+ cacheMu sync.RWMutex
2525+ cacheTTL time.Duration
2626+ cleanupMu sync.Mutex
2727+}
2828+2929+// NewChecker creates a new health checker with the specified cache TTL
3030+func NewChecker(cacheTTL time.Duration) *Checker {
3131+ return NewCheckerWithTimeout(cacheTTL, 2*time.Second)
3232+}
3333+3434+// NewCheckerWithTimeout creates a new health checker with custom timeout
3535+// Useful for testing with shorter timeouts
3636+func NewCheckerWithTimeout(cacheTTL, httpTimeout time.Duration) *Checker {
3737+ return &Checker{
3838+ client: &http.Client{
3939+ Timeout: httpTimeout,
4040+ },
4141+ cache: make(map[string]*HealthStatus),
4242+ cacheTTL: cacheTTL,
4343+ }
4444+}
4545+4646+// CheckHealth performs an HTTP health check on the hold endpoint
4747+// Accepts either DID (did:web:host) or URL (https://host) format
4848+// Checks {endpoint}/xrpc/_health and returns true if reachable
4949+func (c *Checker) CheckHealth(ctx context.Context, endpoint string) (bool, error) {
5050+ // Convert DID to HTTP URL if needed
5151+ // did:web:hold.example.com → https://hold.example.com
5252+ // https://hold.example.com → https://hold.example.com (passthrough)
5353+ httpURL := appview.ResolveHoldURL(endpoint)
5454+5555+ // Build health check URL
5656+ healthURL := httpURL + "/xrpc/_health"
5757+5858+ // Create request with context
5959+ req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
6060+ if err != nil {
6161+ return false, fmt.Errorf("failed to create request: %w", err)
6262+ }
6363+6464+ // Perform request
6565+ resp, err := c.client.Do(req)
6666+ if err != nil {
6767+ return false, fmt.Errorf("request failed: %w", err)
6868+ }
6969+ defer resp.Body.Close()
7070+7171+ // Check status code
7272+ if resp.StatusCode >= 200 && resp.StatusCode < 300 {
7373+ return true, nil
7474+ }
7575+7676+ return false, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
7777+}
7878+7979+// GetStatus returns the cached health status for an endpoint
8080+// If the cache is expired or missing, it performs an on-demand check
8181+func (c *Checker) GetStatus(ctx context.Context, endpoint string) *HealthStatus {
8282+ // Check cache first
8383+ c.cacheMu.RLock()
8484+ status, exists := c.cache[endpoint]
8585+ c.cacheMu.RUnlock()
8686+8787+ // If cached and not expired, return it
8888+ if exists && time.Since(status.LastChecked) < c.cacheTTL {
8989+ return status
9090+ }
9191+9292+ // On-demand check
9393+ reachable, err := c.CheckHealth(ctx, endpoint)
9494+9595+ // Update cache
9696+ newStatus := &HealthStatus{
9797+ Reachable: reachable,
9898+ LastChecked: time.Now(),
9999+ LastError: err,
100100+ }
101101+102102+ c.cacheMu.Lock()
103103+ c.cache[endpoint] = newStatus
104104+ c.cacheMu.Unlock()
105105+106106+ return newStatus
107107+}
108108+109109+// GetCachedStatus returns the cached status without performing a check
110110+// Returns nil if no cached status exists
111111+func (c *Checker) GetCachedStatus(endpoint string) *HealthStatus {
112112+ c.cacheMu.RLock()
113113+ defer c.cacheMu.RUnlock()
114114+115115+ status, exists := c.cache[endpoint]
116116+ if !exists {
117117+ return nil
118118+ }
119119+120120+ // Return nil if expired
121121+ if time.Since(status.LastChecked) > c.cacheTTL {
122122+ return nil
123123+ }
124124+125125+ return status
126126+}
127127+128128+// SetStatus manually sets the health status for an endpoint
129129+// Used by the background worker to update cache
130130+func (c *Checker) SetStatus(endpoint string, reachable bool, err error) {
131131+ status := &HealthStatus{
132132+ Reachable: reachable,
133133+ LastChecked: time.Now(),
134134+ LastError: err,
135135+ }
136136+137137+ c.cacheMu.Lock()
138138+ c.cache[endpoint] = status
139139+ c.cacheMu.Unlock()
140140+}
141141+142142+// Cleanup removes stale cache entries (older than 30 minutes)
143143+func (c *Checker) Cleanup() {
144144+ c.cleanupMu.Lock()
145145+ defer c.cleanupMu.Unlock()
146146+147147+ c.cacheMu.Lock()
148148+ defer c.cacheMu.Unlock()
149149+150150+ cutoff := time.Now().Add(-30 * time.Minute)
151151+ for endpoint, status := range c.cache {
152152+ if status.LastChecked.Before(cutoff) {
153153+ delete(c.cache, endpoint)
154154+ }
155155+ }
156156+}
157157+158158+// GetCacheStats returns cache statistics for debugging
159159+func (c *Checker) GetCacheStats() map[string]any {
160160+ c.cacheMu.RLock()
161161+ defer c.cacheMu.RUnlock()
162162+163163+ reachable := 0
164164+ unreachable := 0
165165+166166+ for _, status := range c.cache {
167167+ if status.Reachable {
168168+ reachable++
169169+ } else {
170170+ unreachable++
171171+ }
172172+ }
173173+174174+ return map[string]any{
175175+ "total": len(c.cache),
176176+ "reachable": reachable,
177177+ "unreachable": unreachable,
178178+ }
179179+}
+253
pkg/appview/holdhealth/checker_test.go
···11+package holdhealth
22+33+import (
44+ "context"
55+ "net/http"
66+ "net/http/httptest"
77+ "testing"
88+ "time"
99+)
1010+1111+func TestNewChecker(t *testing.T) {
1212+ cacheTTL := 15 * time.Minute
1313+ checker := NewChecker(cacheTTL)
1414+1515+ if checker == nil {
1616+ t.Fatal("NewChecker returned nil")
1717+ }
1818+1919+ if checker.cacheTTL != cacheTTL {
2020+ t.Errorf("Expected cache TTL %v, got %v", cacheTTL, checker.cacheTTL)
2121+ }
2222+2323+ if checker.cache == nil {
2424+ t.Error("Cache map not initialized")
2525+ }
2626+}
2727+2828+func TestCheckHealth_Success(t *testing.T) {
2929+ // Create test server that returns 200
3030+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
3131+ if r.URL.Path != "/xrpc/_health" {
3232+ t.Errorf("Expected path /xrpc/_health, got %s", r.URL.Path)
3333+ }
3434+ w.WriteHeader(http.StatusOK)
3535+ w.Write([]byte(`{"version": "1.0.0"}`))
3636+ }))
3737+ defer server.Close()
3838+3939+ checker := NewChecker(15 * time.Minute)
4040+ ctx := context.Background()
4141+4242+ reachable, err := checker.CheckHealth(ctx, server.URL)
4343+ if err != nil {
4444+ t.Errorf("CheckHealth returned error: %v", err)
4545+ }
4646+4747+ if !reachable {
4848+ t.Error("Expected hold to be reachable")
4949+ }
5050+}
5151+5252+func TestCheckHealth_WithDID(t *testing.T) {
5353+ // Create test server that returns 200
5454+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
5555+ if r.URL.Path != "/xrpc/_health" {
5656+ t.Errorf("Expected path /xrpc/_health, got %s", r.URL.Path)
5757+ }
5858+ w.WriteHeader(http.StatusOK)
5959+ w.Write([]byte(`{"version": "1.0.0"}`))
6060+ }))
6161+ defer server.Close()
6262+6363+ checker := NewChecker(15 * time.Minute)
6464+ ctx := context.Background()
6565+6666+ // Test with DID format (did:web:host)
6767+ // Extract host:port from test server URL
6868+ // http://127.0.0.1:12345 → did:web:127.0.0.1:12345
6969+ serverURL := server.URL
7070+ didFormat := "did:web:" + serverURL[7:] // Remove "http://"
7171+7272+ reachable, err := checker.CheckHealth(ctx, didFormat)
7373+ if err != nil {
7474+ t.Errorf("CheckHealth with DID returned error: %v", err)
7575+ }
7676+7777+ if !reachable {
7878+ t.Error("Expected hold to be reachable with DID format")
7979+ }
8080+}
8181+8282+func TestCheckHealth_Failure(t *testing.T) {
8383+ // Create test server that returns 500
8484+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
8585+ w.WriteHeader(http.StatusInternalServerError)
8686+ }))
8787+ defer server.Close()
8888+8989+ checker := NewChecker(15 * time.Minute)
9090+ ctx := context.Background()
9191+9292+ reachable, err := checker.CheckHealth(ctx, server.URL)
9393+ if err == nil {
9494+ t.Error("Expected error for 500 status code")
9595+ }
9696+9797+ if reachable {
9898+ t.Error("Expected hold to be unreachable")
9999+ }
100100+}
101101+102102+func TestCheckHealth_Timeout(t *testing.T) {
103103+ // Create test server that delays longer than client timeout
104104+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
105105+ time.Sleep(200 * time.Millisecond) // Longer than 100ms test timeout
106106+ }))
107107+ defer server.Close()
108108+109109+ // Use custom timeout of 100ms for faster test
110110+ checker := NewCheckerWithTimeout(15*time.Minute, 100*time.Millisecond)
111111+ ctx := context.Background()
112112+113113+ reachable, err := checker.CheckHealth(ctx, server.URL)
114114+ if err == nil {
115115+ t.Error("Expected timeout error")
116116+ }
117117+118118+ if reachable {
119119+ t.Error("Expected hold to be unreachable due to timeout")
120120+ }
121121+}
122122+123123+func TestGetStatus_CacheHit(t *testing.T) {
124124+ checker := NewChecker(15 * time.Minute)
125125+ endpoint := "https://example.com"
126126+127127+ // Manually set cached status
128128+ checker.SetStatus(endpoint, true, nil)
129129+130130+ // Get status should return cached value
131131+ status := checker.GetStatus(context.Background(), endpoint)
132132+ if status == nil {
133133+ t.Fatal("GetStatus returned nil")
134134+ }
135135+136136+ if !status.Reachable {
137137+ t.Error("Expected cached status to be reachable")
138138+ }
139139+140140+ if status.LastError != nil {
141141+ t.Errorf("Expected no error, got %v", status.LastError)
142142+ }
143143+}
144144+145145+func TestGetStatus_CacheMiss(t *testing.T) {
146146+ // Create test server
147147+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
148148+ w.WriteHeader(http.StatusOK)
149149+ }))
150150+ defer server.Close()
151151+152152+ checker := NewChecker(15 * time.Minute)
153153+154154+ // Get status should perform check on cache miss
155155+ status := checker.GetStatus(context.Background(), server.URL)
156156+ if status == nil {
157157+ t.Fatal("GetStatus returned nil")
158158+ }
159159+160160+ if !status.Reachable {
161161+ t.Error("Expected status to be reachable")
162162+ }
163163+}
164164+165165+func TestGetStatus_CacheExpiry(t *testing.T) {
166166+ // Create checker with very short TTL
167167+ checker := NewChecker(100 * time.Millisecond)
168168+ endpoint := "https://example.com"
169169+170170+ // Set cached status
171171+ checker.SetStatus(endpoint, true, nil)
172172+173173+ // Wait for cache to expire
174174+ time.Sleep(150 * time.Millisecond)
175175+176176+ // GetCachedStatus should return nil for expired entry
177177+ status := checker.GetCachedStatus(endpoint)
178178+ if status != nil {
179179+ t.Error("Expected nil for expired cache entry")
180180+ }
181181+}
182182+183183+func TestSetStatus(t *testing.T) {
184184+ checker := NewChecker(15 * time.Minute)
185185+ endpoint := "https://example.com"
186186+187187+ // Set status
188188+ checker.SetStatus(endpoint, true, nil)
189189+190190+ // Verify it was set
191191+ status := checker.GetCachedStatus(endpoint)
192192+ if status == nil {
193193+ t.Fatal("Status not found in cache")
194194+ }
195195+196196+ if !status.Reachable {
197197+ t.Error("Expected status to be reachable")
198198+ }
199199+}
200200+201201+func TestCleanup(t *testing.T) {
202202+ checker := NewChecker(1 * time.Minute)
203203+204204+ // Add old entry (simulate old timestamp by manually setting it)
205205+ endpoint := "https://example.com"
206206+ checker.cache[endpoint] = &HealthStatus{
207207+ Reachable: true,
208208+ LastChecked: time.Now().Add(-31 * time.Minute), // 31 minutes ago
209209+ }
210210+211211+ // Add recent entry
212212+ recentEndpoint := "https://recent.com"
213213+ checker.SetStatus(recentEndpoint, true, nil)
214214+215215+ // Run cleanup
216216+ checker.Cleanup()
217217+218218+ // Old entry should be removed
219219+ if checker.GetCachedStatus(endpoint) != nil {
220220+ t.Error("Expected old entry to be cleaned up")
221221+ }
222222+223223+ // Recent entry should remain
224224+ if checker.GetCachedStatus(recentEndpoint) == nil {
225225+ t.Error("Expected recent entry to remain after cleanup")
226226+ }
227227+}
228228+229229+func TestGetCacheStats(t *testing.T) {
230230+ checker := NewChecker(15 * time.Minute)
231231+232232+ // Add some entries
233233+ checker.SetStatus("https://reachable1.com", true, nil)
234234+ checker.SetStatus("https://reachable2.com", true, nil)
235235+ checker.SetStatus("https://unreachable1.com", false, nil)
236236+237237+ stats := checker.GetCacheStats()
238238+239239+ total, ok := stats["total"].(int)
240240+ if !ok || total != 3 {
241241+ t.Errorf("Expected total=3, got %v", stats["total"])
242242+ }
243243+244244+ reachable, ok := stats["reachable"].(int)
245245+ if !ok || reachable != 2 {
246246+ t.Errorf("Expected reachable=2, got %v", stats["reachable"])
247247+ }
248248+249249+ unreachable, ok := stats["unreachable"].(int)
250250+ if !ok || unreachable != 1 {
251251+ t.Errorf("Expected unreachable=1, got %v", stats["unreachable"])
252252+ }
253253+}
+169
pkg/appview/holdhealth/worker.go
···11+package holdhealth
22+33+import (
44+ "context"
55+ "database/sql"
66+ "fmt"
77+ "log"
88+ "sync"
99+ "time"
1010+)
// DBQuerier is the database access the Worker depends on. It is an interface
// so tests can substitute a fake.
type DBQuerier interface {
	// GetUniqueHoldEndpoints returns the hold endpoints that should be
	// health-checked, or an error if they cannot be loaded.
	GetUniqueHoldEndpoints() ([]string, error)
}
1616+1717+// Worker runs background health checks for hold endpoints
1818+type Worker struct {
1919+ checker *Checker
2020+ db DBQuerier
2121+ refreshTicker *time.Ticker
2222+ cleanupTicker *time.Ticker
2323+ stopChan chan struct{}
2424+ wg sync.WaitGroup
2525+}
2626+2727+// NewWorker creates a new background worker
2828+func NewWorker(checker *Checker, db DBQuerier, refreshInterval time.Duration) *Worker {
2929+ return &Worker{
3030+ checker: checker,
3131+ db: db,
3232+ refreshTicker: time.NewTicker(refreshInterval),
3333+ cleanupTicker: time.NewTicker(30 * time.Minute), // Cleanup every 30 minutes
3434+ stopChan: make(chan struct{}),
3535+ }
3636+}
3737+3838+// Start begins the background worker
3939+func (w *Worker) Start(ctx context.Context) {
4040+ w.wg.Add(1)
4141+ go func() {
4242+ defer w.wg.Done()
4343+4444+ log.Println("Hold health worker: Starting background health checks")
4545+4646+ // Perform initial check immediately
4747+ w.refreshAllHolds(ctx)
4848+4949+ for {
5050+ select {
5151+ case <-ctx.Done():
5252+ log.Println("Hold health worker: Context cancelled, stopping")
5353+ return
5454+ case <-w.stopChan:
5555+ log.Println("Hold health worker: Stop signal received")
5656+ return
5757+ case <-w.refreshTicker.C:
5858+ w.refreshAllHolds(ctx)
5959+ case <-w.cleanupTicker.C:
6060+ log.Println("Hold health worker: Running cache cleanup")
6161+ w.checker.Cleanup()
6262+ }
6363+ }
6464+ }()
6565+}
6666+6767+// Stop gracefully stops the worker
6868+func (w *Worker) Stop() {
6969+ close(w.stopChan)
7070+ w.refreshTicker.Stop()
7171+ w.cleanupTicker.Stop()
7272+ w.wg.Wait()
7373+ log.Println("Hold health worker: Stopped")
7474+}
7575+7676+// refreshAllHolds queries the database for unique hold endpoints and refreshes their health status
7777+func (w *Worker) refreshAllHolds(ctx context.Context) {
7878+ log.Println("Hold health worker: Starting refresh cycle")
7979+8080+ // Get unique hold endpoints from database
8181+ endpoints, err := w.db.GetUniqueHoldEndpoints()
8282+ if err != nil {
8383+ log.Printf("Hold health worker: Failed to fetch hold endpoints: %v", err)
8484+ return
8585+ }
8686+8787+ if len(endpoints) == 0 {
8888+ log.Println("Hold health worker: No hold endpoints to check")
8989+ return
9090+ }
9191+9292+ log.Printf("Hold health worker: Checking %d unique hold endpoints", len(endpoints))
9393+9494+ // Check health concurrently with rate limiting
9595+ // Use a semaphore to limit concurrent requests (max 10 at a time)
9696+ sem := make(chan struct{}, 10)
9797+ var wg sync.WaitGroup
9898+9999+ reachable := 0
100100+ unreachable := 0
101101+ var statsMu sync.Mutex
102102+103103+ for _, endpoint := range endpoints {
104104+ wg.Add(1)
105105+106106+ go func(ep string) {
107107+ defer wg.Done()
108108+109109+ // Acquire semaphore
110110+ sem <- struct{}{}
111111+ defer func() { <-sem }()
112112+113113+ // Check health
114114+ isReachable, err := w.checker.CheckHealth(ctx, ep)
115115+116116+ // Update cache
117117+ w.checker.SetStatus(ep, isReachable, err)
118118+119119+ // Update stats
120120+ statsMu.Lock()
121121+ if isReachable {
122122+ reachable++
123123+ } else {
124124+ unreachable++
125125+ log.Printf("Hold health worker: Hold unreachable: %s (error: %v)", ep, err)
126126+ }
127127+ statsMu.Unlock()
128128+ }(endpoint)
129129+ }
130130+131131+ // Wait for all checks to complete
132132+ wg.Wait()
133133+134134+ log.Printf("Hold health worker: Refresh complete - %d reachable, %d unreachable", reachable, unreachable)
135135+}
// DBAdapter implements DBQuerier on top of a *sql.DB.
type DBAdapter struct {
	// db is the connection pool the queries run against.
	db *sql.DB
}
141141+142142+// NewDBAdapter creates a new database adapter
143143+func NewDBAdapter(db *sql.DB) *DBAdapter {
144144+ return &DBAdapter{db: db}
145145+}
146146+147147+// GetUniqueHoldEndpoints queries the database for unique hold endpoints
148148+func (a *DBAdapter) GetUniqueHoldEndpoints() ([]string, error) {
149149+ rows, err := a.db.Query(`SELECT DISTINCT hold_endpoint FROM manifests WHERE hold_endpoint != ''`)
150150+ if err != nil {
151151+ return nil, fmt.Errorf("failed to query hold endpoints: %w", err)
152152+ }
153153+ defer rows.Close()
154154+155155+ var endpoints []string
156156+ for rows.Next() {
157157+ var endpoint string
158158+ if err := rows.Scan(&endpoint); err != nil {
159159+ return nil, fmt.Errorf("failed to scan endpoint: %w", err)
160160+ }
161161+ endpoints = append(endpoints, endpoint)
162162+ }
163163+164164+ if err := rows.Err(); err != nil {
165165+ return nil, fmt.Errorf("error iterating rows: %w", err)
166166+ }
167167+168168+ return endpoints, nil
169169+}