···11-"""TODO"""
11+"""Local repository storage for atdata datasets.
22+33+This module provides a local storage backend for atdata datasets using:
44+- S3-compatible object storage for dataset tar files and metadata
55+- Redis for indexing and tracking datasets
66+77+The main classes are:
88+- Repo: Manages dataset storage in S3 with Redis indexing
99+- Index: Redis-backed index for tracking dataset metadata
1010+- BasicIndexEntry: Index entry representing a stored dataset
1111+1212+This is intended for development and small-scale deployment before
1313+migrating to the full atproto PDS infrastructure.
1414+"""
215316##
417# Imports
···5568# Helpers
56695770def _kind_str_for_sample_type( st: Type[PackableSample] ) -> str:
5858- """TODO"""
7171+ """Convert a sample type to a fully-qualified string identifier.
7272+7373+ Args:
7474+ st: The sample type class.
7575+7676+ Returns:
7777+ A string in the format 'module.name' identifying the sample type.
7878+ """
5979 return f'{st.__module__}.{st.__name__}'
60806181def _decode_bytes_dict( d: dict[bytes, bytes] ) -> dict[str, str]:
6262- """TODO"""
8282+ """Decode a dictionary with byte keys and values to strings.
8383+8484+ Redis returns dictionaries with bytes keys/values, this converts them to strings.
8585+8686+ Args:
8787+ d: Dictionary with bytes keys and values.
8888+8989+ Returns:
9090+ Dictionary with UTF-8 decoded string keys and values.
9191+ """
6392 return {
6493 k.decode('utf-8'): v.decode('utf-8')
6594 for k, v in d.items()
···7110072101@dataclass
73102class BasicIndexEntry:
7474- """TODO"""
103103+ """Index entry for a dataset stored in the repository.
104104+105105+ Tracks metadata about a dataset stored in S3, including its location,
106106+ type, and unique identifier.
107107+ """
75108 ##
7610977110 wds_url: str
7878- """TODO"""
111111+ """WebDataset URL for the dataset tar files, for use with atdata.Dataset."""
112112+79113 sample_kind: str
8080- """TODO"""
114114+ """Fully-qualified sample type name (e.g., 'module.ClassName')."""
8111582116 metadata_url: str | None
8383- """TODO"""
117117+ """S3 URL to the dataset's metadata msgpack file, if any."""
8411885119 uuid: str | None = field( default_factory = lambda: str( uuid4() ) )
8686- """TODO"""
120120+ """Unique identifier for this dataset entry. Defaults to a new UUID if not provided."""
8712188122 def write_to( self, redis: Redis ):
8989- """TODO"""
123123+ """Persist this index entry to Redis.
124124+125125+ Stores the entry as a Redis hash with key 'BasicIndexEntry:{uuid}'.
126126+127127+ Args:
128128+ redis: Redis connection to write to.
129129+ """
90130 save_key = f'BasicIndexEntry:{self.uuid}'
91131 # TODO figure out how to get linting to work correctly here
92132 redis.hset( save_key, mapping = asdict( self ) )
9313394134def _s3_env( credentials_path: str | Path ) -> dict[str, Any]:
9595- """TODO"""
135135+ """Load S3 credentials from a .env file.
136136+137137+ Args:
138138+ credentials_path: Path to .env file containing S3 credentials.
139139+140140+ Returns:
141141+ Dictionary with AWS_ENDPOINT, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY.
142142+143143+ Raises:
144144+ AssertionError: If required credentials are missing from the file.
145145+ """
96146 ##
97147 credentials_path = Path( credentials_path )
98148 env_values = dotenv_values( credentials_path )
99149 assert 'AWS_ENDPOINT' in env_values
100150 assert 'AWS_ACCESS_KEY_ID' in env_values
101151 assert 'AWS_SECRET_ACCESS_KEY' in env_values
102102-152152+103153 return {
104154 k: env_values[k]
105155 for k in (
···110160 }
111161112162def _s3_from_credentials( creds: str | Path | dict ) -> S3FileSystem:
113113- """TODO"""
163163+ """Create an S3FileSystem from credentials.
164164+165165+ Args:
166166+ creds: Either a path to a .env file with credentials, or a dict
167167+ containing AWS_ENDPOINT, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY.
168168+169169+ Returns:
170170+ Configured S3FileSystem instance.
171171+ """
114172 ##
115173 if not isinstance( creds, dict ):
116174 creds = _s3_env( creds )
117117-175175+118176 return S3FileSystem(
119177 endpoint_url = creds['AWS_ENDPOINT'],
120178 key = creds['AWS_ACCESS_KEY_ID'],
···126184# Classes
127185128186class Repo:
129129- """TODO"""
187187+ """Repository for storing and managing atdata datasets.
188188+189189+ Provides storage of datasets in S3-compatible object storage with Redis-based
190190+ indexing. Datasets are stored as WebDataset tar files with optional metadata.
191191+192192+ Attributes:
193193+ s3_credentials: S3 credentials dictionary or None.
194194+ bucket_fs: S3FileSystem instance or None.
195195+ hive_path: Path within S3 bucket for storing datasets.
196196+ hive_bucket: Name of the S3 bucket.
197197+ index: Index instance for tracking datasets.
198198+ """
130199131200 ##
132201···139208 #
140209 **kwargs
141210 ) -> None:
142142- """TODO"""
211211+ """Initialize a repository.
212212+213213+ Args:
214214+ s3_credentials: Path to .env file with S3 credentials, or dict with
215215+ AWS_ENDPOINT, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY.
216216+ If None, S3 functionality will be disabled.
217217+ hive_path: Path within the S3 bucket to store datasets.
218218+ Required if s3_credentials is provided.
219219+ redis: Redis connection for indexing. If None, creates a new connection.
220220+ **kwargs: Additional arguments (reserved for future use).
221221+222222+ Raises:
223223+ ValueError: If hive_path is not provided when s3_credentials is set.
224224+ """
143225144226 if s3_credentials is None:
145227 self.s3_credentials = None
···174256 #
175257 **kwargs
176258 ) -> tuple[BasicIndexEntry, Dataset[T]]:
177177- """TODO"""
259259+ """Insert a dataset into the repository.
260260+261261+ Writes the dataset to S3 as WebDataset tar files, stores metadata,
262262+ and creates an index entry in Redis.
263263+264264+ Args:
265265+ ds: The dataset to insert.
266266+ cache_local: If True, write to local temporary storage first, then
267267+ copy to S3. This can be faster for some workloads.
268268+ **kwargs: Additional arguments passed to wds.ShardWriter.
269269+270270+ Returns:
271271+ A tuple of (index_entry, new_dataset) where:
272272+ - index_entry: BasicIndexEntry for the stored dataset
273273+ - new_dataset: Dataset object pointing to the stored copy
274274+275275+ Raises:
276276+ AssertionError: If S3 credentials or hive_path are not configured.
277277+ RuntimeError: If no shards were written.
278278+ """
178279179280 assert self.s3_credentials is not None
180281 assert self.hive_bucket is not None
···318419319420320421class Index:
321321- """TODO"""
422422+ """Redis-backed index for tracking datasets in a repository.
423423+424424+ Maintains a registry of BasicIndexEntry objects in Redis, allowing
425425+ enumeration and lookup of stored datasets.
426426+427427+ Attributes:
428428+ _redis: Redis connection for index storage.
429429+ """
322430323431 ##
324432···326434 redis: Redis | None = None,
327435 **kwargs
328436 ) -> None:
329329- """TODO"""
437437+ """Initialize an index.
438438+439439+ Args:
440440+ redis: Redis connection to use. If None, creates a new connection
441441+ using the provided kwargs.
442442+ **kwargs: Additional arguments passed to Redis() constructor if
443443+ redis is None.
444444+ """
330445 ##
331446332447 if redis is not None:
···340455341456 @property
342457 def all_entries( self ) -> list[BasicIndexEntry]:
343343- """TODO"""
458458+ """Get all index entries as a list.
459459+460460+ Returns:
461461+ List of all BasicIndexEntry objects in the index.
462462+ """
344463 return list( self.entries )
345464346465 @property
347466 def entries( self ) -> Generator[BasicIndexEntry, None, None]:
348348- """TODO"""
467467+ """Iterate over all index entries.
468468+469469+ Scans Redis for all BasicIndexEntry keys and yields them one at a time.
470470+471471+ Yields:
472472+ BasicIndexEntry objects from the index.
473473+ """
349474 ##
350475 for key in self._redis.scan_iter( match = 'BasicIndexEntry:*' ):
351476 # TODO typing issue for `redis`
352477 cur_entry_data = _decode_bytes_dict( self._redis.hgetall( key ) )
353478 cur_entry = BasicIndexEntry( **cur_entry_data )
354479 yield cur_entry
355355-480480+356481 return
357482358483 def add_entry( self, ds: Dataset,
359484 uuid: str | None = None,
360485 ) -> BasicIndexEntry:
361361- """TODO"""
486486+ """Add a dataset to the index.
487487+488488+ Creates a BasicIndexEntry for the dataset and persists it to Redis.
489489+490490+ Args:
491491+ ds: The dataset to add to the index.
492492+ uuid: Optional UUID for the entry. If None, a new UUID is generated.
493493+494494+ Returns:
495495+ The created BasicIndexEntry object.
496496+ """
362497 ##
363498 temp_sample_kind = _kind_str_for_sample_type( ds.sample_type )
364499