this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Initial commit.

alice 74cbfdc7

+388
+164
.gitignore
··· 1 + # Byte-compiled / optimized / DLL files 2 + __pycache__/ 3 + *.py[cod] 4 + *$py.class 5 + 6 + # C extensions 7 + *.so 8 + 9 + # Distribution / packaging 10 + .Python 11 + build/ 12 + develop-eggs/ 13 + dist/ 14 + downloads/ 15 + eggs/ 16 + .eggs/ 17 + lib/ 18 + lib64/ 19 + parts/ 20 + sdist/ 21 + var/ 22 + wheels/ 23 + share/python-wheels/ 24 + *.egg-info/ 25 + .installed.cfg 26 + *.egg 27 + MANIFEST 28 + 29 + # PyInstaller 30 + # Usually these files are written by a python script from a template 31 + # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 + *.manifest 33 + *.spec 34 + 35 + # Installer logs 36 + pip-log.txt 37 + pip-delete-this-directory.txt 38 + 39 + # Unit test / coverage reports 40 + htmlcov/ 41 + .tox/ 42 + .nox/ 43 + .coverage 44 + .coverage.* 45 + .cache 46 + nosetests.xml 47 + coverage.xml 48 + *.cover 49 + *.py,cover 50 + .hypothesis/ 51 + .pytest_cache/ 52 + cover/ 53 + 54 + # Translations 55 + *.mo 56 + *.pot 57 + 58 + # Django stuff: 59 + *.log 60 + local_settings.py 61 + db.sqlite3 62 + db.sqlite3-journal 63 + 64 + # Flask stuff: 65 + instance/ 66 + .webassets-cache 67 + 68 + # Scrapy stuff: 69 + .scrapy 70 + 71 + # Sphinx documentation 72 + docs/_build/ 73 + 74 + # PyBuilder 75 + .pybuilder/ 76 + target/ 77 + 78 + # Jupyter Notebook 79 + .ipynb_checkpoints 80 + 81 + # IPython 82 + profile_default/ 83 + ipython_config.py 84 + 85 + # pyenv 86 + # For a library or package, you might want to ignore these files since the code is 87 + # intended to run in multiple environments; otherwise, check them in: 88 + # .python-version 89 + 90 + # pipenv 91 + # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 + # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 + # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 + # install all needed dependencies. 
95 + #Pipfile.lock 96 + 97 + # poetry 98 + # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 + # This is especially recommended for binary packages to ensure reproducibility, and is more 100 + # commonly ignored for libraries. 101 + # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 + #poetry.lock 103 + 104 + # pdm 105 + # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 + #pdm.lock 107 + # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 + # in version control. 109 + # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 + .pdm.toml 111 + .pdm-python 112 + .pdm-build/ 113 + 114 + # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 + __pypackages__/ 116 + 117 + # Celery stuff 118 + celerybeat-schedule 119 + celerybeat.pid 120 + 121 + # SageMath parsed files 122 + *.sage.py 123 + 124 + # Environments 125 + .env 126 + .venv 127 + env/ 128 + venv/ 129 + ENV/ 130 + env.bak/ 131 + venv.bak/ 132 + 133 + # Spyder project settings 134 + .spyderproject 135 + .spyproject 136 + 137 + # Rope project settings 138 + .ropeproject 139 + 140 + # mkdocs documentation 141 + /site 142 + 143 + # mypy 144 + .mypy_cache/ 145 + .dmypy.json 146 + dmypy.json 147 + 148 + # Pyre type checker 149 + .pyre/ 150 + 151 + # pytype static type analyzer 152 + .pytype/ 153 + 154 + # Cython debug symbols 155 + cython_debug/ 156 + 157 + # PyCharm 158 + # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 + # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 + # and can be added to the global gitignore or merged into this file. For a more nuclear 161 + # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 + #.idea/ 163 + 164 + car_files
+185
app.py
"""CAR file fetcher and parser service.

Exposes a single POST /fetch endpoint that downloads a repository CAR file
for a DID from a PDS host, stores it under ``CAR_FILES_DIR`` and returns the
records extracted by ``atmst.cartool.print_all_records``.
"""

import asyncio
import json
import logging
import multiprocessing
import os
import re
import uuid
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Dict

import aiohttp
from atmst.cartool import print_all_records
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.FileHandler("car_service.log"), logging.StreamHandler()],
)

app = FastAPI(title="CAR File Fetcher and Parser")

CAR_FILES_DIR = "car_files"
os.makedirs(CAR_FILES_DIR, exist_ok=True)

MAX_RETRIES = 10
INITIAL_BACKOFF = 1
BACKOFF_FACTOR = 2

# Both request fields come from untrusted callers and end up in a file name
# and in an outgoing URL, so they are checked against strict allow-lists.
# The DID pattern follows the atproto DID syntax; none of its characters can
# form a path separator, so the derived file name stays in CAR_FILES_DIR.
_DID_PATTERN = re.compile(r"did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]")
# Host name with an optional numeric port; rejects '/', '?', '#', '@' and
# whitespace, which would otherwise rewrite the path or query of the URL.
_PDS_HOST_PATTERN = re.compile(
    r"[A-Za-z0-9](?:[A-Za-z0-9.-]{0,251}[A-Za-z0-9])?(?::[0-9]{1,5})?"
)

executor = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count())


class FetchRequest(BaseModel):
    """Request body for POST /fetch."""

    # DID of the repository to download.
    did: str
    # Host (optionally host:port) of the PDS serving the repository.
    pds: str


class RetryableError(Exception):
    """Exception raised for retryable errors."""

    pass


class NonRetryableError(Exception):
    """Exception raised for non-retryable errors."""

    pass


def parse_car(car_file_path: str):
    """Parse the CAR file at ``car_file_path`` and return the records as a JSON string.

    NOTE(review): the result is already JSON-encoded text; returning it from a
    FastAPI handler presumably encodes it a second time - confirm that clients
    expect a JSON string rather than a JSON object.
    """
    return json.dumps(print_all_records(car_file_path, True))


def _save_car(car_file_path: str, car_bytes: bytes) -> None:
    """Write ``car_bytes`` to ``car_file_path`` atomically.

    The data is written to a uniquely named temporary file in the same
    directory and then moved into place, so an existing file is replaced in
    one step and a concurrent reader never observes a partially written file.
    """
    tmp_path = f"{car_file_path}.{uuid.uuid4().hex}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            f.write(car_bytes)
        os.replace(tmp_path, car_file_path)
    except BaseException:
        # Do not leave the temporary file behind when the write or move fails.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


@retry(
    reraise=True,
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_random_exponential(
        multiplier=INITIAL_BACKOFF, min=INITIAL_BACKOFF, max=60
    ),
    retry=(
        retry_if_exception_type(RetryableError)
        | retry_if_exception_type(asyncio.TimeoutError)
    ),
)
async def fetch_car_with_retry(
    session: aiohttp.ClientSession, url: str, headers: Dict[str, str], did: str
) -> bytes:
    """
    Fetches the CAR file with retry logic using tenacity.

    Args:
        session (aiohttp.ClientSession): The HTTP session.
        url (str): The URL to fetch.
        headers (Dict[str, str]): The request headers.
        did (str): The DID being fetched (for logging purposes).

    Returns:
        bytes: The fetched CAR file bytes.

    Raises:
        NonRetryableError: For non-retryable HTTP status codes or an empty body.
        RetryableError: For retryable HTTP status codes, timeouts and client
            errors; re-raised to the caller once MAX_RETRIES attempts failed.
    """
    try:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                car_bytes = await response.read()
                if not car_bytes:
                    logging.error("Received empty CAR file.")
                    raise NonRetryableError("Received empty CAR file.")
                return car_bytes  # Successful fetch
            elif response.status in {429, 500, 502, 503, 504}:
                logging.warning(
                    f"Received HTTP {response.status} for DID {did}. Retrying..."
                )
                raise RetryableError(f"HTTP {response.status} error.")
            else:
                logging.error(f"Failed to fetch CAR file: HTTP {response.status}")
                raise NonRetryableError(f"HTTP {response.status} error.")
    except aiohttp.ClientResponseError as e:
        logging.error(f"Client response error for DID {did}: {e}")
        raise RetryableError(str(e)) from e
    except asyncio.TimeoutError as e:
        logging.warning(f"Timeout while fetching DID {did}. Retrying...")
        raise RetryableError("Timeout error.") from e
    except aiohttp.ClientError as e:
        logging.error(f"Client error for DID {did}: {e}")
        raise RetryableError(str(e)) from e


@app.post("/fetch")
async def fetch_car_file(request: FetchRequest):
    """
    Fetches the CAR file for the given DID and PDS, parses it, saves it,
    and returns the extracted data as JSON.

    Raises:
        HTTPException: 400 when the DID or PDS host is malformed, 502 when the
            upstream fetch fails, 500 when saving or parsing fails.
    """
    did = request.did
    pds = request.pds

    # Reject malformed input before it reaches the file system or the URL.
    # %r keeps control characters in rejected values out of the log stream.
    if not _DID_PATTERN.fullmatch(did):
        logging.warning("Rejected request with invalid DID: %r", did)
        raise HTTPException(status_code=400, detail="Invalid DID.")
    if not _PDS_HOST_PATTERN.fullmatch(pds):
        logging.warning("Rejected request with invalid PDS host: %r", pds)
        raise HTTPException(status_code=400, detail="Invalid PDS host.")

    # NOTE(review): the host is only checked syntactically; a caller can still
    # point the service at internal hosts. Add an allow-list if this endpoint
    # is reachable by untrusted clients.
    url = f"https://{pds}/xrpc/com.atproto.sync.getRepo?did={did}"
    headers = {
        "Accept": "application/vnd.ipld.car",
        "User-Agent": "emojistats-backfiller/0.0.1",
    }

    logging.info(f"Fetching CAR file for DID: {did} from PDS: {pds}")

    # Fetch CAR file with retries using tenacity
    try:
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=60)
        ) as session:
            car_bytes = await fetch_car_with_retry(session, url, headers, did)
    except RetryableError as e:
        logging.error(f"Retryable error fetching CAR file for DID {did}: {e}")
        raise HTTPException(
            status_code=502,
            detail=f"Failed to fetch CAR file after {MAX_RETRIES} attempts.",
        )
    except NonRetryableError as e:
        logging.error(f"Non-retryable error fetching CAR file for DID {did}: {e}")
        raise HTTPException(status_code=502, detail=str(e))
    except Exception as e:
        logging.error(f"Unexpected error while fetching CAR file for DID {did}: {e}")
        raise HTTPException(status_code=500, detail="Internal server error.")

    loop = asyncio.get_running_loop()

    # Create filename from DID, replacing colons with underscores
    filename = f"{did.replace(':', '_')}.car"
    car_file_path = os.path.join(CAR_FILES_DIR, filename)

    # Save the CAR file in the thread pool so disk I/O does not block the
    # event loop; an existing file for the same DID is replaced atomically.
    try:
        await loop.run_in_executor(
            executor, partial(_save_car, car_file_path, car_bytes)
        )
        logging.info(f"Saved CAR file to {car_file_path}")
    except Exception as e:
        logging.error(f"Error saving CAR file: {e}")
        raise HTTPException(status_code=500, detail="Error saving CAR file.")

    # Parse CAR file in thread pool
    try:
        parsed_data = await loop.run_in_executor(
            executor, partial(parse_car, car_file_path)
        )
        logging.info(f"Parsed CAR file for DID: {did}")
    except Exception as e:
        logging.error(f"Error parsing CAR file: {e}")
        raise HTTPException(status_code=500, detail="Error parsing CAR file.")

    return parsed_data
+28
log_config.ini
··· 1 + [loggers] 2 + keys=root 3 + 4 + [handlers] 5 + keys=consoleHandler,fileHandler 6 + 7 + [formatters] 8 + keys=defaultFormatter 9 + 10 + [logger_root] 11 + level=INFO 12 + handlers=consoleHandler,fileHandler 13 + 14 + [handler_consoleHandler] 15 + class=StreamHandler 16 + level=INFO 17 + formatter=defaultFormatter 18 + args=(sys.stdout,) 19 + 20 + [handler_fileHandler] 21 + class=FileHandler 22 + level=INFO 23 + formatter=defaultFormatter 24 + args=('car_service.log', 'a') 25 + 26 + [formatter_defaultFormatter] 27 + format=%(asctime)s [%(levelname)s] %(message)s 28 + datefmt=%Y-%m-%d %H:%M:%S
+9
requirements.txt
··· 1 + fastapi 2 + uvicorn 3 + aiohttp 4 + async-timeout 5 + pydantic 6 + tenacity 7 + python-dotenv 8 + ruff 9 + ../atmst
+2
run.sh
#!/bin/bash
# Launch the CAR fetcher/parser API with uvicorn on port 8000, with
# auto-reload and the logging setup from log_config.ini.
set -euo pipefail

# `app:app` and log_config.ini are resolved relative to the working
# directory, so switch to the directory holding this script first; the
# service can then be started from anywhere.
cd "$(dirname "${BASH_SOURCE[0]}")"

# exec replaces the shell with uvicorn so SIGTERM/SIGINT reach the server
# directly instead of being held by an idle bash parent process.
exec uvicorn app:app --host 0.0.0.0 --port 8000 --reload --log-config log_config.ini