this repo has no description
4
fork

Configure Feed

Select the types of activity you want to include in your feed.

Some work on getting posts from a user

+288 -1
+2
.gitignore
··· 1 1 .env 2 + **/__pycache__/** 3 + data/*
+6 -1
crawl_follows.py
··· 8 8 from typing import Tuple, List, Dict 9 9 10 10 from atproto import AsyncClient 11 + from atproto import exceptions as at_exceptions 11 12 import pandas as pd 12 13 13 14 logger = logging.getLogger(__name__) ··· 124 125 # Try to only send 10 requests a second 125 126 batch_count = 1 126 127 fail_count = 0 127 - rate_limiter = RateLimit(10) 128 + rate_limiter = RateLimit(BATCH_SIZE) 128 129 while len(to_explore): 129 130 batch = to_explore[:BATCH_SIZE] 130 131 to_explore = to_explore[BATCH_SIZE:] ··· 138 139 follower, follows = await result 139 140 follow_map[follower] = follows 140 141 logger.info(f"{follower} follows {len(follows)} (public) accounts") 142 + except at_exceptions.BadRequestError as e: 143 + # Bad request is probably a profile that's private or deleted 144 + logger.info(f"Bad Request: {e.response.content.error}") 145 + continue 141 146 except Exception as e: 142 147 logger.error(f"Failed to get followers: {e}", exc_info=1) 143 148 fail_count += 1
+213
get_posts.py
"""Fetch posts for Bluesky accounts and flatten them into plain dicts.

Credentials are read from the ``BSKY_USER`` and ``BSKY_APP_PW`` environment
variables; the follow-graph file and save directory come from the command
line (see ``main``).
"""
import argparse
import asyncio
import decimal
import json
import logging
import os
import sys
import time
from typing import Tuple, List, Dict

from atproto import AsyncClient
from atproto import exceptions as at_exceptions
from atproto_client.models.app.bsky.feed.defs import FeedViewPost
import pandas as pd
from rich import print

from crawl_follows import RateLimit

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create formatter
formatter = logging.Formatter(
    "%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)

# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


BATCH_SIZE = 10
CHECKPOINT_THRESHOLD = 1_000
FOLLOWER_THRESHOLD = 150
REQUIRED_ENV = ("BSKY_USER", "BSKY_APP_PW")


def process_post(top: FeedViewPost):
    """Flatten one feed item into a plain dict.

    Args:
        top: A feed entry as returned in ``get_author_feed(...).feed``.

    Returns:
        A dict with ``author``, ``text``, ``cid``, ``created_at`` and
        ``repost`` keys, plus ``embed`` when the post has an embed and
        ``reply_parent`` when the feed item carries reply context. Fields
        that the quoted record or the reply parent/root does not expose are
        set to ``None`` instead of raising.
    """
    post = top.post
    data = {
        "author": post.author.did,
        "text": post.record.text,
        "cid": post.cid,
        "created_at": post.record.created_at,
        "repost": (
            top.reason is not None
            and top.reason.py_type == "app.bsky.feed.defs#reasonRepost"
        ),
    }

    embed = post.embed
    if embed:
        # Embed types not handled below keep this empty dict.
        data["embed"] = {}
        if embed.py_type == "app.bsky.embed.external#view":
            data["embed"] = {
                "title": embed.external.title,
                "description": embed.external.description,
                "uri": embed.external.uri,
                "thumb": embed.external.thumb,
            }
        elif embed.py_type == "app.bsky.embed.record#view":
            # The quoted record is a union: not-found / blocked / detached
            # variants have no author or value, so read them defensively.
            quoted = embed.record
            quoted_author = getattr(quoted, "author", None)
            quoted_value = getattr(quoted, "value", None)
            data["embed"] = {
                "author": getattr(quoted_author, "did", None),
                "text": getattr(quoted_value, "text", None),
                "cid": getattr(quoted, "cid", None),
                "created_at": getattr(quoted_value, "created_at", None),
            }
        elif embed.py_type == "app.bsky.embed.images#view":
            data["embed"]["images"] = [
                {
                    "alt_text": image.alt,
                    "full_url": image.fullsize,
                    "thumb_url": image.thumb,
                }
                for image in embed.images
            ]
        elif embed.py_type == "app.bsky.embed.video#view":
            data["embed"]["video"] = {
                "alt": embed.alt,
                "full_url": embed.playlist,
                "thumb_url": embed.thumbnail,
            }

    if top.reply:
        # Parent and root may be not-found or blocked placeholders that lack
        # author / record / cid; missing fields become None.
        parent = top.reply.parent
        parent_author = getattr(parent, "author", None)
        parent_record = getattr(parent, "record", None)
        data["reply_parent"] = {
            "author": getattr(parent_author, "did", None),
            "text": getattr(parent_record, "text", None),
            "cid": getattr(parent, "cid", None),
            "created_at": getattr(parent_record, "created_at", None),
            "root_cid": getattr(top.reply.root, "cid", None),
        }

    return data


async def get_all_posts(
    client: AsyncClient,
    rate_limit: RateLimit,
    account_did: str,
    preview_count: int = 4,
):
    """Fetch an account's author feed and process its leading posts.

    Makes a single ``get_author_feed`` call (no cursor pagination yet), prints
    each inspected feed item, and runs it through ``process_post``.

    Args:
        client: Logged-in client used for the request.
        rate_limit: Limiter awaited before the request is sent.
        account_did: DID of the account whose feed is fetched.
        preview_count: Maximum number of feed items to process.

    Returns:
        The list of processed post dicts; shorter than ``preview_count`` when
        the feed has fewer items.
    """
    await rate_limit.acquire()
    data = await client.get_author_feed(
        actor=account_did,
        filter="posts_and_author_threads",
    )
    processed = []
    # Slice rather than index so short or empty feeds do not raise IndexError.
    for item in data.feed[:preview_count]:
        print(item)
        processed.append(process_post(item))
    return processed


async def retrieve_posts(user: str, app_pw: str, graph_file: str, checkpoint_dir: str):
    """Log in and fetch posts for the selected accounts in batches.

    Args:
        user: Account handle used to log in.
        app_pw: App password used to log in.
        graph_file: Path to the follow-graph file (not read while the loader
            below is commented out).
        checkpoint_dir: Directory holding one file per completed account;
            created if missing.

    Exits the process with status 1 if the checkpoint directory cannot be
    created or listed.
    """
    # If checkpoint dir doesn't exist, try to create it
    if not os.path.isdir(checkpoint_dir):
        logger.info("Checkpoint dir doesn't exist, creating...")
        try:
            # makedirs also creates missing parent directories.
            os.makedirs(checkpoint_dir, exist_ok=True)
        except Exception as e:
            logger.error(f"Failed to created checkpoint dir, {checkpoint_dir}\n{e}")
            sys.exit(1)

    # Checkpoint folders contain one file per user
    try:
        completed_accounts = set(os.listdir(checkpoint_dir))
    except Exception as e:
        logger.error(
            f"Failed to recover from checkpoint dir, {checkpoint_dir}\n{e}",
            exc_info=1,
        )
        sys.exit(1)

    # NOTE: graph loading is disabled while the post pipeline is developed;
    # a single hard-coded account is used instead.
    # Load follow graph parquet file
    # to_explore = dict()
    # try:
    #     logger.info("Parsing follower graph file...")
    #     follow_df = pd.read_parquet(graph_file)
    # except Exception as e:
    #     logger.error(f"Failed to open follow graph file, {graph_file}\n{e}")
    #     sys.exit(1)

    # for _, row in follow_df.iterrows():
    #     for acct in row["follows"]:
    #         if acct not in completed_accounts:
    #             if acct not in to_explore:
    #                 to_explore[acct] = 0
    #             to_explore[acct] += 1

    # accts = [
    #     (acct, follows)
    #     for acct, follows in to_explore.items()
    #     if follows >= FOLLOWER_THRESHOLD
    # ]
    # accts.sort(key=lambda x: -1 * x[1])
    accts = [("did:plc:5o6k7jvowuyaquloafzn3cfw", 8604)]

    logger.info(f"Num of accounts to retrieve posts from: {len(accts)}")

    client = AsyncClient()
    await client.login(user, app_pw)

    # Get all posts for accounts
    fail_count = 0
    rate_limiter = RateLimit(BATCH_SIZE)
    for start in range(0, len(accts), BATCH_SIZE):
        batch = [acct for acct, _ in accts[start : start + BATCH_SIZE]]
        # Collect exceptions as results so one failing account does not
        # abort the rest of the batch.
        results = await asyncio.gather(
            *(get_all_posts(client, rate_limiter, acct) for acct in batch),
            return_exceptions=True,
        )
        for acct, result in zip(batch, results):
            if isinstance(result, at_exceptions.BadRequestError):
                # Bad request is probably a profile that's private or deleted
                logger.info(f"Bad Request: {result.response.content.error}")
            elif isinstance(result, BaseException):
                logger.error(
                    f"Failed to get posts for {acct}: {result}", exc_info=result
                )
                fail_count += 1

    if fail_count:
        logger.warning(f"Failed to retrieve posts for {fail_count} account(s)")


def main():
    """Validate the environment, parse CLI arguments and run the retrieval.

    Raises:
        ValueError: If a variable named in ``REQUIRED_ENV`` is not set.
    """
    for key in REQUIRED_ENV:
        if key not in os.environ:
            raise ValueError(f"Must set '{key}' env var")

    user_name = os.environ["BSKY_USER"]
    app_pw = os.environ["BSKY_APP_PW"]

    parser = argparse.ArgumentParser(
        prog="GetPosts",
        description="Get all posts for accounts in provided follow graph",
    )
    parser.add_argument(
        "--graph-file",
        dest="graph_file",
        required=True,
        help="File with follow graph",
    )
    parser.add_argument(
        "--save-dir",
        dest="save_dir",
        required=True,
        help="Where to store crawl data",
    )
    # Both options are required, so argparse itself rejects a missing value;
    # no further None check is needed here.
    args = parser.parse_args()

    asyncio.run(
        retrieve_posts(
            user_name,
            app_pw,
            graph_file=args.graph_file,
            checkpoint_dir=args.save_dir,
        )
    )


if __name__ == "__main__":
    main()
+1
pyproject.toml
··· 8 8 "atproto>=0.0.61", 9 9 "pandas>=2.3.1", 10 10 "pyarrow>=21.0.0", 11 + "rich>=14.1.0", 11 12 ]
+66
uv.lock
··· 247 247 { name = "atproto" }, 248 248 { name = "pandas" }, 249 249 { name = "pyarrow" }, 250 + { name = "rich" }, 250 251 ] 251 252 252 253 [package.metadata] ··· 254 255 { name = "atproto", specifier = ">=0.0.61" }, 255 256 { name = "pandas", specifier = ">=2.3.1" }, 256 257 { name = "pyarrow", specifier = ">=21.0.0" }, 258 + { name = "rich", specifier = ">=14.1.0" }, 257 259 ] 258 260 259 261 [[package]] ··· 395 397 { url = "https://files.pythonhosted.org/packages/c6/4a/80c127f873df5025af159b4cb08454f1034c835c1d111cf17da5def803ff/libipld-3.1.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:7fd70b7201f547f6bf338ce8efef1bd40b1a19e8ae1d4a8026692e63f27fd952", size = 480400, upload-time = "2025-06-24T23:12:47.692Z" }, 396 398 { url = "https://files.pythonhosted.org/packages/2c/c8/23f2cdbdf289543cf81902d5de163f805a2f7634a615a5098cd6097a2723/libipld-3.1.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:9b4bb2748f1752cd0aadb880dd43e7f93cf41d055ea6e041a4b1d126b5364b97", size = 473436, upload-time = "2025-06-24T23:12:48.762Z" }, 397 399 { url = "https://files.pythonhosted.org/packages/0c/30/6777d8172b54e67ce2e94aee00c9804f46b32adce24369e883265e6416ac/libipld-3.1.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a923ef63501b67649ef767a6e178e862173aaf31380010922f5a2c7f6ea889af", size = 175417, upload-time = "2025-06-24T23:12:49.763Z" }, 400 + ] 401 + 402 + [[package]] 403 + name = "markdown-it-py" 404 + version = "3.0.0" 405 + source = { registry = "https://pypi.org/simple" } 406 + resolution-markers = [ 407 + "python_full_version < '3.10'", 408 + ] 409 + dependencies = [ 410 + { name = "mdurl", marker = "python_full_version < '3.10'" }, 411 + ] 412 + sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596, upload-time = "2023-06-03T06:41:14.443Z" } 413 + 
wheels = [ 414 + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" }, 415 + ] 416 + 417 + [[package]] 418 + name = "markdown-it-py" 419 + version = "4.0.0" 420 + source = { registry = "https://pypi.org/simple" } 421 + resolution-markers = [ 422 + "python_full_version >= '3.12'", 423 + "python_full_version == '3.11.*'", 424 + "python_full_version == '3.10.*'", 425 + ] 426 + dependencies = [ 427 + { name = "mdurl", marker = "python_full_version >= '3.10'" }, 428 + ] 429 + sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } 430 + wheels = [ 431 + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, 432 + ] 433 + 434 + [[package]] 435 + name = "mdurl" 436 + version = "0.1.2" 437 + source = { registry = "https://pypi.org/simple" } 438 + sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } 439 + wheels = [ 440 + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = 
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, 398 441 ] 399 442 400 443 [[package]] ··· 843 886 ] 844 887 845 888 [[package]] 889 + name = "pygments" 890 + version = "2.19.2" 891 + source = { registry = "https://pypi.org/simple" } 892 + sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } 893 + wheels = [ 894 + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, 895 + ] 896 + 897 + [[package]] 846 898 name = "python-dateutil" 847 899 version = "2.9.0.post0" 848 900 source = { registry = "https://pypi.org/simple" } ··· 861 913 sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } 862 914 wheels = [ 863 915 { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, 916 + ] 917 + 918 + [[package]] 919 + name = "rich" 920 + version = "14.1.0" 921 + source = { registry = "https://pypi.org/simple" } 922 + dependencies = [ 923 + { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" 
}, 924 + { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, 925 + { name = "pygments" }, 926 + ] 927 + sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } 928 + wheels = [ 929 + { url = "https://files.pythonhosted.org/packages/e3/30/3c4d035596d3cf444529e0b2953ad0466f6049528a879d27534700580395/rich-14.1.0-py3-none-any.whl", hash = "sha256:536f5f1785986d6dbdea3c75205c473f970777b4a0d6c6dd1b696aa05a3fa04f", size = 243368, upload-time = "2025-07-25T07:32:56.73Z" }, 864 930 ] 865 931 866 932 [[package]]