this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Tweet source extraction and new migration system (#24)

Closes #12 and #23

authored by

Simon Willison and committed by
GitHub
c9295233 619f724a

+163 -19
+49
tests/test_migrations.py
import sqlite_utils
from click.testing import CliRunner

from twitter_to_sqlite import cli, migrations

# Imported for its side effect of making the pytest fixture available here.
from .test_import import zip_contents_path  # noqa: F401


def test_no_migrations_on_first_run(tmpdir, zip_contents_path):
    """Migrations are skipped for a brand new database and applied on re-run."""
    output = str(tmpdir / "output.db")
    args = ["import", output, str(zip_contents_path / "follower.js")]
    result = CliRunner().invoke(cli.cli, args)
    assert 0 == result.exit_code, result.stdout
    db = sqlite_utils.Database(output)
    # First run: the database had no tables, so no "migrations" table yet
    assert ["archive_follower"] == db.table_names()
    # Running the command a second time should run the migrations
    result = CliRunner().invoke(cli.cli, args)
    # Check the exit code so a failing migration is reported directly rather
    # than as a mismatch in the table names below
    assert 0 == result.exit_code, result.stdout
    db = sqlite_utils.Database(output)
    assert {"archive_follower", "migrations"} == set(db.table_names())


def test_convert_source_column():
    """HTML ``source`` values are moved to ``sources`` and replaced by its key."""
    db = sqlite_utils.Database(memory=True)
    db["tweets"].insert_all(
        [
            {"id": 1, "source": '<a href="URL">NAME</a>'},
            {"id": 2, "source": '<a href="URL2">NAME2</a>'},
            # Already-converted value: must be left untouched by the migration
            {"id": 3, "source": "d3c1d39c57fecfc09202f20ea5e2db30262029fd"},
        ],
        pk="id",
    )
    migrations.convert_source_column(db)
    assert [
        {
            "id": "d3c1d39c57fecfc09202f20ea5e2db30262029fd",
            "url": "URL",
            "name": "NAME",
        },
        {
            "id": "000e4c4db71278018fb8c322f070d051e76885b1",
            "url": "URL2",
            "name": "NAME2",
        },
    ] == list(db["sources"].rows)
    assert [
        {"id": 1, "source": "d3c1d39c57fecfc09202f20ea5e2db30262029fd"},
        {"id": 2, "source": "000e4c4db71278018fb8c322f070d051e76885b1"},
        {"id": 3, "source": "d3c1d39c57fecfc09202f20ea5e2db30262029fd"},
    ] == list(db["tweets"].rows)
+32 -5
tests/test_save_tweets.py
··· 20 20 21 21 def test_tables(db): 22 22 assert { 23 + "sources", 23 24 "users_fts_idx", 24 25 "users_fts_data", 25 26 "tweets_fts", ··· 182 183 "retweeted_status": None, 183 184 "quoted_status": None, 184 185 "place": None, 186 + "source": "e6528b505bcfd811fdd40ff2d46665dbccba2024", 185 187 "truncated": 0, 186 188 "display_text_range": "[0, 139]", 187 - "source": '<a href="http://itunes.apple.com/us/app/twitter/id409789998?mt=12" rel="nofollow">Twitter for Mac</a>', 188 189 "in_reply_to_status_id": None, 189 190 "in_reply_to_user_id": None, 190 191 "in_reply_to_screen_name": None, ··· 207 208 "retweeted_status": None, 208 209 "quoted_status": 861696799362478100, 209 210 "place": None, 211 + "source": "1f89d6a41b1505a3071169f8d0d028ba9ad6f952", 210 212 "truncated": 0, 211 213 "display_text_range": "[0, 239]", 212 - "source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 213 214 "in_reply_to_status_id": None, 214 215 "in_reply_to_user_id": None, 215 216 "in_reply_to_screen_name": None, ··· 232 233 "retweeted_status": None, 233 234 "quoted_status": None, 234 235 "place": "01a9a39529b27f36", 236 + "source": "95f3aaaddaa45937ac94765e0ddb68ba2be92d20", 235 237 "truncated": 0, 236 238 "display_text_range": "[45, 262]", 237 - "source": '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 238 239 "in_reply_to_status_id": "1169079390577320000", 239 240 "in_reply_to_user_id": "82016165", 240 241 "in_reply_to_screen_name": "scientiffic", ··· 257 258 "retweeted_status": None, 258 259 "quoted_status": None, 259 260 "place": None, 261 + "source": "942cfc2bf9f290ddbe3d78f1907dc084a00ed23f", 260 262 "truncated": 0, 261 263 "display_text_range": "[0, 235]", 262 - "source": '<a href="http://www.voxmedia.com" rel="nofollow">Vox Media</a>', 263 264 "in_reply_to_status_id": None, 264 265 "in_reply_to_user_id": None, 265 266 "in_reply_to_screen_name": None, ··· 282 283 "retweeted_status": 1169242008432644000, 283 284 
"quoted_status": None, 284 285 "place": None, 286 + "source": "95f3aaaddaa45937ac94765e0ddb68ba2be92d20", 285 287 "truncated": 0, 286 288 "display_text_range": "[0, 143]", 287 - "source": '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 288 289 "in_reply_to_status_id": None, 289 290 "in_reply_to_user_id": None, 290 291 "in_reply_to_screen_name": None, ··· 300 301 "lang": "en", 301 302 }, 302 303 ] == tweet_rows 304 + 305 + 306 + def test_sources(db): 307 + source_rows = list(db["sources"].rows) 308 + assert [ 309 + { 310 + "id": "942cfc2bf9f290ddbe3d78f1907dc084a00ed23f", 311 + "name": "Vox Media", 312 + "url": "http://www.voxmedia.com", 313 + }, 314 + { 315 + "id": "95f3aaaddaa45937ac94765e0ddb68ba2be92d20", 316 + "name": "Twitter for iPhone", 317 + "url": "http://twitter.com/download/iphone", 318 + }, 319 + { 320 + "id": "1f89d6a41b1505a3071169f8d0d028ba9ad6f952", 321 + "name": "Twitter Web App", 322 + "url": "https://mobile.twitter.com", 323 + }, 324 + { 325 + "id": "e6528b505bcfd811fdd40ff2d46665dbccba2024", 326 + "name": "Twitter for Mac", 327 + "url": "http://itunes.apple.com/us/app/twitter/id409789998?mt=12", 328 + }, 329 + ] == source_rows 303 330 304 331 305 332 def test_places(db):
+15 -13
twitter_to_sqlite/cli.py
··· 5 5 import time 6 6 7 7 import click 8 - import sqlite_utils 8 + 9 9 from twitter_to_sqlite import archive 10 10 from twitter_to_sqlite import utils 11 11 ··· 102 102 "Save followers for specified user (defaults to authenticated user)" 103 103 auth = json.load(open(auth)) 104 104 session = utils.session_for_auth(auth) 105 - db = sqlite_utils.Database(db_path) 105 + db = utils.open_database(db_path) 106 106 fetched = [] 107 107 # Get the follower count, so we can have a progress bar 108 108 count = 0 ··· 152 152 "Save tweets favorited by specified user" 153 153 auth = json.load(open(auth)) 154 154 session = utils.session_for_auth(auth) 155 - db = sqlite_utils.Database(db_path) 155 + db = utils.open_database(db_path) 156 156 profile = utils.get_profile(db, session, user_id, screen_name) 157 157 with click.progressbar( 158 158 utils.fetch_favorites(session, user_id, screen_name, stop_after), ··· 193 193 raise click.ClickException("Use either --since or --since_id, not both") 194 194 auth = json.load(open(auth)) 195 195 session = utils.session_for_auth(auth) 196 - db = sqlite_utils.Database(db_path) 196 + db = utils.open_database(db_path) 197 197 profile = utils.get_profile(db, session, user_id, screen_name) 198 198 expected_length = profile["statuses_count"] 199 199 ··· 209 209 pass 210 210 211 211 with click.progressbar( 212 - utils.fetch_user_timeline(session, user_id, screen_name, stop_after, since_id=since_id), 212 + utils.fetch_user_timeline( 213 + session, user_id, screen_name, stop_after, since_id=since_id 214 + ), 213 215 length=expected_length, 214 216 label="Importing tweets", 215 217 show_pos=True, ··· 253 255 raise click.ClickException("Use either --since or --since_id, not both") 254 256 auth = json.load(open(auth)) 255 257 session = utils.session_for_auth(auth) 256 - db = sqlite_utils.Database(db_path) 258 + db = utils.open_database(db_path) 257 259 profile = utils.get_profile(db, session) 258 260 expected_length = 800 259 261 if since and 
db["timeline_tweets"].exists: ··· 310 312 "Fetch user accounts" 311 313 auth = json.load(open(auth)) 312 314 session = utils.session_for_auth(auth) 313 - db = sqlite_utils.Database(db_path) 315 + db = utils.open_database(db_path) 314 316 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql) 315 317 for batch in utils.fetch_user_batches(session, identifiers, ids): 316 318 utils.save_users(db, batch) ··· 338 340 "Fetch tweets by their IDs" 339 341 auth = json.load(open(auth)) 340 342 session = utils.session_for_auth(auth) 341 - db = sqlite_utils.Database(db_path) 343 + db = utils.open_database(db_path) 342 344 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql) 343 345 if skip_existing: 344 346 existing_ids = set( ··· 381 383 "Fetch lists - accepts one or more screen_name/list_slug identifiers" 382 384 auth = json.load(open(auth)) 383 385 session = utils.session_for_auth(auth) 384 - db = sqlite_utils.Database(db_path) 386 + db = utils.open_database(db_path) 385 387 for identifier in identifiers: 386 388 utils.fetch_and_save_list(db, session, identifier, ids) 387 389 ··· 477 479 "Experimental: Save tweets matching these keywords in real-time" 478 480 auth = json.load(open(auth)) 479 481 session = utils.session_for_auth(auth) 480 - db = sqlite_utils.Database(db_path) 482 + db = utils.open_database(db_path) 481 483 for tweet in utils.stream_filter(session, track=track): 482 484 if verbose: 483 485 print(json.dumps(tweet, indent=2)) ··· 505 507 "Experimental: Follow these Twitter users and save tweets in real-time" 506 508 auth = json.load(open(auth)) 507 509 session = utils.session_for_auth(auth) 508 - db = sqlite_utils.Database(db_path) 510 + db = utils.open_database(db_path) 509 511 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql) 510 512 # Make sure we have saved these users to the database 511 513 for batch in utils.fetch_user_batches(session, identifiers, ids): ··· 528 530 ): 529 531 auth = 
json.load(open(auth)) 530 532 session = utils.session_for_auth(auth) 531 - db = sqlite_utils.Database(db_path) 533 + db = utils.open_database(db_path) 532 534 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql) 533 535 for identifier in identifiers: 534 536 # Make sure this user is saved ··· 568 570 Import data from a Twitter exported archive. Input can be the path to a zip 569 571 file, a directory full of .js files or one or more direct .js files. 570 572 """ 571 - db = sqlite_utils.Database(db_path) 573 + db = utils.open_database(db_path) 572 574 for filepath in paths: 573 575 path = pathlib.Path(filepath) 574 576 if path.suffix == ".zip":
+22
twitter_to_sqlite/migrations.py
from .utils import extract_and_save_source

# Registered migration functions, in the order they were defined.
MIGRATIONS = []


def migration(fn):
    """Register ``fn`` in ``MIGRATIONS`` and return it unchanged.

    Intended for use as a decorator on functions that take a single ``db``
    argument.
    """
    MIGRATIONS.append(fn)
    return fn


@migration
def convert_source_column(db):
    """Convert HTML ``tweets.source`` values into references to ``sources``.

    Every ``tweets`` row whose ``source`` starts with ``<`` is passed to
    ``extract_and_save_source`` and the column is overwritten with the value
    that call returns. The column is then indexed and given a foreign key to
    ``sources.id``.

    Does nothing if the database has no ``tweets`` table. It is safe to run
    against a database whose ``tweets`` table was already created with the
    foreign key in place, and against one where no row needed converting.
    """
    if "tweets" not in db.table_names():
        return
    tweets = db["tweets"]
    # Fetch everything up front: the loop below writes to the same table
    html_rows = db.conn.execute(
        "select id, source from tweets where source like '<%'"
    ).fetchall()
    for tweet_id, source in html_rows:
        tweets.update(tweet_id, {"source": extract_and_save_source(db, source)})
    # If no row needed converting the sources table may not exist yet, and
    # the foreign key below needs a table to point at
    if "sources" not in db.table_names():
        db["sources"].create({"id": str, "name": str, "url": str}, pk="id")
    tweets.create_index(["source"], if_not_exists=True)
    # sqlite-utils refuses to add a foreign key that is already defined, which
    # is the case when tweets was created with the constraint from the start
    if not any(fk.column == "source" for fk in tweets.foreign_keys):
        tweets.add_foreign_key("source", "sources", "id")
+45 -1
twitter_to_sqlite/utils.py
··· 2 2 import html 3 3 import json 4 4 import pathlib 5 + import re 5 6 import time 6 7 import urllib.parse 7 8 import zipfile 8 9 9 10 from dateutil import parser 10 11 from requests_oauthlib import OAuth1Session 12 + import sqlite_utils 11 13 12 14 # Twitter API error codes 13 15 RATE_LIMIT_ERROR_CODE = 88 16 + 17 + source_re = re.compile('<a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>') 18 + 19 + 20 + def open_database(db_path): 21 + db = sqlite_utils.Database(db_path) 22 + # Only run migrations if this is an existing DB (has tables) 23 + if db.tables: 24 + migrate(db) 25 + return db 26 + 27 + 28 + def migrate(db): 29 + from twitter_to_sqlite.migrations import MIGRATIONS 30 + 31 + if "migrations" not in db.table_names(): 32 + db["migrations"].create({"name": str, "applied": str}, pk="name") 33 + applied_migrations = { 34 + m[0] for m in db.conn.execute("select name from migrations").fetchall() 35 + } 36 + for migration in MIGRATIONS: 37 + name = migration.__name__ 38 + if name in applied_migrations: 39 + continue 40 + migration(db) 41 + db["migrations"].insert( 42 + {"name": name, "applied": datetime.datetime.utcnow().isoformat()} 43 + ) 14 44 15 45 16 46 def session_for_auth(auth): ··· 186 216 table_names = set(db.table_names()) 187 217 if "places" not in table_names: 188 218 db["places"].create({"id": str}, pk="id") 219 + if "sources" not in table_names: 220 + db["sources"].create({"id": str, "name": str, "url": str}, pk="id") 189 221 if "users" not in table_names: 190 222 db["users"].create( 191 223 { ··· 210 242 "retweeted_status": int, 211 243 "quoted_status": int, 212 244 "place": str, 245 + "source": str, 213 246 }, 214 247 pk="id", 215 - foreign_keys=(("user", "users", "id"), ("place", "places", "id")), 248 + foreign_keys=( 249 + ("user", "users", "id"), 250 + ("place", "places", "id"), 251 + ("source", "sources", "id"), 252 + ), 216 253 ) 217 254 db["tweets"].enable_fts(["full_text"], create_triggers=True) 218 255 
db["tweets"].add_foreign_key("retweeted_status", "tweets") ··· 235 272 user = tweet.pop("user") 236 273 transform_user(user) 237 274 tweet["user"] = user["id"] 275 + tweet["source"] = extract_and_save_source(db, tweet["source"]) 238 276 if tweet.get("place"): 239 277 db["places"].upsert(tweet["place"], pk="id", alter=True) 240 278 tweet["place"] = tweet["place"]["id"] ··· 472 510 for zi in zf.filelist: 473 511 if zi.filename.endswith(".js"): 474 512 yield zi.filename, zf.open(zi.filename).read() 513 + 514 + 515 + def extract_and_save_source(db, source): 516 + m = source_re.match(source) 517 + details = m.groupdict() 518 + return db["sources"].upsert(details, hash_id="id").last_pk