import command now works on files and directories, closes #22 · alice.mosphere.at/twitter-to-sqlite@8d60719

+11 -1

README.md

··· 240 240 241 241 This command does not populate any of the regular tables, since Twitter's export data does not exactly match the schema returned by the Twitter API. 242 242 243 - It will delete and recreate all of your `archive_*` tables every time you run it. If this is not what you want, run the command against a new SQLite database file name rather than running it against one that already exists. 243 + It will delete and recreate the corresponding `archive_*` tables every time you run it. If this is not what you want, run the command against a new SQLite database file name rather than running it against one that already exists. 244 + 245 + If you have already decompressed your archive, you can run this against the directory that you decompressed it to: 246 + 247 + $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2/ 248 + 249 + You can also run it against one or more specific files within that folder. For example, to import just the follower.js and following.js files: 250 + 251 + $ twitter-to-sqlite import archive.db \ 252 + ~/Downloads/twitter-2019-06-25-b31f2/follower.js \ 253 + ~/Downloads/twitter-2019-06-25-b31f2/following.js 244 254 245 255 You may want to use other commands to populate tables based on data from the archive. For example, to retrieve full API versions of each of the tweets you have favourited in your archive, you could run the following: 246 256

+1 -1

tests/test_create_zip.py

··· 4 4 5 5 6 6 def test_create_zip(): 7 - zf = create_zip() 7 + zf = create_zip(pathlib.Path(__file__).parent / "zip_contents") 8 8 assert {"account.js", "saved-search.js", "following.js", "follower.js"} == { 9 9 f.filename for f in zf.filelist 10 10 }

+65 -12

tests/test_import.py

··· 1 1 import io 2 + import pathlib 2 3 3 4 import pytest 4 5 import sqlite_utils ··· 9 10 10 11 11 12 @pytest.fixture 12 - def import_test_dir(tmpdir): 13 + def zip_contents_path(): 14 + return pathlib.Path(__file__).parent / "zip_contents" 15 + 16 + 17 + @pytest.fixture 18 + def import_test_zip(tmpdir, zip_contents_path): 13 19 archive = str(tmpdir / "archive.zip") 14 20 buf = io.BytesIO() 15 - zf = create_zip(buf) 21 + zf = create_zip(zip_contents_path, buf) 16 22 zf.close() 17 23 open(archive, "wb").write(buf.getbuffer()) 18 24 return tmpdir, archive 19 25 20 26 21 - def test_cli_import(import_test_dir): 22 - tmpdir, archive = import_test_dir 27 + def test_create_zip(zip_contents_path): 28 + zf = create_zip(zip_contents_path) 29 + assert {"account.js", "saved-search.js", "following.js", "follower.js"} == { 30 + f.filename for f in zf.filelist 31 + } 32 + 33 + 34 + def test_cli_import_zip_file(import_test_zip): 35 + tmpdir, archive = import_test_zip 23 36 output = str(tmpdir / "output.db") 24 37 result = CliRunner().invoke(cli.cli, ["import", output, archive]) 25 - assert 0 == result.exit_code, result.stderr 38 + assert 0 == result.exit_code, result.stdout 39 + db = sqlite_utils.Database(output) 40 + assert_imported_db(db) 41 + 42 + 43 + def test_cli_import_folder(tmpdir, zip_contents_path): 44 + output = str(tmpdir / "output.db") 45 + result = CliRunner().invoke(cli.cli, ["import", output, str(zip_contents_path)]) 46 + assert 0 == result.exit_code, result.stdout 47 + db = sqlite_utils.Database(output) 48 + assert_imported_db(db) 49 + 50 + 51 + def test_cli_import_specific_files(tmpdir, zip_contents_path): 52 + output = str(tmpdir / "output.db") 53 + result = CliRunner().invoke( 54 + cli.cli, 55 + [ 56 + "import", 57 + output, 58 + str(zip_contents_path / "follower.js"), 59 + str(zip_contents_path / "following.js"), 60 + ], 61 + ) 62 + assert 0 == result.exit_code, result.stdout 26 63 db = sqlite_utils.Database(output) 64 + # Should just have two tables 65 + assert ["archive_follower", "archive_following"] == db.table_names() 66 + 67 + 68 + def assert_imported_db(db): 27 69 assert { 28 70 "archive_follower", 29 71 "archive_saved_search", ··· 56 98 ] == list(db["archive_account"].rows) 57 99 58 100 59 - def test_deletes_existing_archive_tables(import_test_dir): 60 - tmpdir, archive = import_test_dir 101 + def test_deletes_existing_archive_tables(import_test_zip): 102 + tmpdir, archive = import_test_zip 61 103 output = str(tmpdir / "output.db") 62 104 db = sqlite_utils.Database(output) 63 105 # Create a table 64 - db["archive_foo"].create({"id": int}) 65 - assert ["archive_foo"] == db.table_names() 66 - result = CliRunner().invoke(cli.cli, ["import", output, archive]) 67 - # That table should have been deleted 68 - assert "archive_foo" not in db.table_names() 106 + db["archive_follower"].create({"id": int}) 107 + db["archive_follower"].insert({"id": 1}) 108 + assert ["archive_follower"] == db.table_names() 109 + assert [{"id": 1}] == list(db["archive_follower"].rows) 110 + assert ( 111 + "CREATE TABLE [archive_follower] (\n [id] INTEGER\n)" 112 + == db["archive_follower"].schema 113 + ) 114 + # Running the import should wipe and recreate that table 115 + CliRunner().invoke(cli.cli, ["import", output, archive]) 116 + # That table should have been deleted and recreated 117 + assert ( 118 + "CREATE TABLE [archive_follower] (\n [accountId] TEXT PRIMARY KEY\n)" 119 + == db["archive_follower"].schema 120 + ) 121 + assert 2 == db["archive_follower"].count

+1 -3

tests/utils.py

··· 1 1 import io 2 - import pathlib 3 2 import zipfile 4 3 5 4 6 - def create_zip(buf=None): 5 + def create_zip(path, buf=None): 7 6 if buf is None: 8 7 buf = io.BytesIO() 9 - path = pathlib.Path(__file__).parent / "zip_contents" 10 8 zf = zipfile.ZipFile(buf, "w") 11 9 for filepath in path.glob("**/*"): 12 10 if filepath.is_file():

+21

twitter_to_sqlite/archive.py

··· 201 201 bits = url.split("/") 202 202 lists.append({"screen_name": bits[-3], "list_slug": bits[-1]}) 203 203 return lists 204 + 205 + 206 + def import_from_file(db, filename, content): 207 + assert filename.endswith(".js"), "{} does not end with .js".format(filename) 208 + existing_tables = set(db.table_names()) 209 + filename = filename[: -len(".js")] 210 + if filename not in transformers: 211 + print("{}: not yet implemented".format(filename)) 212 + return 213 + transformer, pk = transformers.get(filename) 214 + data = extract_json(content) 215 + to_insert = transformer(data) 216 + for table, rows in to_insert.items(): 217 + table_name = "archive_{}".format(table.replace("-", "_")) 218 + # Drop and re-create if it already exists 219 + if table_name in existing_tables: 220 + db[table_name].drop() 221 + if pk is not None: 222 + db[table_name].upsert_all(rows, pk=pk) 223 + else: 224 + db[table_name].upsert_all(rows, hash_id="pk")

+23 -25

twitter_to_sqlite/cli.py

··· 1 1 import datetime 2 2 import json 3 3 import os 4 + import pathlib 4 5 import time 5 6 6 7 import click ··· 509 510 @cli.command(name="import") 510 511 @click.argument( 511 512 "db_path", 512 - type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), 513 + type=click.Path(file_okay=True, dir_okay=True, allow_dash=False), 513 514 required=True, 514 515 ) 515 516 @click.argument( 516 - "archive_path", 517 - type=click.Path(file_okay=True, dir_okay=False, allow_dash=False, exists=True), 517 + "paths", 518 + type=click.Path(file_okay=True, dir_okay=True, allow_dash=False, exists=True), 518 519 required=True, 520 + nargs=-1, 519 521 ) 520 - def import_(db_path, archive_path): 521 - "Import data from a Twitter exported archive" 522 + def import_(db_path, paths): 523 + """ 524 + Import data from a Twitter exported archive. Input can be the path to a zip 525 + file, a directory full of .js files or one or more direct .js files. 526 + """ 522 527 db = sqlite_utils.Database(db_path) 523 - # Drop archive-* tables that already exist 524 - for table in db.tables: 525 - if table.name.startswith("archive_"): 526 - table.drop() 527 - for filename, content in utils.read_archive_js(archive_path): 528 - filename = filename[: -len(".js")] 529 - if filename not in archive.transformers: 530 - print("{}: not yet implemented".format(filename)) 531 - continue 532 - transformer, pk = archive.transformers.get(filename) 533 - data = archive.extract_json(content) 534 - to_insert = transformer(data) 535 - for table, rows in to_insert.items(): 536 - table_name = "archive_{}".format(table.replace("-", "_")) 537 - if pk is not None: 538 - db[table_name].upsert_all(rows, pk=pk) 539 - else: 540 - db[table_name].upsert_all(rows, hash_id="pk") 541 - count = db[table_name].count 542 - print("{}: {} item{}".format(table_name, count, "s" if count == 1 else "")) 528 + for filepath in paths: 529 + path = pathlib.Path(filepath) 530 + if path.suffix == ".zip": 531 + for filename, content in utils.read_archive_js(filepath): 532 + archive.import_from_file(db, filename, content) 533 + elif path.is_dir(): 534 + # Import every .js file in this directory 535 + for filepath in path.glob("*.js"): 536 + archive.import_from_file(db, filepath.name, open(filepath, "rb").read()) 537 + elif path.suffix == ".js": 538 + archive.import_from_file(db, path.name, open(path, "rb").read()) 539 + else: 540 + raise click.ClickException("Path must be a .js or .zip file or a directory")

+8 -2

twitter_to_sqlite/utils.py

··· 97 97 # Rate limit 75/15 mins = 5/minute = every 12 seconds 98 98 sleep = 12 99 99 yield from fetch_timeline( 100 - session, "https://api.twitter.com/1.1/favorites/list.json", args, sleep=sleep, stop_after=stop_after 100 + session, 101 + "https://api.twitter.com/1.1/favorites/list.json", 102 + args, 103 + sleep=sleep, 104 + stop_after=stop_after, 101 105 ) 102 106 103 107 ··· 135 139 136 140 137 141 def transform_tweet(tweet): 138 - tweet["full_text"] = html.unescape(expand_entities(tweet["full_text"], tweet.pop("entities"))) 142 + tweet["full_text"] = html.unescape( 143 + expand_entities(tweet["full_text"], tweet.pop("entities")) 144 + ) 139 145 to_remove = [k for k in tweet if k.endswith("_str")] + [ 140 146 "quoted_status_id", 141 147 "quoted_status_permalink",

Configure Feed

Configure Feed