this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

twitter-to-sqlite import command, closes #4

authored by

Simon Willison and committed by
GitHub
2019ee90 436a170d

+389 -10
+16
README.md
··· 215 215 --sql="select distinct followed_id from following" \ 216 216 --ids 217 217 218 + ## Importing data from your Twitter archive 219 + 220 + You can request an archive of your Twitter data by [following these instructions](https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive). 221 + 222 + Twitter will send you a link to download a `.zip` file. You can import the contents of that file into a set of tables (each beginning with the `archive-` prefix) using the `import` command: 223 + 224 + $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2.zip 225 + 226 + This command does not populate any of the regular tables, since Twitter's export data does not exactly match the schema returned by the Twitter API. 227 + 228 + You may want to use other commands to populate tables based on data from the archive. For example, to retrieve full API versions of each of the tweets you have favourited in your archive, you could run the following: 229 + 230 + $ twitter-to-sqlite statuses-lookup archive.db \ 231 + --sql='select tweetId from [archive-like]' \ 232 + --skip-existing 233 + 218 234 ## Design notes 219 235 220 236 * Tweet IDs are stored as integers, to afford sorting by ID in a sensible way
tests/__init__.py

This is a binary file and will not be displayed.

+10
tests/test_create_zip.py
··· 1 + import pathlib 2 + 3 + from .utils import create_zip 4 + 5 + 6 + def test_create_zip(): 7 + zf = create_zip() 8 + assert {"account.js", "saved-search.js", "following.js", "follower.js"} == { 9 + f.filename for f in zf.filelist 10 + }
+51
tests/test_import.py
··· 1 + import io 2 + 3 + import pytest 4 + import sqlite_utils 5 + from click.testing import CliRunner 6 + from twitter_to_sqlite import cli 7 + 8 + from .utils import create_zip 9 + 10 + 11 + def test_cli_import(tmpdir): 12 + archive = str(tmpdir / "archive.zip") 13 + output = str(tmpdir / "output.db") 14 + buf = io.BytesIO() 15 + zf = create_zip(buf) 16 + zf.close() 17 + open(archive, "wb").write(buf.getbuffer()) 18 + result = CliRunner().invoke(cli.cli, ["import", output, archive]) 19 + assert 0 == result.exit_code, result.stderr 20 + db = sqlite_utils.Database(output) 21 + assert { 22 + "archive-follower", 23 + "archive-saved-search", 24 + "archive-account", 25 + "archive-following", 26 + } == set(db.table_names()) 27 + 28 + assert [{"accountId": "73747798"}, {"accountId": "386025404"}] == list( 29 + db["archive-follower"].rows 30 + ) 31 + assert [{"accountId": "547842573"}, {"accountId": "12158"}] == list( 32 + db["archive-following"].rows 33 + ) 34 + 35 + assert [ 36 + {"savedSearchId": "42214", "query": "simonw"}, 37 + {"savedSearchId": "55814", "query": "django"}, 38 + ] == list(db["archive-saved-search"].rows) 39 + dd = list(db["archive-account"].rows) 40 + assert [ 41 + { 42 + "pk": "c4e32e91742df2331ef3ad1e481d1a64d781183a", 43 + "phoneNumber": "+15555555555", 44 + "email": "swillison@example.com", 45 + "createdVia": "web", 46 + "username": "simonw", 47 + "accountId": "12497", 48 + "createdAt": "2006-11-15T13:18:50.000Z", 49 + "accountDisplayName": "Simon Willison", 50 + } 51 + ] == dd
+4 -3
tests/test_save_tweets.py
··· 1 - from twitter_to_sqlite import utils 1 + import json 2 + import pathlib 3 + 2 4 import pytest 3 - import pathlib 4 5 import sqlite_utils 5 - import json 6 + from twitter_to_sqlite import utils 6 7 7 8 8 9 @pytest.fixture
+14
tests/utils.py
··· 1 + import io 2 + import pathlib 3 + import zipfile 4 + 5 + 6 + def create_zip(buf=None): 7 + if buf is None: 8 + buf = io.BytesIO() 9 + path = pathlib.Path(__file__).parent / "zip_contents" 10 + zf = zipfile.ZipFile(buf, "w") 11 + for filepath in path.glob("**/*"): 12 + if filepath.is_file(): 13 + zf.write(filepath, str(filepath.relative_to(path))) 14 + return zf
+11
tests/zip_contents/account.js
··· 1 + window.YTD.account.part0 = [ { 2 + "account" : { 3 + "phoneNumber" : "+15555555555", 4 + "email" : "swillison@example.com", 5 + "createdVia" : "web", 6 + "username" : "simonw", 7 + "accountId" : "12497", 8 + "createdAt" : "2006-11-15T13:18:50.000Z", 9 + "accountDisplayName" : "Simon Willison" 10 + } 11 + } ]
+9
tests/zip_contents/follower.js
··· 1 + window.YTD.follower.part0 = [ { 2 + "follower" : { 3 + "accountId" : "73747798" 4 + } 5 + }, { 6 + "follower" : { 7 + "accountId" : "386025404" 8 + } 9 + } ]
+9
tests/zip_contents/following.js
··· 1 + window.YTD.following.part0 = [ { 2 + "following" : { 3 + "accountId" : "547842573" 4 + } 5 + }, { 6 + "following" : { 7 + "accountId" : "12158" 8 + } 9 + } ]
+11
tests/zip_contents/saved-search.js
··· 1 + window.YTD.saved_search.part0 = [ { 2 + "savedSearch" : { 3 + "savedSearchId" : "42214", 4 + "query" : "simonw" 5 + } 6 + }, { 7 + "savedSearch" : { 8 + "savedSearchId" : "55814", 9 + "query" : "django" 10 + } 11 + } ]
+203
twitter_to_sqlite/archive.py
# Utilities for dealing with Twitter archives
import json

# Maps a filename (without its .js extension) to a (callable, pk) pair.
# The callable takes the decoded JSON from that file and returns a
# dictionary of tables that should be created: {"table": [rows-to-upsert]}
# (fixed typos in the original comment: "tabe", and "triples" for a 2-tuple).
transformers = {}


def register(filename, each, pk=None):
    "Register a transformer that pulls item[each] out of every list element."

    def callback(data):
        return {filename: [item.get(each) for item in data]}

    transformers[filename] = (callback, pk)


def register_each(filename, pk=None):
    "Decorator: the decorated fn transforms one list element into one row."

    def inner(fn):
        def callback(data):
            return {filename: [fn(item) for item in data]}

        transformers[filename] = (callback, pk)

    return inner


def register_multi(filename):
    "Decorator: the decorated fn maps the whole document to {table: rows}; no pk."

    def inner(fn):
        transformers[filename] = (fn, None)

    return inner


# register_all was a byte-for-byte duplicate of register_multi; keep one
# implementation and alias the second name so existing call sites still work.
register_all = register_multi


def extract_json(contents):
    """Strip the JavaScript assignment wrapper and parse the remaining JSON.

    Archive files look like:
        window.YTD.account_creation_ip.part0 = [ ... data ... ]
    """
    contents = contents.strip()
    if contents.startswith(b"window."):
        contents = contents.split(b" = ", 1)[1]
    return json.loads(contents)


register("account-creation-ip", each="accountCreationIp")
register("account-suspension", each="accountSuspension")
register("account-timezone", each="accountTimezone")
register("account", each="account")


@register_each("ad-engagements")
def ad_engagements(item):
    return item["ad"]["adsUserData"]["adEngagements"]


@register_each("ad-impressions")
def ad_impressions(item):
    return item["ad"]["adsUserData"]["adImpressions"]


@register_each("ad-mobile-conversions-attributed")
def ad_mobile_conversions_attributed(item):
    return item["ad"]["adsUserData"]["attributedMobileAppConversions"]


@register_each("ad-mobile-conversions-unattributed")
def ad_mobile_conversions_unattributed(item):
    return item["ad"]["adsUserData"]["unattributedMobileAppConversions"]


@register_each("ad-online-conversions-attributed")
def ad_online_conversions_attributed(item):
    return item["ad"]["adsUserData"]["attributedOnlineConversions"]


@register_each("ad-online-conversions-unattributed")
def ad_online_conversions_unattributed(item):
    return item["ad"]["adsUserData"]["unattributedOnlineConversions"]


@register_each("ageinfo")
def ageinfo(item):
    return item["ageMeta"]["ageInfo"]


register("block", each="blocking", pk="accountId")
register("connected-applications", each="connectedApplication", pk="id")
# register("contact", ...)
94 + register("direct-message-group-headers", each="dmConversation", pk="conversationId") 95 + register("direct-message-group", each="dmConversation", pk="conversationId") 96 + register("direct-message-headers", each="dmConversation", pk="conversationId") 97 + # pk for this one is NOT set, because there are dupes: 98 + # TODO: These actually do warrant separate tables: 99 + register("direct-message", each="dmConversation") 100 + 101 + register("email-address-change", each="emailAddressChange") 102 + register("follower", each="follower", pk="accountId") 103 + register("following", each="following", pk="accountId") 104 + register("ip-audit", each="ipAudit") 105 + register("like", each="like", pk="tweetId") 106 + 107 + 108 + @register_all("lists-created") 109 + def lists_created(data): 110 + return {"lists-created": _list_from_common(data)} 111 + 112 + 113 + @register_all("lists-member") 114 + def lists_member(data): 115 + return {"lists-member": _list_from_common(data)} 116 + 117 + 118 + @register_all("lists-subscribed") 119 + def lists_subscribed(data): 120 + return {"lists-subscribed": _list_from_common(data)} 121 + 122 + 123 + register("moment", each="moment", pk="momentId") 124 + # register("mute", ...) 
@register_all("ni-devices")
def ni_devices(data):
    """Flatten niDeviceResponse blocks into one row per device.

    Renamed from `lists_created`, which was a copy-paste slip that shadowed
    the lists-created transformer's function name defined earlier.
    """
    devices = []
    for block in data:
        block = block["niDeviceResponse"]
        # Each response holds a single {category: details} entry
        category = list(block.keys())[0]
        details = list(block.values())[0]
        details["category"] = category
        devices.append(details)
    # NOTE(review): the "ne-devices" table key does not match the
    # "ni-devices" filename — looks like a typo, but it is kept as-is here
    # because changing it would rename the output table for existing users.
    return {"ne-devices": devices}


# Skipped all the periscope- stuff for the moment


@register_multi("personalization")
def personalization(data):
    """Split personalization.js into several narrow tables."""
    data = data[0]
    # As a multi, we get to return a dict of
    # table names => list of objects to insert
    to_create = {}
    demographics = data["p13nData"]["demographics"]
    to_create["personalization-demographics-languages"] = demographics["languages"]
    to_create["personalization-demographics-genderInfo"] = [demographics["genderInfo"]]
    to_create["personalization-interests"] = data["p13nData"]["interests"]["interests"]
    to_create["personalization-partnerInterests"] = data["p13nData"]["interests"][
        "partnerInterests"
    ]
    to_create["personalization-advertisers"] = [
        {"name": name}
        for name in data["p13nData"]["interests"]["audienceAndAdvertisers"][
            "advertisers"
        ]
    ]
    to_create["personalization-num-audiences"] = [
        {
            "numAudiences": data["p13nData"]["interests"]["audienceAndAdvertisers"][
                "numAudiences"
            ]
        }
    ]
    to_create["personalization-shows"] = [
        {"name": name} for name in data["p13nData"]["interests"]["shows"]
    ]
    to_create["personalization-locationHistory"] = [
        {"name": name} for name in data["p13nData"]["locationHistory"]
    ]
    to_create["personalization-inferredAgeInfo"] = [data["p13nData"]["inferredAgeInfo"]]
    return to_create


register("phone-number", each="device")
register("profile", each="profile")
# protected-history.js

register("saved-search", each="savedSearch", pk="savedSearchId")
# screen-name-change.js


@register_each("tweet", pk="id")
def tweet(item):
    # Convert id-ish fields to int (mutates the item in place) so IDs sort
    # numerically in SQLite.
    for key in item:
        if key == "id" or key.endswith("_id"):
            item[key] = int(item[key])
    return item


register("verified", each="verified")


def _list_from_common(data):
    """Extract {screen_name, list_slug} rows from userListInfo URL lists."""
    lists = []
    for block in data:
        for url in block["userListInfo"]["urls"]:
            # URLs look like .../<screen_name>/lists/<list_slug>
            bits = url.split("/")
            lists.append({"screen_name": bits[-3], "list_slug": bits[-1]})
    return lists
+37 -3
twitter_to_sqlite/cli.py
··· 1 - import click 2 1 import datetime 2 + import json 3 3 import os 4 - import sqlite_utils 5 4 import time 6 - import json 5 + 6 + import click 7 + import sqlite_utils 8 + from twitter_to_sqlite import archive 7 9 from twitter_to_sqlite import utils 8 10 9 11 ··· 455 457 ignore=True, 456 458 ) 457 459 time.sleep(sleep) 460 + 461 + 462 + @cli.command(name="import") 463 + @click.argument( 464 + "db_path", 465 + type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), 466 + required=True, 467 + ) 468 + @click.argument( 469 + "archive_path", 470 + type=click.Path(file_okay=True, dir_okay=False, allow_dash=False, exists=True), 471 + required=True, 472 + ) 473 + def import_(db_path, archive_path): 474 + "Import data from a Twitter exported archive" 475 + db = sqlite_utils.Database(db_path) 476 + for filename, content in utils.read_archive_js(archive_path): 477 + filename = filename[: -len(".js")] 478 + if filename not in archive.transformers: 479 + print("{}: not yet implemented".format(filename)) 480 + continue 481 + transformer, pk = archive.transformers.get(filename) 482 + data = archive.extract_json(content) 483 + to_insert = transformer(data) 484 + for table, rows in to_insert.items(): 485 + table_name = "archive-{}".format(table) 486 + if pk is not None: 487 + db[table_name].upsert_all(rows, pk=pk) 488 + else: 489 + db[table_name].upsert_all(rows, hash_id="pk") 490 + count = db[table_name].count 491 + print("{}: {} item{}".format(table_name, count, "s" if count == 1 else ""))
+14 -4
twitter_to_sqlite/utils.py
··· 1 - from requests_oauthlib import OAuth1Session 2 - from dateutil import parser 3 1 import datetime 4 - import time 2 + import json 5 3 import pathlib 6 - import json 4 + import time 7 5 import urllib.parse 6 + import zipfile 7 + 8 + from dateutil import parser 9 + from requests_oauthlib import OAuth1Session 8 10 9 11 10 12 def session_for_auth(auth): ··· 413 415 return [ 414 416 r[0] for r in db.conn.execute(sql, [s.lower() for s in screen_names]).fetchall() 415 417 ] 418 + 419 + 420 + def read_archive_js(filepath): 421 + "Open zip file, return (filename, content) for all .js" 422 + zf = zipfile.ZipFile(filepath) 423 + for zi in zf.filelist: 424 + if zi.filename.endswith(".js"): 425 + yield zi.filename, zf.open(zi.filename).read()