···240240241241This command does not populate any of the regular tables, since Twitter's export data does not exactly match the schema returned by the Twitter API.
242242243243-It will delete and recreate all of your `archive_*` tables every time you run it. If this is not what you want, run the command against a new SQLite database file name rather than running it against one that already exists.
243243+It will delete and recreate the corresponding `archive_*` tables every time you run it. If this is not what you want, run the command against a new SQLite database file name rather than running it against one that already exists.
244244+245245+If you have already decompressed your archive, you can run this against the directory that you decompressed it to:
246246+247247+ $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2/
248248+249249+You can also run it against one or more specific files within that folder. For example, to import just the follower.js and following.js files:
250250+251251+ $ twitter-to-sqlite import archive.db \
252252+ ~/Downloads/twitter-2019-06-25-b31f2/follower.js \
253253+ ~/Downloads/twitter-2019-06-25-b31f2/following.js
244254245255You may want to use other commands to populate tables based on data from the archive. For example, to retrieve full API versions of each of the tweets you have favourited in your archive, you could run the following:
246256
+1-1
tests/test_create_zip.py
def test_create_zip():
    """create_zip() should archive exactly the fixture files in zip_contents/."""
    # The fixture directory lives next to this test module.
    contents_dir = pathlib.Path(__file__).parent / "zip_contents"
    zf = create_zip(contents_dir)
    archived_names = {member.filename for member in zf.filelist}
    assert {
        "account.js",
        "saved-search.js",
        "following.js",
        "follower.js",
    } == archived_names
···11import io
22-import pathlib
32import zipfile
435466-def create_zip(buf=None):
55+def create_zip(path, buf=None):
76 if buf is None:
87 buf = io.BytesIO()
99- path = pathlib.Path(__file__).parent / "zip_contents"
108 zf = zipfile.ZipFile(buf, "w")
119 for filepath in path.glob("**/*"):
1210 if filepath.is_file():
+21
twitter_to_sqlite/archive.py
···201201 bits = url.split("/")
202202 lists.append({"screen_name": bits[-3], "list_slug": bits[-1]})
203203 return lists
def import_from_file(db, filename, content):
    """
    Import a single Twitter archive ``.js`` file into ``archive_*`` tables.

    Parameters:
        db: database object exposing ``table_names()`` and ``db[name]`` table
            access with ``drop()`` / ``upsert_all()`` (the CLI passes a
            ``sqlite_utils.Database``).
        filename: name of the archive file; must end with ``.js``. The name
            without that suffix is the key looked up in ``transformers``.
        content: raw content of the file, handed to ``extract_json``.

    Files with no registered transformer are reported on stdout and skipped.
    For every table the transformer produces, any same-named table that
    existed before this call is dropped and then re-created from the rows.

    Raises:
        AssertionError: if ``filename`` does not end with ``.js``.
    """
    if not filename.endswith(".js"):
        # Raised explicitly instead of via ``assert`` so the check survives
        # ``python -O``; the exception type and message are unchanged.
        raise AssertionError("{} does not end with .js".format(filename))
    filename = filename[: -len(".js")]
    if filename not in transformers:
        print("{}: not yet implemented".format(filename))
        return
    # Each registry entry is a (transformer, pk) pair.
    transformer, pk = transformers[filename]
    to_insert = transformer(extract_json(content))
    # Snapshot taken before any insert, so only tables left over from an
    # earlier run are dropped - never ones created during this call.
    existing_tables = set(db.table_names())
    for table, rows in to_insert.items():
        table_name = "archive_{}".format(table.replace("-", "_"))
        # Drop and re-create if it already exists
        if table_name in existing_tables:
            db[table_name].drop()
        if pk is not None:
            db[table_name].upsert_all(rows, pk=pk)
        else:
            # No natural key: ask the table to derive a "pk" column by
            # hashing the row (sqlite-utils ``hash_id``).
            db[table_name].upsert_all(rows, hash_id="pk")
+23-25
twitter_to_sqlite/cli.py
···11import datetime
22import json
33import os
44+import pathlib
45import time
5667import click
@cli.command(name="import")
@click.argument(
    "db_path",
    # NOTE(review): dir_okay=True lets a directory through as the database
    # path; confirm this is intended.
    type=click.Path(file_okay=True, dir_okay=True, allow_dash=False),
    required=True,
)
@click.argument(
    "paths",
    type=click.Path(file_okay=True, dir_okay=True, allow_dash=False, exists=True),
    required=True,
    nargs=-1,
)
def import_(db_path, paths):
    """
    Import data from a Twitter exported archive. Input can be the path to a zip
    file, a directory full of .js files or one or more direct .js files.
    """
    # The docstring above is what click shows as the command's help text, so
    # implementation notes are kept in comments.
    #
    # db_path: path handed to sqlite_utils.Database.
    # paths:   one or more existing paths (nargs=-1). Each is dispatched on
    #          its kind: a .zip archive, a directory, or a single .js file.
    # Raises click.ClickException for any other kind of path.
    db = sqlite_utils.Database(db_path)
    for filepath in paths:
        path = pathlib.Path(filepath)
        if path.suffix == ".zip":
            for filename, content in utils.read_archive_js(filepath):
                archive.import_from_file(db, filename, content)
        elif path.is_dir():
            # Import every .js file in this directory (non-recursive).
            # A separate loop variable avoids shadowing the outer `filepath`.
            for js_path in path.glob("*.js"):
                # read_bytes() opens, reads and closes the file, so no
                # handle is left for the garbage collector to clean up.
                archive.import_from_file(db, js_path.name, js_path.read_bytes())
        elif path.suffix == ".js":
            archive.import_from_file(db, path.name, path.read_bytes())
        else:
            raise click.ClickException("Path must be a .js or .zip file or a directory")
+8-2
twitter_to_sqlite/utils.py
···9797 # Rate limit 75/15 mins = 5/minute = every 12 seconds
9898 sleep = 12
9999 yield from fetch_timeline(
100100- session, "https://api.twitter.com/1.1/favorites/list.json", args, sleep=sleep, stop_after=stop_after
100100+ session,
101101+ "https://api.twitter.com/1.1/favorites/list.json",
102102+ args,
103103+ sleep=sleep,
104104+ stop_after=stop_after,
101105 )
102106103107···135139136140137141def transform_tweet(tweet):
138138- tweet["full_text"] = html.unescape(expand_entities(tweet["full_text"], tweet.pop("entities")))
142142+ tweet["full_text"] = html.unescape(
143143+ expand_entities(tweet["full_text"], tweet.pop("entities"))
144144+ )
139145 to_remove = [k for k in tweet if k.endswith("_str")] + [
140146 "quoted_status_id",
141147 "quoted_status_permalink",