···55import time
6677import click
88-import sqlite_utils
88+99from twitter_to_sqlite import archive
1010from twitter_to_sqlite import utils
1111···102102 "Save followers for specified user (defaults to authenticated user)"
103103 auth = json.load(open(auth))
104104 session = utils.session_for_auth(auth)
105105- db = sqlite_utils.Database(db_path)
105105+ db = utils.open_database(db_path)
106106 fetched = []
107107 # Get the follower count, so we can have a progress bar
108108 count = 0
···152152 "Save tweets favorited by specified user"
153153 auth = json.load(open(auth))
154154 session = utils.session_for_auth(auth)
155155- db = sqlite_utils.Database(db_path)
155155+ db = utils.open_database(db_path)
156156 profile = utils.get_profile(db, session, user_id, screen_name)
157157 with click.progressbar(
158158 utils.fetch_favorites(session, user_id, screen_name, stop_after),
···193193 raise click.ClickException("Use either --since or --since_id, not both")
194194 auth = json.load(open(auth))
195195 session = utils.session_for_auth(auth)
196196- db = sqlite_utils.Database(db_path)
196196+ db = utils.open_database(db_path)
197197 profile = utils.get_profile(db, session, user_id, screen_name)
198198 expected_length = profile["statuses_count"]
199199···209209 pass
210210211211 with click.progressbar(
212212- utils.fetch_user_timeline(session, user_id, screen_name, stop_after, since_id=since_id),
212212+ utils.fetch_user_timeline(
213213+ session, user_id, screen_name, stop_after, since_id=since_id
214214+ ),
213215 length=expected_length,
214216 label="Importing tweets",
215217 show_pos=True,
···253255 raise click.ClickException("Use either --since or --since_id, not both")
254256 auth = json.load(open(auth))
255257 session = utils.session_for_auth(auth)
256256- db = sqlite_utils.Database(db_path)
258258+ db = utils.open_database(db_path)
257259 profile = utils.get_profile(db, session)
258260 expected_length = 800
259261 if since and db["timeline_tweets"].exists:
···310312 "Fetch user accounts"
311313 auth = json.load(open(auth))
312314 session = utils.session_for_auth(auth)
313313- db = sqlite_utils.Database(db_path)
315315+ db = utils.open_database(db_path)
314316 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
315317 for batch in utils.fetch_user_batches(session, identifiers, ids):
316318 utils.save_users(db, batch)
···338340 "Fetch tweets by their IDs"
339341 auth = json.load(open(auth))
340342 session = utils.session_for_auth(auth)
341341- db = sqlite_utils.Database(db_path)
343343+ db = utils.open_database(db_path)
342344 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
343345 if skip_existing:
344346 existing_ids = set(
···381383 "Fetch lists - accepts one or more screen_name/list_slug identifiers"
382384 auth = json.load(open(auth))
383385 session = utils.session_for_auth(auth)
384384- db = sqlite_utils.Database(db_path)
386386+ db = utils.open_database(db_path)
385387 for identifier in identifiers:
386388 utils.fetch_and_save_list(db, session, identifier, ids)
387389···477479 "Experimental: Save tweets matching these keywords in real-time"
478480 auth = json.load(open(auth))
479481 session = utils.session_for_auth(auth)
480480- db = sqlite_utils.Database(db_path)
482482+ db = utils.open_database(db_path)
481483 for tweet in utils.stream_filter(session, track=track):
482484 if verbose:
483485 print(json.dumps(tweet, indent=2))
···505507 "Experimental: Follow these Twitter users and save tweets in real-time"
506508 auth = json.load(open(auth))
507509 session = utils.session_for_auth(auth)
508508- db = sqlite_utils.Database(db_path)
510510+ db = utils.open_database(db_path)
509511 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
510512 # Make sure we have saved these users to the database
511513 for batch in utils.fetch_user_batches(session, identifiers, ids):
···528530):
529531 auth = json.load(open(auth))
530532 session = utils.session_for_auth(auth)
531531- db = sqlite_utils.Database(db_path)
533533+ db = utils.open_database(db_path)
532534 identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
533535 for identifier in identifiers:
534536 # Make sure this user is saved
···568570 Import data from a Twitter exported archive. Input can be the path to a zip
569571 file, a directory full of .js files or one or more direct .js files.
570572 """
571571- db = sqlite_utils.Database(db_path)
573573+ db = utils.open_database(db_path)
572574 for filepath in paths:
573575 path = pathlib.Path(filepath)
574576 if path.suffix == ".zip":
+22
twitter_to_sqlite/migrations.py
···11+from .utils import extract_and_save_source
22+33+MIGRATIONS = []
def migration(fn):
    """Register ``fn`` as a database migration.

    Intended for use as a decorator: the function is appended to the
    module-level ``MIGRATIONS`` list, in definition order, and returned
    unchanged so it stays callable under its own name.
    """
    MIGRATIONS.append(fn)
    return fn
@migration
def convert_source_column(db):
    """Turn ``tweets.source`` from raw HTML into a reference to ``sources``.

    Every tweet whose ``source`` still holds an ``<a href=...>`` snippet is
    rewritten to hold the ID returned by ``extract_and_save_source``. The
    column is then indexed and declared as a foreign key.

    The index and foreign-key steps are guarded so the migration can run
    against a database whose ``tweets`` table already carries them (for
    example one created with the foreign key in its initial schema), or be
    re-run after an earlier attempt failed part-way through.

    Does nothing if the database has no ``tweets`` table.
    """
    if "tweets" not in set(db.table_names()):
        return
    tweets = db["tweets"]
    # Now we extract any '<a href=...' records from the source
    rows = db.conn.execute(
        "select id, source from tweets where source like '<%'"
    ).fetchall()
    for tweet_id, source in rows:
        tweets.update(tweet_id, {"source": extract_and_save_source(db, source)})
    # Only create the index if no existing index covers exactly this column.
    if not any(index.columns == ["source"] for index in tweets.indexes):
        tweets.create_index(["source"])
    # sqlite-utils refuses to add a foreign key that is already present, so
    # check first rather than fail on an already-migrated schema.
    if not any(fk.column == "source" for fk in tweets.foreign_keys):
        # The referenced table may be missing when no row needed converting.
        if "sources" not in db.table_names():
            db["sources"].create({"id": str, "name": str, "url": str}, pk="id")
        tweets.add_foreign_key("source", "sources", "id")
+45-1
twitter_to_sqlite/utils.py
···22import html
33import json
44import pathlib
55+import re
56import time
67import urllib.parse
78import zipfile
89910from dateutil import parser
1011from requests_oauthlib import OAuth1Session
1212+import sqlite_utils
11131214# Twitter API error codes
1315RATE_LIMIT_ERROR_CODE = 88
1616+1717+source_re = re.compile('<a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>')
def open_database(db_path):
    """Open the SQLite database at ``db_path`` and bring it up to date.

    Migrations are applied only when the database already contains at
    least one table; a database with no tables is returned untouched.

    Returns the ``sqlite_utils.Database`` instance.
    """
    database = sqlite_utils.Database(db_path)
    # Only run migrations if this is an existing DB (has tables)
    is_existing = bool(database.tables)
    if is_existing:
        migrate(database)
    return database
def migrate(db):
    """Apply every registered migration that has not yet been recorded.

    Applied migrations are tracked by function name in a ``migrations``
    table, which is created on first use. Each pending migration is called
    with ``db`` and then recorded together with a UTC ISO-8601 timestamp.
    """
    # Imported here rather than at module level because the migrations
    # module itself imports from this one.
    from twitter_to_sqlite.migrations import MIGRATIONS

    if "migrations" not in db.table_names():
        db["migrations"].create({"name": str, "applied": str}, pk="name")

    already_applied = set()
    for row in db.conn.execute("select name from migrations").fetchall():
        already_applied.add(row[0])

    for fn in MIGRATIONS:
        fn_name = fn.__name__
        if fn_name not in already_applied:
            fn(db)
            record = {
                "name": fn_name,
                "applied": datetime.datetime.utcnow().isoformat(),
            }
            db["migrations"].insert(record)
144415451646def session_for_auth(auth):
···186216 table_names = set(db.table_names())
187217 if "places" not in table_names:
188218 db["places"].create({"id": str}, pk="id")
219219+ if "sources" not in table_names:
220220+ db["sources"].create({"id": str, "name": str, "url": str}, pk="id")
189221 if "users" not in table_names:
190222 db["users"].create(
191223 {
···210242 "retweeted_status": int,
211243 "quoted_status": int,
212244 "place": str,
245245+ "source": str,
213246 },
214247 pk="id",
215215- foreign_keys=(("user", "users", "id"), ("place", "places", "id")),
248248+ foreign_keys=(
249249+ ("user", "users", "id"),
250250+ ("place", "places", "id"),
251251+ ("source", "sources", "id"),
252252+ ),
216253 )
217254 db["tweets"].enable_fts(["full_text"], create_triggers=True)
218255 db["tweets"].add_foreign_key("retweeted_status", "tweets")
···235272 user = tweet.pop("user")
236273 transform_user(user)
237274 tweet["user"] = user["id"]
275275+ tweet["source"] = extract_and_save_source(db, tweet["source"])
238276 if tweet.get("place"):
239277 db["places"].upsert(tweet["place"], pk="id", alter=True)
240278 tweet["place"] = tweet["place"]["id"]
···472510 for zi in zf.filelist:
473511 if zi.filename.endswith(".js"):
474512 yield zi.filename, zf.open(zi.filename).read()
def extract_and_save_source(db, source):
    """Save the client described by a tweet's ``source`` and return its ID.

    ``source`` is expected to be an HTML anchor such as
    ``<a href="https://example.com">Example client</a>``. Its URL and link
    text are upserted into the ``sources`` table, keyed by a hash of the
    record, and that key is returned.

    Returns ``None`` without touching the database when ``source`` is empty
    or ``None``. A non-empty string that is not an anchor is stored with the
    whole string as the name and no URL, rather than failing.
    """
    if not source:
        return None
    match = source_re.match(source)
    if match is None:
        # Keep the raw value so unusual source strings are not lost.
        details = {"url": None, "name": source}
    else:
        details = match.groupdict()
    return db["sources"].upsert(details, hash_id="id").last_pk