this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Fixes for archive imports (#55)

* Find data files in subdirectories in archives

See #54 for discussion. This also ignores files in the new "assets"
directory, which appear to contain assets for a browser interface
Twitter has created.

* Fix list-member importer

It appears that in list data some rows contain a `urls` key with a list
of URLs, while others contain a `url` key with just a single one. This
change supports both variants.

* Fix tweet import

This was working, sort of, but wasn't properly unpacking the tweet
data into columns. This commit fixes that in what I think should
be a backwards-compatible way.

authored by

Jacob Kaplan-Moss and committed by
GitHub
bf622dcb b6a4da8b

+25 -3
+19 -1
twitter_to_sqlite/archive.py
··· 185 185 186 186 @register_each("tweet", pk="id") 187 187 def tweet(item): 188 + # Older versions of the archive have the tweet data at the top level of the 189 + # item; newer versions have it all in a 'tweet' sub-key. 190 + if "tweet" in item: 191 + item = item["tweet"] 192 + 188 193 for key in item: 189 194 if key == "id" or key.endswith("_id"): 190 195 item[key] = int(item[key]) 196 + 197 + # Handle some columns that are sometimes missing 198 + optional_columns = ["possibly_sensitive", "coordinates", "geo", "extended_entities"] 199 + for col in optional_columns: 200 + item.setdefault(col, None) 201 + 191 202 return item 192 203 193 204 ··· 197 208 def _list_from_common(data): 198 209 lists = [] 199 210 for block in data: 200 - for url in block["userListInfo"]["urls"]: 211 + info = block["userListInfo"] 212 + if "urls" in info: 213 + urls = info["urls"] 214 + elif "url" in info: 215 + urls = [info["url"]] 216 + else: 217 + urls = [] 218 + for url in urls: 201 219 bits = url.split("/") 202 220 lists.append({"screen_name": bits[-3], "list_slug": bits[-1]}) 203 221 return lists
+6 -2
twitter_to_sqlite/utils.py
··· 657 657 "Open zip file, return (filename, content) for all .js" 658 658 zf = zipfile.ZipFile(filepath) 659 659 for zi in zf.filelist: 660 - if zi.filename.endswith(".js"): 661 - yield zi.filename, zf.open(zi.filename).read() 660 + # Ignore files in a assets dir -- these are for Twitter's archive 661 + # browser thingie -- and only use final filenames since some archives 662 + # appear to put data in a data/ subdir, which can screw up the filename 663 + # -> importer mapping. 664 + if zi.filename.endswith(".js") and not zi.filename.startswith("assets/"): 665 + yield pathlib.Path(zi.filename).name, zf.open(zi.filename).read() 662 666 663 667 664 668 def extract_and_save_source(db, source):