this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

# Utilities for dealing with Twitter archives
import json

# Goal is to have a mapping of filename (without the ".js" suffix) to a
# (callable, pk) pair, where the callable takes the JSON from that file
# and returns a dictionary of tables that should be created:
# {"table": [rows-to-upsert]}
transformers = {}


def register(filename, each, pk=None):
    """Register a transformer that pulls one key out of every item.

    The registered callable maps the parsed JSON list to
    ``{filename: [item.get(each) for item in data]}``; an item without
    the ``each`` key contributes ``None``.

    filename -- archive file name without the ".js" suffix; also used as
                the table key in the returned dictionary
    each     -- key to extract from every item
    pk       -- primary key column stored alongside the callable, or None
    """

    def callback(data):
        return {filename: [item.get(each) for item in data]}

    transformers[filename] = (callback, pk)


def register_each(filename, pk=None):
    """Decorator: register ``fn`` as a per-item transformer for ``filename``.

    The registered callable applies ``fn`` to every item of the parsed
    JSON list and returns ``{filename: [fn(item), ...]}``.
    """

    def inner(fn):
        def callback(data):
            return {filename: [fn(item) for item in data]}

        transformers[filename] = (callback, pk)
        # Hand the function back so the decorated module-level name stays
        # usable instead of being rebound to None.
        return fn

    return inner


def register_multi(filename):
    """Decorator: register ``fn`` as the whole-file transformer for ``filename``.

    ``fn`` receives the complete parsed JSON and must itself return the
    ``{table: [rows]}`` dictionary. No primary key is recorded.
    """

    def inner(fn):
        transformers[filename] = (fn, None)
        # Return fn so the decorated name is not rebound to None.
        return fn

    return inner


def register_all(filename):
    """Decorator: register ``fn`` as the whole-file transformer for ``filename``.

    Registers exactly what ``register_multi`` registers: ``(fn, None)``.
    """

    def inner(fn):
        transformers[filename] = (fn, None)
        # Return fn so the decorated name is not rebound to None.
        return fn

    return inner


def extract_json(contents):
    """Parse the JSON payload of an archive file.

    Archive files may wrap the JSON in a JavaScript assignment:

        window.YTD.account_creation_ip.part0 = [ ... data ...]

    Everything up to and including the first "=" is dropped when the
    content starts with "window."; otherwise the content is parsed as-is.

    contents -- file content as bytes (str is accepted and UTF-8 encoded)

    Returns the decoded JSON value. Raises whatever ``json.loads`` raises
    on malformed input.
    """
    if isinstance(contents, str):
        contents = contents.encode("utf-8")
    contents = contents.strip()
    if contents.startswith(b"window."):
        # Partition on the bare "=" so that "x=[...]" (no surrounding
        # spaces) is handled as well as "x = [...]".
        _, _, contents = contents.partition(b"=")
        contents = contents.strip()
    return json.loads(contents)


register("account-creation-ip", each="accountCreationIp")
register("account-suspension", each="accountSuspension")
register("account-timezone", each="accountTimezone")
register("account", each="account")


@register_each("ad-engagements")
def ad_engagements(item):
    """Return the ``adEngagements`` value nested in one ad item."""
    return item["ad"]["adsUserData"]["adEngagements"]


@register_each("ad-impressions")
def ad_impressions(item):
    """Return the ``adImpressions`` value nested in one ad item."""
    return item["ad"]["adsUserData"]["adImpressions"]


@register_each("ad-mobile-conversions-attributed")
def ad_mobile_conversions_attributed(item):
    """Return the ``attributedMobileAppConversions`` value of one ad item."""
    return item["ad"]["adsUserData"]["attributedMobileAppConversions"]


@register_each("ad-mobile-conversions-unattributed")
def ad_mobile_conversions_unattributed(item):
    """Return the ``unattributedMobileAppConversions`` value of one ad item."""
    return item["ad"]["adsUserData"]["unattributedMobileAppConversions"]
@register_each("ad-online-conversions-attributed")
def ad_online_conversions_attributed(item):
    """Return the ``attributedOnlineConversions`` value of one ad item."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["attributedOnlineConversions"]


@register_each("ad-online-conversions-unattributed")
def ad_online_conversions_unattributed(item):
    """Return the ``unattributedOnlineConversions`` value of one ad item."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["unattributedOnlineConversions"]


@register_each("ageinfo")
def ageinfo(item):
    """Return the ``ageInfo`` value nested under ``ageMeta``."""
    age_meta = item["ageMeta"]
    return age_meta["ageInfo"]


register("block", each="blocking", pk="accountId")
register("connected-applications", each="connectedApplication", pk="id")
# register("contact", ...)
register("direct-message-group-headers", each="dmConversation", pk="conversationId")
register("direct-message-group", each="dmConversation", pk="conversationId")
register("direct-message-headers", each="dmConversation", pk="conversationId")
# pk for this one is NOT set, because there are dupes:
# TODO: These actually do warrant separate tables:
register("direct-message", each="dmConversation")

register("email-address-change", each="emailAddressChange")
register("follower", each="follower", pk="accountId")
register("following", each="following", pk="accountId")
register("ip-audit", each="ipAudit")
register("like", each="like", pk="tweetId")


@register_all("lists-created")
def lists_created(data):
    """Build the ``lists-created`` rows via ``_list_from_common``."""
    rows = _list_from_common(data)
    return {"lists-created": rows}


@register_all("lists-member")
def lists_member(data):
    """Build the ``lists-member`` rows via ``_list_from_common``."""
    rows = _list_from_common(data)
    return {"lists-member": rows}


@register_all("lists-subscribed")
def lists_subscribed(data):
    """Build the ``lists-subscribed`` rows via ``_list_from_common``."""
    rows = _list_from_common(data)
    return {"lists-subscribed": rows}


register("moment", each="moment", pk="momentId")
# register("mute", ...)
@register_all("ni-devices")
def ni_devices(data):
    """Flatten the ``niDeviceResponse`` blocks into device rows.

    Each block's ``niDeviceResponse`` value is expected to be a mapping
    whose first entry is ``category -> details``; the row is the details
    with the category added under the "category" key. Raises IndexError
    if a ``niDeviceResponse`` mapping is empty.
    """
    devices = []
    for block in data:
        response = block["niDeviceResponse"]
        category, details = list(response.items())[0]
        # Build a new dict rather than writing into the caller's data.
        devices.append(dict(details, category=category))
    # The table key matches the registered file name ("ni-devices").
    return {"ni-devices": devices}


# Skipped all the periscope- stuff for the moment


@register_multi("personalization")
def personalization(data):
    """Split the personalization file into several tables.

    Only the first element of ``data`` is read. As a multi, we get to
    return a dict of table names => list of objects to insert.
    """
    p13n = data[0]["p13nData"]
    demographics = p13n["demographics"]
    interests = p13n["interests"]
    audience = interests["audienceAndAdvertisers"]
    return {
        "personalization-demographics-languages": demographics["languages"],
        # Single objects are wrapped in a list so they become one row.
        "personalization-demographics-genderInfo": [demographics["genderInfo"]],
        "personalization-interests": interests["interests"],
        "personalization-partnerInterests": interests["partnerInterests"],
        # Bare strings are wrapped as {"name": ...} rows.
        "personalization-advertisers": [
            {"name": name} for name in audience["advertisers"]
        ],
        "personalization-num-audiences": [
            {"numAudiences": audience["numAudiences"]}
        ],
        "personalization-shows": [{"name": name} for name in interests["shows"]],
        "personalization-locationHistory": [
            {"name": name} for name in p13n["locationHistory"]
        ],
        "personalization-inferredAgeInfo": [p13n["inferredAgeInfo"]],
    }


register("phone-number", each="device")
register("profile", each="profile")
# protected-history.js

register("saved-search", each="savedSearch", pk="savedSearchId")
# screen-name-change.js


@register_each("tweet", pk="id")
def tweet(item):
    """Convert "id" and every "*_id" value of a tweet to int, in place.

    Returns the same (mutated) ``item``.
    """
    for key in item:
        if key == "id" or key.endswith("_id"):
            # Only values are replaced, so iterating the dict is safe.
            item[key] = int(item[key])
    return item


register("verified", each="verified")


def _list_from_common(data):
    """Turn the list URLs of every block into screen_name/list_slug rows.

    Reads ``block["userListInfo"]["urls"]`` for each block; after
    splitting a URL on "/", the third-from-last segment becomes
    "screen_name" and the last segment becomes "list_slug".
    """
    lists = []
    for block in data:
        for url in block["userListInfo"]["urls"]:
            bits = url.split("/")
            lists.append({"screen_name": bits[-3], "list_slug": bits[-1]})
    return lists


def import_from_file(db, filename, content):
    """Load one archive ``.js`` file into ``archive_*`` tables of ``db``.

    db       -- database object providing ``table_names()`` and
                ``db[name].drop()`` / ``db[name].insert_all(...)``
                (presumably a sqlite-utils Database -- confirm with caller)
    filename -- archive file name; must end with ".js"
    content  -- raw file content, passed to ``extract_json``

    Files without a registered transformer are reported with ``print``
    and skipped. Raises AssertionError if ``filename`` does not end
    with ".js".
    """
    if not filename.endswith(".js"):
        # Raised explicitly (same exception type as the former assert) so
        # the check is not stripped when Python runs with -O.
        raise AssertionError("{} does not end with .js".format(filename))
    filename = filename[: -len(".js")]
    try:
        transformer, pk = transformers[filename]
    except KeyError:
        print("{}: not yet implemented".format(filename))
        return
    # Only look up existing tables once we know there is work to do.
    existing_tables = set(db.table_names())
    to_insert = transformer(extract_json(content))
    for table, rows in to_insert.items():
        table_name = "archive_{}".format(table.replace("-", "_"))
        # Drop and re-create if it already exists
        if table_name in existing_tables:
            db[table_name].drop()
        if pk is not None:
            db[table_name].insert_all(rows, pk=pk, replace=True)
        else:
            # No natural key: use a content-hash column named "pk".
            db[table_name].insert_all(rows, hash_id="pk", replace=True)