this repo has no description
1# Utilities for dealing with Twitter archives
2import json
3
# Registry mapping an archive filename (without its ".js" extension) to a
# (callable, pk) pair. The callable takes the JSON parsed from that file
# and returns a dictionary of the tables that should be created:
# {"table": [rows-to-upsert]}. pk names the primary key column to use for
# those tables, or is None when no primary key was registered.
transformers = {}
9
10
def register(filename, each, pk=None):
    """Register a transformer that unwraps one key from every record.

    The stored callable takes the parsed JSON list for ``filename`` and
    returns ``{filename: rows}``, where each row is ``record.get(each)``
    (so ``None`` for a record that lacks that key). ``pk`` is stored
    alongside the callable in ``transformers``.
    """

    def unwrap(records):
        rows = [record.get(each) for record in records]
        return {filename: rows}

    entry = (unwrap, pk)
    transformers[filename] = entry
16
17
def register_each(filename, pk=None):
    """Return a decorator that registers ``fn`` as a per-record transformer.

    The decorated function is called once for every item in the parsed
    JSON list for ``filename``; the results become the rows of a table
    named after ``filename``. ``pk`` is stored alongside the generated
    callable in ``transformers``.

    The decorator returns ``fn`` unchanged, so the decorated name remains
    usable after registration.
    """

    def inner(fn):
        def callback(data):
            return {filename: [fn(item) for item in data]}

        transformers[filename] = (callback, pk)
        # Hand the function back: without this the decorated name would be
        # rebound to None at module level.
        return fn

    return inner
26
27
def register_multi(filename):
    """Return a decorator that registers ``fn`` as the transformer for ``filename``.

    ``fn`` is stored as-is with no primary key. It receives the whole
    parsed JSON for the file and must itself return the dictionary of
    table names to lists of rows.

    The decorator returns ``fn`` unchanged, so the decorated name remains
    usable after registration.
    """

    def inner(fn):
        transformers[filename] = (fn, None)
        # Hand the function back: without this the decorated name would be
        # rebound to None at module level.
        return fn

    return inner
33
34
def register_all(filename):
    """Return a decorator that registers ``fn`` as the transformer for ``filename``.

    ``fn`` is stored as-is with no primary key. It receives the whole
    parsed JSON for the file (rather than one record at a time) and must
    return the dictionary of table names to lists of rows.

    The decorator returns ``fn`` unchanged, so the decorated name remains
    usable after registration.
    """

    def inner(fn):
        transformers[filename] = (fn, None)
        # Hand the function back: without this the decorated name would be
        # rebound to None at module level.
        return fn

    return inner
40
41
def extract_json(contents):
    """Parse the JSON payload of an archive ``.js`` file.

    Archive files look like::

        window.YTD.account_creation_ip.part0 = [ ... data ...]

    When the stripped contents start with ``window.``, everything up to
    and including the first ``=`` is discarded before parsing. Contents
    without that prefix are parsed as plain JSON.

    ``contents`` may be ``bytes`` or ``str``. Splitting happens on the
    first ``=`` (not on ``" = "``), so assignments written without the
    surrounding spaces are handled as well. Invalid JSON raises the error
    from ``json.loads``.
    """
    contents = contents.strip()
    # Use literals of the same type as the input so both bytes and str work.
    is_text = isinstance(contents, str)
    prefix = "window." if is_text else b"window."
    if contents.startswith(prefix):
        separator = "=" if is_text else b"="
        _, found, payload = contents.partition(separator)
        if found:
            # json.loads tolerates the whitespace left after the "=".
            contents = payload
    return json.loads(contents)
48
49
# Account files: each record wraps its payload under a single key. No pk is
# registered for these, so import_from_file() inserts them with hash_id="pk".
register("account-creation-ip", each="accountCreationIp")
register("account-suspension", each="accountSuspension")
register("account-timezone", each="accountTimezone")
register("account", each="account")
54
55
@register_each("ad-engagements")
def ad_engagements(item):
    """Return the ``adEngagements`` value nested inside one record."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["adEngagements"]
59
60
@register_each("ad-impressions")
def ad_impressions(item):
    """Return the ``adImpressions`` value nested inside one record."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["adImpressions"]
64
65
@register_each("ad-mobile-conversions-attributed")
def ad_mobile_conversions_attributed(item):
    """Return the ``attributedMobileAppConversions`` value of one record."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["attributedMobileAppConversions"]
69
70
@register_each("ad-mobile-conversions-unattributed")
def ad_mobile_conversions_unattributed(item):
    """Return the ``unattributedMobileAppConversions`` value of one record."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["unattributedMobileAppConversions"]
74
75
@register_each("ad-online-conversions-attributed")
def ad_online_conversions_attributed(item):
    """Return the ``attributedOnlineConversions`` value of one record."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["attributedOnlineConversions"]
79
80
@register_each("ad-online-conversions-unattributed")
def ad_online_conversions_unattributed(item):
    """Return the ``unattributedOnlineConversions`` value of one record."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["unattributedOnlineConversions"]
84
85
@register_each("ageinfo")
def ageinfo(item):
    """Return the ``ageInfo`` value nested under ``ageMeta`` in one record."""
    age_meta = item["ageMeta"]
    return age_meta["ageInfo"]
89
90
# Files whose records are unwrapped from a single key and stored under an
# explicit primary key column.
register("block", each="blocking", pk="accountId")
register("connected-applications", each="connectedApplication", pk="id")
# Not registered yet:
# register("contact", ...)
register("direct-message-group-headers", each="dmConversation", pk="conversationId")
register("direct-message-group", each="dmConversation", pk="conversationId")
register("direct-message-headers", each="dmConversation", pk="conversationId")
# No pk is registered for direct-message because there are duplicate records,
# so import_from_file() inserts these rows with hash_id="pk" instead.
# TODO: The direct-message files actually do warrant separate tables.
register("direct-message", each="dmConversation")
100
# More single-key files; those given a pk are stored under that column, the
# others are inserted by import_from_file() with hash_id="pk".
register("email-address-change", each="emailAddressChange")
register("follower", each="follower", pk="accountId")
register("following", each="following", pk="accountId")
register("ip-audit", each="ipAudit")
register("like", each="like", pk="tweetId")
106
107
@register_all("lists-created")
def lists_created(data):
    """Build the ``lists-created`` table from the whole parsed file."""
    rows = _list_from_common(data)
    return {"lists-created": rows}
111
112
@register_all("lists-member")
def lists_member(data):
    """Build the ``lists-member`` table from the whole parsed file."""
    rows = _list_from_common(data)
    return {"lists-member": rows}
116
117
@register_all("lists-subscribed")
def lists_subscribed(data):
    """Build the ``lists-subscribed`` table from the whole parsed file."""
    rows = _list_from_common(data)
    return {"lists-subscribed": rows}
121
122
# Moments are unwrapped from the "moment" key and keyed on momentId.
register("moment", each="moment", pk="momentId")
# Not registered yet:
# register("mute", ...)
125
126
@register_all("ni-devices")
def ni_devices(data):
    """Flatten the ni-devices records into rows for one ``ni-devices`` table.

    Each record holds a ``niDeviceResponse`` mapping. Its first key is
    taken as the device category and the matching value as the device
    details; the row is those details plus a ``category`` column.

    Records whose ``niDeviceResponse`` is empty are skipped. The input
    records are not modified.

    NOTE(review): this function used to be named ``lists_created``, which
    shadowed the transformer for "lists-created", and it returned its rows
    under "ne-devices". Both looked like copy-paste slips; the table is
    now named after the file, like every other transformer here.
    """
    devices = []
    for block in data:
        response = block["niDeviceResponse"]
        if not response:
            # An empty response carries no device to record.
            continue
        category, details = next(iter(response.items()))
        # Build a new row rather than writing into the caller's parsed JSON.
        devices.append({**details, "category": category})
    return {"ni-devices": devices}
137
138
139# Skipped all the periscope- stuff for the moment
140
141
@register_multi("personalization")
def personalization(data):
    """Split the personalization file into one table per kind of data.

    Only the first record of ``data`` is read. As a multi transformer this
    returns a dict of table names => list of objects to insert. Plain
    values (advertisers, shows, location history) are wrapped as
    ``{"name": value}`` rows, and single objects are wrapped in a
    one-element list.
    """
    p13n = data[0]["p13nData"]
    tables = {}

    demographics = p13n["demographics"]
    tables["personalization-demographics-languages"] = demographics["languages"]
    tables["personalization-demographics-genderInfo"] = [demographics["genderInfo"]]

    interests = p13n["interests"]
    tables["personalization-interests"] = interests["interests"]
    tables["personalization-partnerInterests"] = interests["partnerInterests"]

    audience = interests["audienceAndAdvertisers"]
    tables["personalization-advertisers"] = [
        {"name": advertiser} for advertiser in audience["advertisers"]
    ]
    tables["personalization-num-audiences"] = [
        {"numAudiences": audience["numAudiences"]}
    ]

    tables["personalization-shows"] = [
        {"name": show} for show in interests["shows"]
    ]
    tables["personalization-locationHistory"] = [
        {"name": place} for place in p13n["locationHistory"]
    ]
    tables["personalization-inferredAgeInfo"] = [p13n["inferredAgeInfo"]]
    return tables
176
177
# Single-key files with no registered pk.
register("phone-number", each="device")
register("profile", each="profile")
# Not registered (presumably still to do): protected-history.js

register("saved-search", each="savedSearch", pk="savedSearchId")
# Not registered (presumably still to do): screen-name-change.js
184
185
@register_each("tweet", pk="id")
def tweet(item):
    """Convert the ``id`` field and every ``*_id`` field of a tweet to int.

    The record is updated in place and the same object is returned.
    """
    for field, raw in item.items():
        if field != "id" and not field.endswith("_id"):
            continue
        # Only existing keys are reassigned, so the dict's size never
        # changes while it is being iterated.
        item[field] = int(raw)
    return item
192
193
# Unwrapped from the "verified" key; no pk is registered for this file.
register("verified", each="verified")
195
196
197def _list_from_common(data):
198 lists = []
199 for block in data:
200 for url in block["userListInfo"]["urls"]:
201 bits = url.split("/")
202 lists.append({"screen_name": bits[-3], "list_slug": bits[-1]})
203 return lists
204
205
def import_from_file(db, filename, content):
    """Import one archive ``.js`` file into ``db``.

    ``filename`` must end with ``.js``; the name without that suffix
    selects the registered transformer. Files with no transformer are
    reported on stdout and skipped without touching ``db``.

    ``content`` is passed to ``extract_json`` and the result to the
    transformer. Each table it returns is written as
    ``archive_<table with "-" replaced by "_">``. A table that already
    existed when the import started is dropped and re-created. Rows are
    inserted with the registered ``pk`` when there is one, otherwise with
    ``hash_id="pk"``.

    Raises:
        AssertionError: if ``filename`` does not end with ``.js``.
    """
    suffix = ".js"
    if not filename.endswith(suffix):
        # Raised explicitly rather than via ``assert`` so the check still
        # runs under ``python -O``; the exception type is unchanged.
        raise AssertionError("{} does not end with .js".format(filename))
    filename = filename[: -len(suffix)]
    registered = transformers.get(filename)
    if registered is None:
        print("{}: not yet implemented".format(filename))
        return
    transformer, pk = registered
    # Snapshot taken once, before any of this file's tables are written.
    existing_tables = set(db.table_names())
    data = extract_json(content)
    to_insert = transformer(data)
    for table, rows in to_insert.items():
        table_name = "archive_{}".format(table.replace("-", "_"))
        # Drop and re-create if it already exists
        if table_name in existing_tables:
            db[table_name].drop()
        if pk is not None:
            db[table_name].insert_all(rows, pk=pk, replace=True)
        else:
            db[table_name].insert_all(rows, hash_id="pk", replace=True)
224 db[table_name].insert_all(rows, hash_id="pk", replace=True)