···55This is used to feed https://arewedecentralizedyet.online/ .
6677* `BIndex.md`: A proposal for a way of measuring blockability on distributed social networks
88-* `centralization-stats.py <file.csv>`: Computes [Herfindahl–Hirschman index](https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index) and other statistics. Pass `--json` to get machine-readable output
88+* `centralization_stats.py <file.csv>`: Computes [Herfindahl–Hirschman index](https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index) and other statistics. Pass `--json` to get machine-readable output
99* `data-fetchers/` contains various scripts to grab statistics from a number of sources
1010* `data-static/` contains static versions of the data - some are fetched with scripts from `data-fetchers/`, others are one-time dumps from sources such as academic papers
1111* `habib-paper/` code and data specifically related to the Habib et al. paper
···15151616TODO:
1717* Add matrix, see https://codeberg.org/ricci/are-we-decentralized-yet/issues/2
1818-
-158
centralization-stats.py
···11-#!/usr/bin/env python3
22-33-import argparse
44-import csv
55-import json
66-import math
77-import sys
88-99-# https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index
1010-def calc_hhi(x):
1111- total = sum(x)
1212- hhi = sum([(a/total)**2 for a in x])
1313- return hhi
1414-1515-# https://www.statology.org/shannon-diversity-index/
1616-def calc_shannon(x):
1717- total = sum(x)
1818- shannon = -sum([((a/total)*math.log(a/total,math.e)) for a in x])
1919- return shannon
2020-2121-# https://statologos.com/indice-de-diversidad-de-los-simpson/
2222-def calc_simpson(x):
2323- total = sum(x)
2424- simpson = 1 - sum([a*(a-1) for a in x]) / (total*(total-1))
2525- return simpson
2626-2727-def calc_B(x,n):
2828- assert(n<=100)
2929- total = sum(x)
3030- accum = 0
3131- for b in range(0, len(x)-1):
3232- accum += x[b]/total
3333- if (accum >= n/100.0):
3434- return(b+1)
3535-3636-3737-COMBINE_HOSTS = [[
3838- "mastodon.social", "mastodon.online"
3939-]]
4040-COMBINE_SUFFIXES = [ ".host.bsky.network" ]
4141-4242-def combine_key(row):
4343- domain = get_domain(row)
4444- for hlist in COMBINE_HOSTS:
4545- if domain in hlist:
4646- return hlist[0]
4747- for suffix in COMBINE_SUFFIXES:
4848- if domain.endswith(suffix):
4949- return suffix
5050- return domain
5151-5252-# Software known to misreport user accounts
5353-SKIPPED_SOFTWARE = ["nodebb", "gotosocial", "yellbot","misskey", "sharkey"]
5454-def f_software(row):
5555- if "software" not in row:
5656- return True
5757- else:
5858- return all([s not in row["software"].lower() for s in SKIPPED_SOFTWARE])
5959-def f_count(row):
6060- return get_usercount(row) > 0
6161-6262-def normalize_keys(row):
6363- return {k.lower(): v for k, v in row.items()}
6464-6565-def extract_domain_counts(row):
6666- return { "domain": get_domain(row), "count": get_usercount(row) }
6767-6868-6969-# Different CSVs use different names for the user count field
7070-def get_usercount(row):
7171- for key in ("user_count", "mau", "monthly_active_users", "active_month", "active_users", "accountcount","origins", "count", "nb_hostnames", "domains_of_provider"):
7272- val = row.get(key, "")
7373- if val != "":
7474- try:
7575- int(val)
7676- except ValueError:
7777- return float(val)
7878- else:
7979- return int(val)
8080- return 0
8181-8282-# Different CSVs use different columns for the hostname
8383-def get_domain(row):
8484- for key in ("domain", "hostname","instance","name","org_id","e.id","o.name","a.asn","asn", "provider"):
8585- if key in row:
8686- return row.get(key, "")
8787- return None
8888-8989-def filter_rows(rows):
9090- rows = [normalize_keys(r) for r in rows]
9191- rows = [r for r in rows if f_count(r)]
9292- rows = [r for r in rows if f_software(r)]
9393-9494- return rows
9595-9696-def combine_rows(rows):
9797- combined = dict()
9898- for row in rows:
9999- key = combine_key(row)
100100- combined[key] = combined.get(key,[]) + [row]
101101-102102- newrows = list()
103103- for k,v in combined.items():
104104- newrows.append({"domain": k, "count": sum([r["count"] for r in v])})
105105- return newrows
106106-107107-def main(filename, json_out = False):
108108- with open(filename, newline="") as f:
109109- reader = csv.DictReader(f)
110110- rows = list(reader)
111111-112112- rows = filter_rows(rows)
113113-114114- extracted = [extract_domain_counts(row) for row in rows]
115115- combined = combine_rows(extracted)
116116-117117- user_counts = sorted([r["count"] for r in combined], reverse=True)
118118-119119- hhi = calc_hhi(user_counts)
120120- shannon = calc_shannon(user_counts)
121121- simpson = calc_simpson(user_counts)
122122- bs = [(b, calc_B(user_counts,b)) for b in [25,50,75,90,99,99.5] ]
123123- servers = len(user_counts)
124124- biggest_abs = user_counts[0]
125125- biggest_pct = 100*user_counts[0]/sum(user_counts)
126126- rest_abs = sum(user_counts[1:])
127127- rest_pct = 100*rest_abs/sum(user_counts)
128128-129129- if json_out:
130130- print(json.dumps({"HHI": int(hhi*10000),
131131- "shannon": round(shannon,4),
132132- "simpson": round(simpson,4),
133133- "servers": servers,
134134- "biggest_abs": biggest_abs,
135135- "biggest_pct": round(biggest_pct,2),
136136- "rest_abs": rest_abs,
137137- "rest_pct": round(rest_pct,2),
138138- "b_vals": bs}))
139139- else:
140140- print(f"HHI for user_count: {hhi:.4f}")
141141- print(f"Shannon Diversity for user_count: {shannon:.4f}")
142142- print(f"Simpson Diversity for user_count: {simpson:.4f}")
143143- print(f"Total servers: {servers}")
144144- print(f"Biggest server: {biggest_abs} ({biggest_pct:.2f}%)")
145145- print(f"Rest of the servers: {rest_abs} ({rest_pct:.2f}%)")
146146- print(f"Total users: {biggest_abs + rest_abs}")
147147- print(f"B values are {bs}")
148148-149149-if __name__ == "__main__":
150150- parser = argparse.ArgumentParser(
151151- prog=f"{sys.argv[0]}",
152152- description='Calculates statistics for social networks')
153153- parser.add_argument('csvfile')
154154- parser.add_argument('--json', action='store_true')
155155-156156- args = parser.parse_args()
157157- main(args.csvfile, args.json)
158158-
+168
centralization_stats.py
···11+#!/usr/bin/env python3
22+33+import argparse
44+import csv
55+import json
66+import math
77+import sys
# https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index
def calc_hhi(x):
    """Return the Herfindahl–Hirschman index of the counts in *x*.

    Every count is turned into its share of the total and the squared
    shares are summed, so the result is 1.0 when a single entry holds
    everything and approaches 0 as the counts spread out evenly.

    Args:
        x: sequence of numeric counts (one per server/provider).

    Raises:
        ValueError: if the counts sum to zero (this includes empty input),
            because shares are undefined in that case.
    """
    total = sum(x)
    if total == 0:
        raise ValueError("calc_hhi needs at least one non-zero count")
    return sum((a / total) ** 2 for a in x)
# https://www.statology.org/shannon-diversity-index/
def calc_shannon(x):
    """Return the Shannon diversity index (natural log) of the counts in *x*.

    Computes -sum(p * ln(p)) over the shares p = count / total.

    Args:
        x: sequence of non-negative numeric counts.

    Raises:
        ValueError: if the counts sum to zero (this includes empty input).
    """
    total = sum(x)
    if total == 0:
        raise ValueError("calc_shannon needs at least one non-zero count")
    # A zero count contributes nothing (p * ln(p) -> 0 as p -> 0); skipping it
    # avoids the math domain error that math.log(0) would raise.
    shares = (a / total for a in x if a != 0)
    return -sum(p * math.log(p) for p in shares)
# https://statologos.com/indice-de-diversidad-de-los-simpson/
def calc_simpson(x):
    """Return the Simpson diversity index of the counts in *x*.

    Computes 1 - sum(a * (a - 1)) / (N * (N - 1)), where N is the total
    of all counts.

    Args:
        x: sequence of numeric counts.

    Raises:
        ValueError: if the total is 0 or 1, because the denominator
            N * (N - 1) is then zero and the index is undefined.
    """
    total = sum(x)
    denominator = total * (total - 1)
    if denominator == 0:
        raise ValueError("calc_simpson needs a total count other than 0 or 1")
    return 1 - sum(a * (a - 1) for a in x) / denominator
def calc_B(x, n):
    """Return how many leading entries of *x* are needed to reach n% of the total.

    Entries are accumulated in the order given; stats_from_rows passes the
    counts sorted from largest to smallest, so the result is then the
    smallest number of servers that together hold at least n% of users.

    Args:
        x: sequence of numeric counts.
        n: percentage threshold, at most 100.

    Returns:
        The 1-based number of entries consumed when the running share first
        reaches n%. If floating-point rounding keeps the running share just
        below the threshold, all entries are needed and len(x) is returned.

    Raises:
        ValueError: if n is above 100 or the counts sum to zero.
    """
    if n > 100:
        raise ValueError("n is a percentage and must not exceed 100")
    total = sum(x)
    if total == 0:
        raise ValueError("calc_B needs at least one non-zero count")
    threshold = n / 100.0
    accum = 0
    # Walk every entry, including the last one: stopping one short made the
    # function fall through (returning None) for a single server or whenever
    # the threshold was only reached by the final entry.
    for servers, count in enumerate(x, start=1):
        accum += count / total
        if accum >= threshold:
            return servers
    return len(x)
# Groups of hostnames that are counted as a single server; the first name
# in each group is the one reported.
COMBINE_HOSTS = [[
    "mastodon.social", "mastodon.online"
]]
# Every hostname ending in one of these suffixes is folded into one entry
# that is reported under the suffix itself.
COMBINE_SUFFIXES = [ ".host.bsky.network" ]


def combine_key(row):
    """Return the key under which *row* is merged with other rows.

    The row's domain (as found by get_domain) is checked against the
    COMBINE_HOSTS groups first and the COMBINE_SUFFIXES second; a domain
    that matches neither is its own key.
    """
    host = get_domain(row)
    group = next((names for names in COMBINE_HOSTS if host in names), None)
    if group is not None:
        return group[0]
    matched = next((s for s in COMBINE_SUFFIXES if host.endswith(s)), None)
    return host if matched is None else matched
# Software known to misreport user accounts
SKIPPED_SOFTWARE = ["nodebb", "gotosocial", "yellbot", "misskey", "sharkey"]


def f_software(row):
    """Return True if *row* should be kept, False if its software is skipped.

    A row is dropped when its "software" value contains (case-insensitively)
    any name listed in SKIPPED_SOFTWARE. Rows without software information
    are kept.

    Args:
        row: mapping with lower-cased column names.
    """
    # csv.DictReader fills short rows with None, so a present-but-None value
    # is treated exactly like a missing column instead of crashing on .lower().
    software = row.get("software")
    if software is None:
        return True
    software = software.lower()
    return not any(name in software for name in SKIPPED_SOFTWARE)
def f_count(row):
    """Return True when the row reports a strictly positive user count."""
    users = get_usercount(row)
    return users > 0
def normalize_keys(row):
    """Return a copy of *row* with every string key lower-cased.

    Non-string keys are kept unchanged: csv.DictReader stores surplus fields
    of an over-long line under the key None, which has no .lower().

    Args:
        row: mapping of column name to value.
    """
    return {
        (k.lower() if isinstance(k, str) else k): v
        for k, v in row.items()
    }
def extract_domain_counts(row):
    """Reduce *row* to a dict holding only its "domain" and "count"."""
    domain = get_domain(row)
    count = get_usercount(row)
    return {"domain": domain, "count": count}
# Different CSVs use different names for the user count field
def get_usercount(row):
    """Return the user count of *row*, or 0 if no count column has a value.

    The known column names are tried in order and the first one holding a
    value wins. The value is returned as an int when it parses as one and
    as a float otherwise.

    Args:
        row: mapping with lower-cased column names.

    Raises:
        ValueError: if the first value found is not numeric.
    """
    for key in ("user_count", "mau", "monthly_active_users", "active_month", "active_users", "accountcount","origins", "count", "nb_hostnames", "domains_of_provider"):
        val = row.get(key)
        # csv.DictReader yields None for fields missing from a short line;
        # treat that like an empty cell rather than letting int(None) raise.
        if val is None or val == "":
            continue
        try:
            return int(val)
        except ValueError:
            return float(val)
    return 0
# Different CSVs use different columns for the hostname
def get_domain(row):
    """Return the value of the first known hostname column present in *row*.

    Only the presence of the column is checked, so an empty value is
    returned as-is. None is returned when none of the columns exists.
    """
    candidates = ("domain", "hostname","instance","name","org_id","e.id","o.name","a.asn","asn", "provider")
    found = next((column for column in candidates if column in row), None)
    return None if found is None else row[found]
def filter_rows(rows):
    """Lower-case the column names of *rows* and drop unusable rows.

    A row survives only if f_count accepts it (positive user count) and
    f_software accepts it (software not on the skip list).
    """
    normalized = map(normalize_keys, rows)
    with_users = filter(f_count, normalized)
    return list(filter(f_software, with_users))
def combine_rows(rows):
    """Merge rows that share a combine_key and add up their counts.

    Args:
        rows: iterable of dicts carrying a "count" entry, plus whatever
            combine_key reads to determine the grouping key.

    Returns:
        A list of {"domain": key, "count": summed count} dicts, one per
        key, in the order in which each key was first seen.
    """
    # Keep a running sum per key instead of rebuilding a list of rows on
    # every iteration, which copied the whole group for each added row.
    totals = {}
    for row in rows:
        key = combine_key(row)
        totals[key] = totals.get(key, 0) + row["count"]
    return [{"domain": key, "count": count} for key, count in totals.items()]
def stats_from_rows(rows):
    """Compute the centralization statistics for a list of CSV rows.

    The rows are filtered (filter_rows), reduced to domain/count pairs,
    merged (combine_rows) and the resulting counts are sorted from largest
    to smallest before the indices are calculated.

    Args:
        rows: iterable of dict rows, e.g. as produced by csv.DictReader.

    Returns:
        A dict with the keys "HHI" (index scaled by 10000 and truncated to
        an int), "shannon", "simpson", "servers", "biggest_abs",
        "biggest_pct", "rest_abs", "rest_pct" and "b_vals" (a list of
        (percentage, calc_B result) tuples).

    Raises:
        ValueError: if no row is left after filtering, since none of the
            statistics is defined for an empty data set.
    """
    rows = filter_rows(rows)

    extracted = [extract_domain_counts(row) for row in rows]
    combined = combine_rows(extracted)

    user_counts = sorted((r["count"] for r in combined), reverse=True)
    if not user_counts:
        raise ValueError("no rows with a positive user count left after filtering")

    # Computed once; every percentage below is relative to this total.
    total = sum(user_counts)

    hhi = calc_hhi(user_counts)
    shannon = calc_shannon(user_counts)
    simpson = calc_simpson(user_counts)
    bs = [(b, calc_B(user_counts, b)) for b in [25, 50, 75, 90, 99, 99.5]]
    servers = len(user_counts)
    biggest_abs = user_counts[0]
    biggest_pct = 100 * biggest_abs / total
    rest_abs = sum(user_counts[1:])
    rest_pct = 100 * rest_abs / total

    return {
        "HHI": int(hhi * 10000),
        "shannon": round(shannon, 4),
        "simpson": round(simpson, 4),
        "servers": servers,
        "biggest_abs": biggest_abs,
        "biggest_pct": round(biggest_pct, 2),
        "rest_abs": rest_abs,
        "rest_pct": round(rest_pct, 2),
        "b_vals": bs,
    }
def stats_from_csv(filename):
    """Read the CSV file *filename* and return stats_from_rows for its rows."""
    with open(filename, newline="") as handle:
        records = [*csv.DictReader(handle)]
    return stats_from_rows(records)
def main(filename, json_out=False):
    """Print the statistics for the CSV file *filename*.

    With json_out set, the statistics dict is printed as one JSON document;
    otherwise a human-readable report is printed line by line.
    """
    stats = stats_from_csv(filename)

    if json_out:
        print(json.dumps(stats))
        return

    # The stats dict stores HHI scaled by 10000; undo that for display.
    print(f"HHI for user_count: {stats['HHI'] / 10000:.4f}")
    print(f"Shannon Diversity for user_count: {stats['shannon']:.4f}")
    print(f"Simpson Diversity for user_count: {stats['simpson']:.4f}")
    print(f"Total servers: {stats['servers']}")
    print(f"Biggest server: {stats['biggest_abs']} ({stats['biggest_pct']:.2f}%)")
    print(f"Rest of the servers: {stats['rest_abs']} ({stats['rest_pct']:.2f}%)")
    print(f"Total users: {stats['biggest_abs'] + stats['rest_abs']}")
    print(f"B values are {stats['b_vals']}")
if __name__ == "__main__":
    # Command-line entry point: one positional CSV path plus an optional
    # --json switch for machine-readable output.
    cli = argparse.ArgumentParser(
        prog=sys.argv[0],
        description='Calculates statistics for social networks',
    )
    cli.add_argument('csvfile')
    cli.add_argument('--json', action='store_true')

    options = cli.parse_args()
    main(options.csvfile, options.json)
+1-2
habib-paper/README.md
···4455The original dataset is not stored here because it is tens of gigabytes in size. It can be downloaded from https://zenodo.org/records/15733582
6677-`transform.py` is a simple script that transforms any of the datafiles into a 'worldwide' form (*not* what was done in the original paper) so that they can directly be consumed by `centralization-stats.py` in the parent directory
77+`transform.py` is a simple script that transforms any of the datafiles into a 'worldwide' form (*not* what was done in the original paper) so that they can directly be consumed by `centralization_stats.py` in the parent directory
8899`original-hosting.csv` contains the data for hosting centralization extracted from Table F of the paper (Table 5)
10101111-