Code and data for arewedecentralizedyet.online and related projects
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Prepare centralization stats to be used as a library

Unfortunately, this requires renaming it

+172 -164
+1 -2
README.md
··· 5 5 This is used to feed https://arewedecentralizedyet.online/ . 6 6 7 7 * `BIndex.md`: A proposal for a way of measuring blockability on distributed social networks 8 - * `centralization-stats.py <file.csv>`: Computes [Herfindahl–Hirschman index](https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index) and other statistics. Pass `--json` to get machine-readable output 8 + * `centralization_stats.py <file.csv>`: Computes [Herfindahl–Hirschman index](https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index) and other statistics. Pass `--json` to get machine-readable output 9 9 * `data-fetchers/` contains various scripts to grab statistics from a number of sources 10 10 * `data-static/` contains static versions of the data - some are fetched with scripts from `data-fetchers/`, others are one-time dumps from sources such as academic papers 11 11 * `habib-paper/` code and data specifically related to the Habib et al. paper ··· 15 15 16 16 TODO: 17 17 * Add matrix, see https://codeberg.org/ricci/are-we-decentralized-yet/issues/2 18 -
-158
centralization-stats.py
#!/usr/bin/env python3
"""Compute centralization statistics from a CSV of per-server counts.

Usage: centralization-stats.py [--json] <file.csv>
"""

import argparse
import csv
import json
import math
import sys


# https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index
def calc_hhi(x):
    """Return the Herfindahl-Hirschman index (sum of squared shares) of x."""
    total = sum(x)
    return sum((a / total) ** 2 for a in x)


# https://www.statology.org/shannon-diversity-index/
def calc_shannon(x):
    """Return the Shannon diversity index (natural log) of the counts in x.

    Zero counts are skipped: p*ln(p) tends to 0 as p tends to 0, and
    math.log(0) would otherwise raise a math domain error.
    """
    total = sum(x)
    return -sum((a / total) * math.log(a / total) for a in x if a)


# https://statologos.com/indice-de-diversidad-de-los-simpson/
def calc_simpson(x):
    """Return the Simpson diversity index 1 - sum(a*(a-1)) / (N*(N-1)).

    When N*(N-1) is zero (no users, or exactly one) no pair of users can be
    drawn, so the index is reported as 0.0 instead of dividing by zero.
    """
    total = sum(x)
    pairs = total * (total - 1)
    if pairs == 0:
        return 0.0
    return 1 - sum(a * (a - 1) for a in x) / pairs


def calc_B(x, n):
    """Return how many leading entries of x are needed to reach n% of sum(x).

    x is expected to be sorted in descending order (main() sorts it that
    way before calling).

    Raises:
        ValueError: if n is greater than 100.
    """
    if n > 100:
        raise ValueError(f"percentage must be <= 100, got {n}")
    # Compare running counts against n% of the total without dividing, so
    # that float rounding of the individual shares cannot miss the threshold.
    target = n * sum(x)
    running = 0
    # Every entry is visited, including the last one: the threshold may only
    # be reached once the final server is counted.
    for servers, count in enumerate(x, start=1):
        running += count
        if running * 100 >= target:
            return servers
    return len(x)


COMBINE_HOSTS = [[
    "mastodon.social", "mastodon.online"
]]
COMBINE_SUFFIXES = [".host.bsky.network"]


def combine_key(row):
    """Return the key under which row is grouped by combine_rows().

    Hosts listed together in COMBINE_HOSTS share the first name of their
    list; domains ending in one of COMBINE_SUFFIXES share that suffix.
    Returns None when the row has no domain.
    """
    domain = get_domain(row)
    if domain is None:
        return None
    for hlist in COMBINE_HOSTS:
        if domain in hlist:
            return hlist[0]
    for suffix in COMBINE_SUFFIXES:
        if domain.endswith(suffix):
            return suffix
    return domain


# Software known to misreport user accounts
SKIPPED_SOFTWARE = ["nodebb", "gotosocial", "yellbot", "misskey", "sharkey"]


def f_software(row):
    """Return False when the row's software matches SKIPPED_SOFTWARE.

    Rows without a "software" value (missing, None or empty) are kept.
    """
    software = (row.get("software") or "").lower()
    return not any(s in software for s in SKIPPED_SOFTWARE)


def f_count(row):
    """Return True when the row reports a user count greater than zero."""
    return get_usercount(row) > 0


def normalize_keys(row):
    """Return a copy of row with every column name lower-cased.

    csv.DictReader stores surplus fields of an over-long line under the key
    None; that entry is dropped because it has no column name.
    """
    return {k.lower(): v for k, v in row.items() if k is not None}


def extract_domain_counts(row):
    """Reduce row to a {"domain": ..., "count": ...} dict."""
    return {"domain": get_domain(row), "count": get_usercount(row)}


# Different CSVs use different names for the user count field
_USERCOUNT_KEYS = (
    "user_count", "mau", "monthly_active_users", "active_month",
    "active_users", "accountcount", "origins", "count", "nb_hostnames",
    "domains_of_provider",
)


def get_usercount(row):
    """Return the user count from the first non-empty count column of row.

    Values parse as int when possible and as float otherwise; values that
    are already numbers are returned unchanged. Returns 0 when no count
    column holds a value.

    Raises:
        ValueError: if the chosen value is not numeric.
    """
    for key in _USERCOUNT_KEYS:
        val = row.get(key, "")
        # csv.DictReader fills short lines with None; treat it like "".
        if val is None or val == "":
            continue
        if isinstance(val, (int, float)):
            return val
        try:
            return int(val)
        except ValueError:
            return float(val)
    return 0


# Different CSVs use different columns for the hostname
_DOMAIN_KEYS = (
    "domain", "hostname", "instance", "name", "org_id", "e.id", "o.name",
    "a.asn", "asn", "provider",
)


def get_domain(row):
    """Return the value of the first hostname-like column present in row.

    Returns None when row has none of the known columns.
    """
    for key in _DOMAIN_KEYS:
        if key in row:
            return row[key]
    return None


def filter_rows(rows):
    """Normalize column names, then drop empty and misreporting servers."""
    normalized = (normalize_keys(r) for r in rows)
    return [r for r in normalized if f_count(r) and f_software(r)]


def combine_rows(rows):
    """Sum the "count" of rows that share a combine_key().

    Returns one {"domain": key, "count": total} dict per key, in order of
    first appearance.
    """
    totals = {}
    for row in rows:
        key = combine_key(row)
        totals[key] = totals.get(key, 0) + row["count"]
    return [{"domain": k, "count": c} for k, c in totals.items()]


def main(filename, json_out = False):
    """Read filename as CSV and print its centralization statistics.

    Prints one JSON object when json_out is true, otherwise a
    human-readable report.

    Raises:
        ValueError: if no row with a positive user count remains.
    """
    with open(filename, newline="") as f:
        rows = list(csv.DictReader(f))

    rows = filter_rows(rows)

    extracted = [extract_domain_counts(row) for row in rows]
    combined = combine_rows(extracted)

    user_counts = sorted([r["count"] for r in combined], reverse=True)
    if not user_counts:
        raise ValueError(f"{filename}: no rows with a positive user count")

    total_users = sum(user_counts)
    hhi = calc_hhi(user_counts)
    shannon = calc_shannon(user_counts)
    simpson = calc_simpson(user_counts)
    bs = [(b, calc_B(user_counts, b)) for b in [25, 50, 75, 90, 99, 99.5]]
    servers = len(user_counts)
    biggest_abs = user_counts[0]
    biggest_pct = 100 * biggest_abs / total_users
    rest_abs = sum(user_counts[1:])
    rest_pct = 100 * rest_abs / total_users

    if json_out:
        print(json.dumps({"HHI": int(hhi*10000),
                          "shannon": round(shannon, 4),
                          "simpson": round(simpson, 4),
                          "servers": servers,
                          "biggest_abs": biggest_abs,
                          "biggest_pct": round(biggest_pct, 2),
                          "rest_abs": rest_abs,
                          "rest_pct": round(rest_pct, 2),
                          "b_vals": bs}))
    else:
        print(f"HHI for user_count: {hhi:.4f}")
        print(f"Shannon Diversity for user_count: {shannon:.4f}")
        print(f"Simpson Diversity for user_count: {simpson:.4f}")
        print(f"Total servers: {servers}")
        print(f"Biggest server: {biggest_abs} ({biggest_pct:.2f}%)")
        print(f"Rest of the servers: {rest_abs} ({rest_pct:.2f}%)")
        print(f"Total users: {biggest_abs + rest_abs}")
        print(f"B values are {bs}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog=sys.argv[0],
        description='Calculates statistics for social networks')
    parser.add_argument('csvfile')
    parser.add_argument('--json', action='store_true')

    args = parser.parse_args()
    main(args.csvfile, args.json)
+168
centralization_stats.py
#!/usr/bin/env python3
"""Compute centralization statistics from per-server user counts.

Run as a script (``centralization_stats.py [--json] <file.csv>``) or import
it and call ``stats_from_rows`` / ``stats_from_csv``.
"""

import argparse
import csv
import json
import math
import sys


# https://en.wikipedia.org/wiki/Herfindahl%E2%80%93Hirschman_index
def calc_hhi(x):
    """Return the Herfindahl-Hirschman index (sum of squared shares) of x."""
    total = sum(x)
    return sum((a / total) ** 2 for a in x)


# https://www.statology.org/shannon-diversity-index/
def calc_shannon(x):
    """Return the Shannon diversity index (natural log) of the counts in x.

    Zero counts are skipped: p*ln(p) tends to 0 as p tends to 0, and
    math.log(0) would otherwise raise a math domain error.
    """
    total = sum(x)
    return -sum((a / total) * math.log(a / total) for a in x if a)


# https://statologos.com/indice-de-diversidad-de-los-simpson/
def calc_simpson(x):
    """Return the Simpson diversity index 1 - sum(a*(a-1)) / (N*(N-1)).

    When N*(N-1) is zero (no users, or exactly one) no pair of users can be
    drawn, so the index is reported as 0.0 instead of dividing by zero.
    """
    total = sum(x)
    pairs = total * (total - 1)
    if pairs == 0:
        return 0.0
    return 1 - sum(a * (a - 1) for a in x) / pairs


def calc_B(x, n):
    """Return how many leading entries of x are needed to reach n% of sum(x).

    x is expected to be sorted in descending order (stats_from_rows() sorts
    it that way before calling).

    Raises:
        ValueError: if n is greater than 100.
    """
    if n > 100:
        raise ValueError(f"percentage must be <= 100, got {n}")
    # Compare running counts against n% of the total without dividing, so
    # that float rounding of the individual shares cannot miss the threshold.
    target = n * sum(x)
    running = 0
    # Every entry is visited, including the last one: the threshold may only
    # be reached once the final server is counted.
    for servers, count in enumerate(x, start=1):
        running += count
        if running * 100 >= target:
            return servers
    return len(x)


COMBINE_HOSTS = [[
    "mastodon.social", "mastodon.online"
]]
COMBINE_SUFFIXES = [".host.bsky.network"]


def combine_key(row):
    """Return the key under which row is grouped by combine_rows().

    Hosts listed together in COMBINE_HOSTS share the first name of their
    list; domains ending in one of COMBINE_SUFFIXES share that suffix.
    Returns None when the row has no domain.
    """
    domain = get_domain(row)
    if domain is None:
        return None
    for hlist in COMBINE_HOSTS:
        if domain in hlist:
            return hlist[0]
    for suffix in COMBINE_SUFFIXES:
        if domain.endswith(suffix):
            return suffix
    return domain


# Software known to misreport user accounts
SKIPPED_SOFTWARE = ["nodebb", "gotosocial", "yellbot", "misskey", "sharkey"]


def f_software(row):
    """Return False when the row's software matches SKIPPED_SOFTWARE.

    Rows without a "software" value (missing, None or empty) are kept.
    """
    software = (row.get("software") or "").lower()
    return not any(s in software for s in SKIPPED_SOFTWARE)


def f_count(row):
    """Return True when the row reports a user count greater than zero."""
    return get_usercount(row) > 0


def normalize_keys(row):
    """Return a copy of row with every column name lower-cased.

    csv.DictReader stores surplus fields of an over-long line under the key
    None; that entry is dropped because it has no column name.
    """
    return {k.lower(): v for k, v in row.items() if k is not None}


def extract_domain_counts(row):
    """Reduce row to a {"domain": ..., "count": ...} dict."""
    return {"domain": get_domain(row), "count": get_usercount(row)}


# Different CSVs use different names for the user count field
_USERCOUNT_KEYS = (
    "user_count", "mau", "monthly_active_users", "active_month",
    "active_users", "accountcount", "origins", "count", "nb_hostnames",
    "domains_of_provider",
)


def get_usercount(row):
    """Return the user count from the first non-empty count column of row.

    Values parse as int when possible and as float otherwise; values that
    are already numbers are returned unchanged. Returns 0 when no count
    column holds a value.

    Raises:
        ValueError: if the chosen value is not numeric.
    """
    for key in _USERCOUNT_KEYS:
        val = row.get(key, "")
        # csv.DictReader fills short lines with None; treat it like "".
        if val is None or val == "":
            continue
        if isinstance(val, (int, float)):
            return val
        try:
            return int(val)
        except ValueError:
            return float(val)
    return 0


# Different CSVs use different columns for the hostname
_DOMAIN_KEYS = (
    "domain", "hostname", "instance", "name", "org_id", "e.id", "o.name",
    "a.asn", "asn", "provider",
)


def get_domain(row):
    """Return the value of the first hostname-like column present in row.

    Returns None when row has none of the known columns.
    """
    for key in _DOMAIN_KEYS:
        if key in row:
            return row[key]
    return None


def filter_rows(rows):
    """Normalize column names, then drop empty and misreporting servers."""
    normalized = (normalize_keys(r) for r in rows)
    return [r for r in normalized if f_count(r) and f_software(r)]


def combine_rows(rows):
    """Sum the "count" of rows that share a combine_key().

    Returns one {"domain": key, "count": total} dict per key, in order of
    first appearance.
    """
    totals = {}
    for row in rows:
        key = combine_key(row)
        totals[key] = totals.get(key, 0) + row["count"]
    return [{"domain": k, "count": c} for k, c in totals.items()]


_B_THRESHOLDS = (25, 50, 75, 90, 99, 99.5)


def _raw_stats(rows):
    """Compute the unrounded statistics shared by the JSON and text output.

    Raises:
        ValueError: if no row with a positive user count remains.
    """
    rows = filter_rows(rows)
    extracted = [extract_domain_counts(row) for row in rows]
    combined = combine_rows(extracted)

    user_counts = sorted([r["count"] for r in combined], reverse=True)
    if not user_counts:
        raise ValueError("no rows with a positive user count")

    total_users = sum(user_counts)
    rest_abs = sum(user_counts[1:])
    return {
        "hhi": calc_hhi(user_counts),
        "shannon": calc_shannon(user_counts),
        "simpson": calc_simpson(user_counts),
        "servers": len(user_counts),
        "biggest_abs": user_counts[0],
        "biggest_pct": 100 * user_counts[0] / total_users,
        "rest_abs": rest_abs,
        "rest_pct": 100 * rest_abs / total_users,
        "b_vals": [(b, calc_B(user_counts, b)) for b in _B_THRESHOLDS],
    }


def _read_rows(filename):
    """Return every record of the CSV file filename as a list of dicts."""
    with open(filename, newline="") as f:
        return list(csv.DictReader(f))


def stats_from_rows(rows):
    """Return the centralization statistics for rows as a JSON-ready dict.

    rows is an iterable of dicts such as csv.DictReader yields. The result
    has the keys "HHI" (index times 10000, truncated to int), "shannon" and
    "simpson" (4 decimals), "servers", "biggest_abs", "biggest_pct",
    "rest_abs", "rest_pct" (percentages with 2 decimals) and "b_vals"
    (a list of (percentage, server count) pairs).

    Raises:
        ValueError: if no row with a positive user count remains.
    """
    raw = _raw_stats(rows)
    return {
        "HHI": int(raw["hhi"] * 10000),
        "shannon": round(raw["shannon"], 4),
        "simpson": round(raw["simpson"], 4),
        "servers": raw["servers"],
        "biggest_abs": raw["biggest_abs"],
        "biggest_pct": round(raw["biggest_pct"], 2),
        "rest_abs": raw["rest_abs"],
        "rest_pct": round(raw["rest_pct"], 2),
        "b_vals": raw["b_vals"],
    }


def stats_from_csv(filename):
    """Return stats_from_rows() for the records of the CSV file filename."""
    return stats_from_rows(_read_rows(filename))


def main(filename, json_out=False):
    """Print the statistics of the CSV file filename.

    Prints one JSON object when json_out is true, otherwise a
    human-readable report.
    """
    rows = _read_rows(filename)

    if json_out:
        print(json.dumps(stats_from_rows(rows)))
        return

    # The report formats the unrounded values: the "HHI" entry of
    # stats_from_rows() is truncated to an int, so dividing it back by 10000
    # would print e.g. 0.5555 for an index of 5/9 instead of 0.5556.
    raw = _raw_stats(rows)
    print(f"HHI for user_count: {raw['hhi']:.4f}")
    print(f"Shannon Diversity for user_count: {raw['shannon']:.4f}")
    print(f"Simpson Diversity for user_count: {raw['simpson']:.4f}")
    print(f"Total servers: {raw['servers']}")
    print(f"Biggest server: {raw['biggest_abs']} ({raw['biggest_pct']:.2f}%)")
    print(f"Rest of the servers: {raw['rest_abs']} ({raw['rest_pct']:.2f}%)")
    print(f"Total users: {raw['biggest_abs'] + raw['rest_abs']}")
    print(f"B values are {raw['b_vals']}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog=sys.argv[0],
        description='Calculates statistics for social networks')
    parser.add_argument('csvfile')
    parser.add_argument('--json', action='store_true')

    args = parser.parse_args()
    main(args.csvfile, args.json)
+1 -2
habib-paper/README.md
··· 4 4 5 5 The actual original dataset is not stored here, as it is 10s of GB. It can be downloaded from: https://zenodo.org/records/15733582 6 6 7 - `transform.py` is a simple script that transforms any of the datafiles into a 'worldwide' form (*not* what was done in the original paper) so that they can directly be consumed by `centralization-stats.py` in the parent directory 7 + `transform.py` is a simple script that transforms any of the datafiles into a 'worldwide' form (*not* what was done in the original paper) so that they can directly be consumed by `centralization_stats.py` in the parent directory 8 8 9 9 `original-hosting.csv` contains the data for hosting centralization extracted from Table F of the paper (Table 5) 10 10 11 -
+2 -2
helpers/diff-from-current.sh
··· 4 4 CSV=$2 5 5 CSV2=$3 6 6 7 - CUR=$(python3 ./centralization-stats.py --json $CSV | jq .shannon) 8 - PREV=$(python3 ./centralization-stats.py --json $CSV2 | jq .shannon) 7 + CUR=$(python3 ./centralization_stats.py --json $CSV | jq .shannon) 8 + PREV=$(python3 ./centralization_stats.py --json $CSV2 | jq .shannon) 9 9 10 10 DIFF=$(echo "$CUR - $PREV" | bc -l) 11 11