#!/usr/bin/env python3
"""Generate C timezone list for the Top-1000 airports.

1. Parse `top1000.html` (downloaded from GetToCenter) to extract IATA codes and
   airport names.
2. Use the `airportsdata` package to obtain the IANA timezone (`tz`) for each
   airport.
3. For each distinct set of (std_offset, dst_offset, dst_start, dst_end)
   belonging to that timezone (for the current year), build a bucket of airport
   IATA codes.
4. Emit a C source file (`src/c/airport_tz_list.c`) that mirrors the structure
   of `tz_list.c` already used by the Closest-Noon clock, but with **airport
   IATA codes** in the pooled name list instead of city names.

Usage:
    # Always parse HTML top1000.html, then fallback for missing offsets
    python generate_airport_tz_list.py --html top1000.html --out src/c/airport_tz_list.c --top 10 --max-bucket 3

Dependencies:
    pip install airportsdata beautifulsoup4 pandas requests

This script intentionally re-implements the DST-transition detection logic from
`generate_tz_list.py` so it can remain self-contained.
"""
2525+from __future__ import annotations
2626+2727+import argparse
2828+import sys
2929+from pathlib import Path
3030+from datetime import datetime, timedelta, timezone
3131+from typing import Dict, List, Tuple
3232+from functools import lru_cache
3333+3434+import zoneinfo # stdlib >=3.9
3535+from bs4 import BeautifulSoup # type: ignore
3636+import airportsdata # pip install airportsdata
3737+import pandas as pd # pip install pandas pyarrow
3838+import requests
3939+import io
4040+import gzip
4141+4242+# ---------------------------------------------------------------------------
4343+# Helper functions (copied & trimmed from generate_tz_list.py)
4444+# ---------------------------------------------------------------------------
4545+4646+def _get_tz_details(tz_name: str, dt_utc: datetime) -> Tuple[int, timedelta] | None:
4747+ """Return (total_offset_seconds, dst_component) or None if tz is invalid."""
4848+ try:
4949+ tz = zoneinfo.ZoneInfo(tz_name)
5050+ off = tz.utcoffset(dt_utc)
5151+ dst = tz.dst(dt_utc) or timedelta(0)
5252+ if off is not None:
5353+ return int(off.total_seconds()), dst
5454+ except Exception:
5555+ pass
5656+ return None
5757+5858+@lru_cache(maxsize=None)
5959+def _find_dst_transitions(tz_name: str, year: int) -> Tuple[int, int, int, int]:
6060+ """Return (std_offset_s, dst_offset_s, dst_start_utc_ts, dst_end_utc_ts).
6161+6262+ If the zone does not observe DST, std == dst and the transition timestamps
6363+ are 0.
6464+ """
6565+ std_offset_sec = None
6666+ dst_offset_sec = None
6767+ start_ts = 0
6868+ end_ts = 0
6969+7070+ # Iterate hour by hour from [year-01-01 00:00-01h] through end of the year
7171+ current_dt = datetime(year, 1, 1, tzinfo=timezone.utc) - timedelta(hours=1)
7272+ initial = _get_tz_details(tz_name, current_dt)
7373+ if not initial:
7474+ return (0, 0, 0, 0)
7575+7676+ prev_off, prev_dst = initial
7777+ total_hours = (366 * 24) + 3 # cover leap + buffer
7878+7979+ for _ in range(total_hours):
8080+ current_dt += timedelta(hours=1)
8181+ details = _get_tz_details(tz_name, current_dt)
8282+ if not details:
8383+ continue
8484+ cur_off, cur_dst = details
8585+8686+ # Track seen std/dst offsets
8787+ if cur_dst == timedelta(0):
8888+ std_offset_sec = cur_off
8989+ else:
9090+ dst_offset_sec = cur_off
9191+9292+ # Detect transition when DST component toggles
9393+ if cur_dst != prev_dst:
9494+ ts = int(current_dt.timestamp())
9595+ if current_dt.year == year:
9696+ if prev_dst == timedelta(0) and cur_dst > timedelta(0):
9797+ start_ts = ts
9898+ elif prev_dst > timedelta(0) and cur_dst == timedelta(0):
9999+ end_ts = ts
100100+ prev_off, prev_dst = cur_off, cur_dst
101101+102102+ if std_offset_sec is None:
103103+ std_offset_sec = prev_off
104104+ if dst_offset_sec is None:
105105+ dst_offset_sec = std_offset_sec
106106+107107+ # If offsets differ by <1 min, treat as no DST.
108108+ if abs(std_offset_sec - dst_offset_sec) < 60:
109109+ start_ts = 0
110110+ end_ts = 0
111111+ dst_offset_sec = std_offset_sec
112112+113113+ return (std_offset_sec, dst_offset_sec, start_ts, end_ts)
# ---------------------------------------------------------------------------
# Build ranked list of airports with route counts (fallback if HTML omitted)
# ---------------------------------------------------------------------------

def _download_routes_csv() -> pd.DataFrame:
    """Fetch routes.dat from the OpenFlights repo and return a DataFrame.

    Only the source (column 2) and destination (column 4) IATA codes are
    loaded; everything else in routes.dat is ignored.
    """
    routes_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat"
    return pd.read_csv(routes_url, header=None, usecols=[2, 4], names=["src", "dst"], dtype=str)
def _rank_airports_by_routes(airport_df: pd.DataFrame) -> pd.Series:
    """Return a Series indexed by IATA code with descending route hit counts.

    NOTE(review): *airport_df* is accepted for interface compatibility but the
    ranking is derived solely from the downloaded OpenFlights route table.
    """
    route_df = _download_routes_csv()
    endpoints = pd.concat([route_df["src"], route_df["dst"]])
    return endpoints.value_counts()
def build_topN_per_timezone(top_n: int) -> List[Tuple[str, str]]:
    """Return a balanced list covering all timezones with up to top_n airports each.

    Airports are ranked by route_hits (descending). Airports lacking route
    data count as zero hits but may still be selected so that otherwise-empty
    timezones get coverage.
    """
    airports = airportsdata.load("IATA")
    # Flatten the airport dict into a DataFrame keyed by IATA code; drop any
    # record-level 'iata' column first so the promoted index is unambiguous.
    frame = pd.DataFrame.from_dict(airports, orient="index")
    if 'iata' in frame.columns:
        frame = frame.drop(columns=['iata'])
    frame = frame.reset_index().rename(columns={'index': 'iata'})

    # Attach traffic counts; airports missing from the route table get 0.
    hit_counts = _rank_airports_by_routes(frame)
    frame["route_hits"] = frame["iata"].map(hit_counts).fillna(0).astype(int)

    # Busiest airports first so each timezone bucket fills with top traffic.
    frame = frame.sort_values("route_hits", ascending=False, ignore_index=True)

    per_tz: Dict[str, List[Tuple[str, str]]] = {}
    for _, rec in frame.iterrows():
        zone = rec["tz"]
        if not isinstance(zone, str) or zone == "":
            continue
        bucket = per_tz.setdefault(zone, [])
        if len(bucket) < top_n:
            bucket.append((rec["iata"], rec["name"]))

    # Flatten the per-timezone buckets into a single list.
    return [pair for bucket in per_tz.values() for pair in bucket]
# ---------------------------------------------------------------------------
# Parsing HTML (optional) ---------------------------------------------------
# ---------------------------------------------------------------------------

def _parse_top1000(html_path: Path) -> List[Tuple[str, str]]:
    """Return list of (IATA, Airport Name) found in the HTML table."""
    markup = html_path.read_text(encoding="utf-8")
    soup = BeautifulSoup(markup, "html.parser")
    parsed: List[Tuple[str, str]] = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue
        code = cells[2].get_text(strip=True).upper()
        # Real rows carry a 3-letter IATA code; skip ad rows, spacers, etc.
        if len(code) != 3:
            continue
        # The airport name usually sits in the 2nd <td>, sometimes in an <h2>.
        parsed.append((code, cells[1].get_text(" ", strip=True)))
    return parsed
# ---------------------------------------------------------------------------
# Main C-code generation routine
# ---------------------------------------------------------------------------

def generate_c_code(airports_list: List[Tuple[str, str]], out_path: Path, group_size: int = 0, max_bucket: int = 0) -> None:
    """Generate airport_tz_list.c:
    1) Build full buckets for every IATA tz variant (std, dst, transitions).
    2) Pick top group_size codes per std-offset from HTML list.
    3) Fallback for missing offsets (min 1, max max_bucket) using classification + traffic.
    4) Distribute codes evenly across DST buckets, cap each to max_bucket.

    Args:
        airports_list: (IATA, airport name) pairs, e.g. parsed from top1000.html.
        out_path: destination path for the generated C source file.
        group_size: cap on HTML-derived codes per standard offset (0 = uncapped).
        max_bucket: hard cap of codes per DST bucket (0 = uncapped).
    """
    # FIX: datetime.utcnow() is naive and deprecated since Python 3.12; use an
    # aware UTC timestamp to derive the current year instead.
    year = datetime.now(timezone.utc).year
    airport_db = airportsdata.load("IATA")

    def _c_escape(text: str) -> str:
        """Escape backslashes and double quotes so *text* is a valid C string literal."""
        return text.replace("\\", "\\\\").replace('"', '\\"')

    # Ensure unique HTML airport entries by IATA code (keep first occurrence).
    seen_iatas: set[str] = set()
    unique_airports: List[Tuple[str, str]] = []
    for iata, name in airports_list:
        if iata not in seen_iatas:
            unique_airports.append((iata, name))
            seen_iatas.add(iata)
    airports_list = unique_airports

    # Build fallback DataFrame with classification and traffic for missing offsets.
    df_all = pd.DataFrame.from_dict(airport_db, orient="index")
    if 'iata' in df_all.columns:
        df_all = df_all.drop(columns=['iata'])
    df_all = df_all.reset_index().rename(columns={'index': 'iata'})

    # Merge OurAirports classification; best-effort, since it needs network
    # access. On failure the classification columns are simply empty.
    try:
        oa = pd.read_csv("https://ourairports.com/data/airports.csv", usecols=["iata_code","type","scheduled_service"])  # type: ignore
        oa = oa.rename(columns={"iata_code": "iata"}).dropna(subset=["iata"])
        df_all = df_all.merge(oa[['iata','type','scheduled_service']], on='iata', how='left')
    except Exception:
        df_all['type'] = None
        df_all['scheduled_service'] = None

    # Add route hit counts.
    traffic_counts = _rank_airports_by_routes(df_all)
    traffic_dict = traffic_counts.to_dict()
    # Map route hits using apply to ensure a Series.
    df_all['route_hits'] = df_all['iata'].apply(lambda x: traffic_dict.get(x, 0)).astype(int)
    # Compute standard offset seconds for each record.
    df_all['std_offset_s'] = df_all['tz'].apply(lambda tz: _find_dst_transitions(tz, year)[0])

    def _fallback_codes(std_s: int) -> List[str]:
        """Fallback hierarchy per std offset:
        1) up to max_bucket (or 3) large/international,
        2) up to 2 medium/regional,
        3) up to 1 small_airport,
        4) fill any remaining to reach at least 1, max_bucket total."""
        seg = df_all[df_all['std_offset_s'] == std_s]
        if seg.empty:
            return []
        seg_sorted = seg.sort_values('route_hits', ascending=False)
        result: List[str] = []
        # 1) large_international
        large = seg_sorted[(seg_sorted['type'] == 'large_airport') & (seg_sorted['scheduled_service'] == 'yes')]
        if not large.empty:
            cap = max_bucket if max_bucket > 0 else 3
            result = large['iata'].head(cap).tolist()
        # 2) medium_regional
        remain = (max_bucket - len(result)) if max_bucket > 0 else (3 - len(result))
        if remain > 0:
            medium = seg_sorted[(seg_sorted['type'] == 'medium_airport') & (seg_sorted['scheduled_service'] == 'yes')]
            if not medium.empty:
                mcap = min(remain, 2)
                result.extend(medium['iata'].head(mcap).tolist())
                remain = (max_bucket - len(result)) if max_bucket > 0 else (3 - len(result))
        # 3) small_airport
        if remain > 0:
            small = seg_sorted[(seg_sorted['type'] == 'small_airport') & (seg_sorted['scheduled_service'] == 'yes')]
            if not small.empty:
                result.extend(small['iata'].head(1).tolist())
                remain = (max_bucket - len(result)) if max_bucket > 0 else (3 - len(result))
        # 4) any to ensure at least one
        if not result:
            result = [seg_sorted['iata'].iloc[0]]
        # enforce max_bucket hard limit
        if max_bucket > 0 and len(result) > max_bucket:
            result = result[:max_bucket]
        return result

    # 1) Build full buckets from all tz names in df_all; also group the bucket
    #    keys per standard offset so fallbacks can find sibling buckets.
    full_buckets: Dict[Tuple[int,int,int,int], Dict[str, object]] = {}
    group_keys: Dict[int, List[Tuple[int,int,int,int]]] = {}
    for tz_name in df_all['tz'].dropna().unique():
        std_s, dst_s, start_ts, end_ts = _find_dst_transitions(tz_name, year)
        key = (std_s, dst_s, start_ts, end_ts)
        if key not in full_buckets:
            full_buckets[key] = { 'std': std_s, 'dst': dst_s, 'start': start_ts, 'end': end_ts }
            group_keys.setdefault(std_s, []).append(key)

    # 2) Collect codes from HTML for each std_offset (popular timezones).
    group_codes: Dict[int, List[str]] = {}
    for iata, _ in airports_list:
        rec = airport_db.get(iata)
        if not rec or not rec.get('tz'):
            continue
        std_s = _find_dst_transitions(rec['tz'], year)[0]
        codes = group_codes.setdefault(std_s, [])
        if iata not in codes:
            codes.append(iata)
    # Trim HTML-based codes to group_size for popular timezones.
    if group_size > 0:
        for std_s, codes in list(group_codes.items()):
            group_codes[std_s] = codes[:group_size]

    # 3) Fallback for offsets lacking HTML codes (unpopular timezones):
    #    ensure at least one code per std_offset.
    for std_s, keys in group_keys.items():
        if not group_codes.get(std_s):
            group_codes[std_s] = _fallback_codes(std_s)

    # 4) Assign popular codes to their actual DST buckets, then fallback for empty.
    for key, meta in full_buckets.items():
        meta['codes'] = []
    # Populate HTML-based codes into their real tz variant buckets.
    for std_s, codes in group_codes.items():
        for iata in codes:
            rec = airport_db.get(iata)
            if rec and rec.get('tz'):
                std2, dst2, st2, ed2 = _find_dst_transitions(rec['tz'], year)
                bucket_key = (std2, dst2, st2, ed2)
                if bucket_key in full_buckets:
                    full_buckets[bucket_key]['codes'].append(iata)
    # Fallback for buckets still empty: only populate the first empty bucket
    # per std-offset (one-shot), never reusing already-assigned codes.
    for std_s, keys in group_keys.items():
        assigned = set(group_codes.get(std_s, []))
        fallback_candidates = [c for c in _fallback_codes(std_s) if c not in assigned]
        fallback_used = False
        for bucket_key in keys:
            codes_list = full_buckets[bucket_key].get('codes', [])
            if not codes_list and not fallback_used and fallback_candidates:
                # Assign up to max_bucket fallback codes to the first empty bucket.
                if max_bucket > 0:
                    codes_list = fallback_candidates[:max_bucket]
                else:
                    codes_list = fallback_candidates[:]
                fallback_used = True
            # Cap any list to max_bucket if needed.
            if codes_list and max_bucket > 0:
                codes_list = codes_list[:max_bucket]
            full_buckets[bucket_key]['codes'] = codes_list
            assigned.update(codes_list)

    # Build ordered bucket list (std offset, then dst offset, then start ts).
    buckets_list = [
        full_buckets[k]
        for k in sorted(
            full_buckets.keys(),
            key=lambda k: (full_buckets[k]['std'], full_buckets[k]['dst'], full_buckets[k]['start'])
        )
    ]

    # 5) Build flat pool and per-bucket offsets into it.
    code_pool = []
    for b in buckets_list:
        b['offset'] = len(code_pool)
        b['count'] = len(b.get('codes', []))
        code_pool.extend(b.get('codes', []))

    # Build name pool parallel to code_pool.
    name_pool = []
    for code in code_pool:
        rec = airport_db.get(code)
        if rec and rec.get('name'):
            name = rec['name']
        else:
            name = code
        # Remove ' International Airport' or ' Airport' from the end.
        if name.endswith(' International Airport'):
            name = name[:-len(' International Airport')]
        elif name.endswith(' Airport'):
            name = name[:-len(' Airport')]
        name = name.rstrip()
        name_pool.append(name)

    # Emit C file.
    with out_path.open("w", encoding="utf-8") as f:
        f.write("// Auto-generated by generate_airport_tz_list.py\n")
        f.write(f"// Year-specific DST data for {year}\n\n")
        f.write("#include <stdint.h>\n\n")
        # Code pool. FIX: escape quotes/backslashes so the output always
        # compiles even if a code or name contains a C-special character.
        f.write("static const char* airport_code_pool[] = {\n")
        for code in code_pool:
            f.write(f"    \"{_c_escape(code)}\",\n")
        f.write("};\n\n")

        # Name pool (parallel to the code pool).
        f.write("static const char* airport_name_pool[] = {\n")
        for name in name_pool:
            f.write(f"    \"{_c_escape(name)}\",\n")
        f.write("};\n\n")

        # Struct matches TzInfo definition used by the clock firmware.
        f.write("typedef struct {\n")
        f.write("    float std_offset_hours;\n")
        f.write("    float dst_offset_hours;\n")
        f.write("    int64_t dst_start_utc;\n")
        f.write("    int64_t dst_end_utc;\n")
        f.write("    int name_offset;\n")
        f.write("    int name_count;\n")
        f.write("} TzInfo;\n\n")

        f.write("static const TzInfo airport_tz_list[] = {\n")
        for bucket in buckets_list:
            std_h = bucket["std"] / 3600.0
            dst_h = bucket["dst"] / 3600.0
            start = bucket["start"]
            end = bucket["end"]
            off = bucket["offset"]
            cnt = bucket["count"]
            f.write(f"    {{ {std_h:.2f}f, {dst_h:.2f}f, {start}LL, {end}LL, {off}, {cnt} }},\n")
        f.write("};\n\n")
        f.write("#define AIRPORT_TZ_LIST_COUNT (sizeof(airport_tz_list)/sizeof(airport_tz_list[0]))\n")
        f.write("#define AIRPORT_CODE_POOL_COUNT (sizeof(airport_code_pool)/sizeof(airport_code_pool[0]))\n")
        f.write("#define AIRPORT_NAME_POOL_COUNT (sizeof(airport_name_pool)/sizeof(airport_name_pool[0]))\n")

    print(
        f"Generated {out_path} with {len(buckets_list)} tz buckets and {len(code_pool)} airports."
    )
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main(argv: List[str] | None = None) -> None:
    """Command-line entry point: parse args, read the HTML list, emit the C file."""
    parser = argparse.ArgumentParser(
        description="Generate airport_tz_list.c: hybrid grouping by standard offset, split DST buckets, fallback for missing offsets"
    )
    parser.add_argument(
        "--html",
        type=Path,
        default=Path("top1000.html"),
        help="Path to GetToCenter HTML file (top1000.html)",
    )
    parser.add_argument(
        "--out",
        type=Path,
        default=Path("src/c/airport_tz_list.c"),
        help="C output file path",
    )
    parser.add_argument(
        "--top",
        type=int,
        default=10,
        help="Number of airports to pick per standard offset group before splitting across DST buckets",
    )
    parser.add_argument(
        "--max-bucket",
        type=int,
        default=3,
        help="Maximum number of airport codes to include per DST bucket (default: 3)",
    )
    args = parser.parse_args(argv)

    # The HTML source is mandatory; bail out early if it is missing or empty.
    if not args.html.exists():
        print(f"ERROR: HTML file not found: {args.html}", file=sys.stderr)
        sys.exit(1)
    airport_entries = _parse_top1000(args.html)
    if not airport_entries:
        print(f"ERROR: No airports found in HTML: {args.html}", file=sys.stderr)
        sys.exit(1)

    # group_size = top N per std-offset, max_bucket = cap per DST bucket
    generate_c_code(airport_entries, args.out, group_size=args.top, max_bucket=args.max_bucket)


if __name__ == "__main__":
    main()
+2-1
run.sh
# Function to run the timezone generation script
generate_tz_list() {
    echo "Generating timezone list..."
    # Legacy generator, superseded by the airport-based list:
    # python generate_tz_list.py
    uv run python generate_airport_tz_list.py --top 10 --max-bucket 10 --out src/c/airport_tz_list.c
}

# Function to build the project