···2626from tz_common import find_dst_transitions as _find_dst_transitions
2727from bs4 import BeautifulSoup # type: ignore
2828import airportsdata
2929+from timezonefinder import TimezoneFinder
2930import pandas as pd
30313132# ---------------------------------------------------------------------------
···117118 """
118119 year = datetime.now(timezone.utc).year
119120 airport_db = airportsdata.load("IATA")
121121+120122 # Ensure unique HTML airport entries by IATA code
121123 seen_iatas: set[str] = set()
122124 unique_airports: List[Tuple[str, str]] = []
···130132 if 'iata' in df_all.columns:
131133 df_all = df_all.drop(columns=['iata'])
132134 df_all = df_all.reset_index().rename(columns={'index': 'iata'})
135135+ # Recompute tz field from lat/lon to correct misclassified zones _before_ we
136136+ # derive any offset‑related columns (important for DUT / America/Adak etc.)
137137+ tf = TimezoneFinder()
138138+ df_all['tz'] = df_all.apply(
139139+ lambda row: tf.timezone_at(lat=row.get('lat'), lng=row.get('lon')) or row.get('tz'),
140140+ axis=1,
141141+ )
142142+133143 # Merge OurAirports classification
134144 try:
135145 oa = pd.read_csv("https://ourairports.com/data/airports.csv", usecols=["iata_code","type","scheduled_service"]) # type: ignore
···143153 traffic_dict = traffic_counts.to_dict()
144154 # Map route hits using apply to ensure a Series
145155 df_all['route_hits'] = df_all['iata'].apply(lambda x: traffic_dict.get(x, 0)).astype(int)
146146- # Compute standard offset seconds for each record
156156+ # Compute standard offset seconds for each record (now that tz is fixed)
147157 df_all['std_offset_s'] = df_all['tz'].apply(lambda tz: _find_dst_transitions(tz, year)[0])
148158149159 # Fallback selector for a given std_offset
···192202 std_s, dst_s, start_ts, end_ts = _find_dst_transitions(tz_name, year)
193203 key = (std_s, dst_s, start_ts, end_ts)
194204 if key not in full_buckets:
195195- full_buckets[key] = { 'std': std_s, 'dst': dst_s, 'start': start_ts, 'end': end_ts }
205205+ full_buckets[key] = {
206206+ 'std': std_s,
207207+ 'dst': dst_s,
208208+ 'start': start_ts,
209209+ 'end': end_ts,
210210+ 'tz_names': [tz_name],
211211+ }
196212 group_keys.setdefault(std_s, []).append(key)
213213+ else:
214214+ full_buckets[key]['tz_names'].append(tz_name)
197215198216 # 2) Collect codes from HTML for each std_offset (popular timezones)
199217 group_codes: Dict[int, List[str]] = {}
···216234 if not group_codes.get(std_s):
217235 group_codes[std_s] = _fallback_codes(std_s)
218236219219- # 4) Assign popular codes to their actual DST buckets, then fallback for empty
237237+ # 4) Assign popular & fallback codes to their actual DST buckets
220238 # initialize codes list for each bucket
221221- for key, meta in full_buckets.items():
239239+ for bucket_key, meta in full_buckets.items():
222240 meta['codes'] = []
223223- # populate HTML-based codes into their real tz variant buckets
241241+ used_codes: set[str] = set()
def _assign_to_bucket(iata_code: str):
    """Place *iata_code* into the bucket matching its airport's DST signature.

    The airport record is looked up in ``airport_db``; its ``tz`` value is
    turned into a ``(std, dst, start, end)`` key via
    ``_find_dst_transitions`` for the current ``year``, and the code is
    appended to ``full_buckets[key]['codes']`` when such a bucket exists.

    Returns ``True`` when the code was appended (and recorded in
    ``used_codes``); ``False`` when the code was already used, has no
    record or no ``tz`` in ``airport_db``, or maps to a key that is not
    present in ``full_buckets``.
    """
    # A code may only ever be placed once across all buckets.
    if iata_code in used_codes:
        return False

    record = airport_db.get(iata_code)
    if not record or not record.get('tz'):
        return False

    # The bucket is chosen from the airport's own timezone, not from the
    # bucket the caller happens to be iterating over.
    std_off, dst_off, begins, ends = _find_dst_transitions(record['tz'], year)
    signature = (std_off, dst_off, begins, ends)
    if signature not in full_buckets:
        return False

    full_buckets[signature]['codes'].append(iata_code)
    used_codes.add(iata_code)
    return True
255255+224256 for std_s, codes in group_codes.items():
225257 for iata in codes:
226226- rec = airport_db.get(iata)
227227- if rec and rec.get('tz'):
228228- std2, dst2, st2, ed2 = _find_dst_transitions(rec['tz'], year)
229229- bucket_key = (std2, dst2, st2, ed2)
230230- if bucket_key in full_buckets:
231231- full_buckets[bucket_key]['codes'].append(iata)
258258+ _assign_to_bucket(iata)
259259+232260 # fallback for buckets still empty: only populate the first empty bucket per std-offset
233261 for std_s, keys in group_keys.items():
234234- # track HTML-derived codes for this std-offset
235262 assigned = set(group_codes.get(std_s, []))
236236- # prepare a one-time fallback candidate list, filtered of already assigned
237237- fallback_candidates = [c for c in _fallback_codes(std_s) if c not in assigned]
238238- fallback_used = False
263263+ fallback_candidates = [c for c in group_codes.get(std_s, []) if c not in used_codes]
264264+ # track if we've used fallback for this std-offset
265265+ used = False
239266 for bucket_key in keys:
240267 codes_list = full_buckets[bucket_key].get('codes', [])
241241- if not codes_list and not fallback_used and fallback_candidates:
242242- # assign up to max_bucket fallback codes to first empty bucket
243243- if max_bucket > 0:
244244- codes_list = fallback_candidates[:max_bucket]
268268+ if not codes_list and not used:
269269+ # pick first unassigned fallback candidates
270270+ fallback_pool = [c for c in group_codes.get(std_s, []) if c not in used_codes]
271271+ for candidate in fallback_pool:
272272+ if _assign_to_bucket(candidate):
273273+ codes_list = [candidate]
274274+ break
245275 else:
246246- codes_list = fallback_candidates[:]
247247- fallback_used = True
248248- # cap any list to max_bucket if needed
249249- if codes_list and max_bucket > 0:
250250- codes_list = codes_list[:max_bucket]
276276+ codes_list = []
277277+ used = True
251278 full_buckets[bucket_key]['codes'] = codes_list
252252- # record assigned codes so we don't reuse them (though fallback is one-shot)
253253- assigned.update(codes_list)
279279+280280+ # FINAL safety pass: if a bucket is still empty try to grab 1 airport that
281281+ # actually sits in *this* timezone (e.g. DUT for America/Adak). This never
282282+ # duplicates because we consult used_codes.
283283+ for bucket_key, meta in full_buckets.items():
284284+ if meta['codes']:
285285+ continue
286286+ tz_names = meta.get('tz_names', [])
287287+ if not tz_names:
288288+ continue
289289+ seg = df_all[df_all['tz'].isin(tz_names)].sort_values('route_hits', ascending=False)
290290+ for code in seg['iata']:
291291+ if _assign_to_bucket(code):
292292+ meta['codes'] = [code]
293293+ break
254294255295 # build ordered bucket list
256296 buckets_list = [
···263303264304 # 5) Build flat pool and offsets
265305 code_pool = []
306306+ seen_for_pool: set[str] = set()
266307 for b in buckets_list:
308308+ unique_codes = [c for c in b.get('codes', []) if c not in seen_for_pool]
267309 b['offset'] = len(code_pool)
268268- b['count'] = len(b.get('codes', []))
269269- code_pool.extend(b.get('codes', []))
310310+ b['count'] = len(unique_codes)
311311+ code_pool.extend(unique_codes)
312312+ seen_for_pool.update(unique_codes)
270313271314 # Build name pool parallel to code_pool
272315 name_pool = []