A fork of https://github.com/crosspoint-reader/crosspoint-reader
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at master 647 lines 22 kB view raw
#!/usr/bin/env python3
"""
Generate I18n C++ files from per-language YAML translations.

Reads YAML files from a translations directory (one file per language) and generates:
- I18nKeys.h:       Language enum, StrId enum, helper functions
- I18nStrings.h:    String array declarations
- I18nStrings.cpp:  String array definitions with all translations

Each YAML file must contain:
  _language_name: "Native Name"     (e.g. "Español")
  _language_code: "ENUM_NAME"       (e.g. "ES")
  STR_KEY: "translation text"

An optional ``_order: "N"`` entry controls the position of a language after
English (lower numbers first; files without it sort last, by filename).

The English file is the reference.  Missing keys in other languages are
automatically filled from English, and an INFO line is printed for each.

Usage:
    python gen_i18n.py <translations_dir> <output_dir>

Example:
    python gen_i18n.py lib/I18n/translations lib/I18n/
"""

import sys
import os
import re
from pathlib import Path
from typing import List, Dict, Tuple


# ---------------------------------------------------------------------------
# YAML file reading (simple key: "value" format, no PyYAML dependency)
# ---------------------------------------------------------------------------

# One translation line:  KEY: "value"  (KEY is an identifier, value is quoted).
_YAML_LINE_RE = re.compile(r'^([A-Za-z_][A-Za-z0-9_]*)\s*:\s*"(.*)"$')

# Anything that ends up as a C++ enum member must match this.
_CPP_IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")

# Character following a backslash in a YAML value -> character it stands for.
_YAML_ESCAPES = {"\\": "\\", '"': '"', "n": "\n"}

# Characters a C++ "\x" escape keeps consuming.
_HEX_DIGITS = "0123456789abcdefABCDEF"


def _unescape_yaml_value(raw: str, filepath: str = "", line_num: int = 0) -> str:
    r"""
    Process escape sequences in a YAML value string.

    Recognized escapes:  \\ → backslash,  \" → double quote,  \n → newline.

    Args:
        raw: The text between the outer double quotes of a YAML line.
        filepath: Source file name, used only in error messages.
        line_num: 1-based source line number, used only in error messages.

    Returns:
        The value with every escape sequence replaced by its character.

    Raises:
        ValueError: On an unknown escape, or a backslash with nothing after it.
    """
    out: List[str] = []
    i = 0
    n = len(raw)
    while i < n:
        ch = raw[i]
        if ch != "\\":
            out.append(ch)
            i += 1
            continue

        # A backslash at the very end would later be written verbatim into the
        # C++ literal and escape its closing quote, so reject it here.
        if i + 1 >= n:
            raise ValueError(
                f"{filepath}:{line_num}: dangling backslash at end of value"
            )

        nxt = raw[i + 1]
        if nxt not in _YAML_ESCAPES:
            raise ValueError(
                f"{filepath}:{line_num}: unknown escape '\\{nxt}'"
            )
        out.append(_YAML_ESCAPES[nxt])
        i += 2
    return "".join(out)


def parse_yaml_file(filepath: str) -> Dict[str, str]:
    """
    Parse a simple YAML file of the form:
        key: "value"

    Only supports flat key-value pairs with quoted string values; blank lines
    are skipped.  A leading UTF-8 byte-order mark is tolerated.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        Mapping of key to unescaped value, in file order.

    Raises:
        ValueError: On a malformed line, a bad escape, or a duplicate key.
    """
    result: Dict[str, str] = {}
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a BOM, which would
    # otherwise make the first line fail the format check.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line_num, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip("\n\r")

            if not line.strip():
                continue

            match = _YAML_LINE_RE.match(line)
            if not match:
                raise ValueError(
                    f"{filepath}:{line_num}: bad format: {line!r}\n"
                    f'  Expected: KEY: "value"'
                )

            key, raw_value = match.groups()
            value = _unescape_yaml_value(raw_value, filepath, line_num)

            if key in result:
                raise ValueError(f"{filepath}:{line_num}: duplicate key '{key}'")

            result[key] = value

    return result


# ---------------------------------------------------------------------------
# Load all languages from a directory of YAML files
# ---------------------------------------------------------------------------

def load_translations(
    translations_dir: str,
) -> Tuple[List[str], List[str], List[str], Dict[str, List[str]]]:
    """
    Read every YAML file in *translations_dir* and return:
        language_codes   e.g. ["EN", "ES", ...]
        language_names   e.g. ["English", "Español", ...]
        string_keys      ordered list of STR_* keys (from English)
        translations     {key: [translation_per_language]}

    English is always first; the remaining languages follow by ``_order``
    metadata and then by filename.  Keys that are missing or blank in a
    non-English file are filled from English.

    Raises:
        FileNotFoundError: If the directory or its ``*.yaml`` files are missing.
        ValueError: If no English file exists, language metadata is missing,
            a language code is duplicated or is not a C++ identifier, or a
            file fails to parse.
    """
    yaml_dir = Path(translations_dir)
    if not yaml_dir.is_dir():
        raise FileNotFoundError(f"Translations directory not found: {translations_dir}")

    yaml_files = sorted(yaml_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No .yaml files found in {translations_dir}")

    # Parse every file
    parsed: Dict[str, Dict[str, str]] = {
        yf.name: parse_yaml_file(str(yf)) for yf in yaml_files
    }

    # Identify the English file (must exist)
    english_file = next(
        (
            name
            for name, data in parsed.items()
            if data.get("_language_code", "").upper() == "EN"
        ),
        None,
    )
    if english_file is None:
        raise ValueError("No YAML file with _language_code: EN found")

    # Order: English first, then by _order metadata (falls back to filename)
    def sort_key(fname: str) -> Tuple[int, int, str]:
        """English always first (0), then by _order, then by filename."""
        if fname == english_file:
            return (0, 0, fname)
        try:
            order_int = int(parsed[fname].get("_order", "999"))
        except ValueError:
            order_int = 999
        return (1, order_int, fname)

    ordered_files = sorted(parsed, key=sort_key)

    # Extract metadata
    language_codes: List[str] = []
    language_names: List[str] = []
    code_owner: Dict[str, str] = {}
    for fname in ordered_files:
        data = parsed[fname]
        code = data.get("_language_code")
        name = data.get("_language_name")
        if not code or not name:
            raise ValueError(f"{fname}: missing _language_code or _language_name")
        # The code becomes a member of the generated Language enum, so it has
        # to be a unique, valid identifier or the output will not compile.
        if not _CPP_IDENTIFIER_RE.match(code):
            raise ValueError(
                f"{fname}: _language_code '{code}' is not a valid C++ identifier"
            )
        if code in code_owner:
            raise ValueError(
                f"{fname}: _language_code '{code}' already used by {code_owner[code]}"
            )
        code_owner[code] = fname
        language_codes.append(code)
        language_names.append(name)

    # String keys come from English (order matters)
    english_data = parsed[english_file]
    string_keys = [k for k in english_data if not k.startswith("_")]

    # Validate all keys are valid C++ identifiers
    for key in string_keys:
        if not _CPP_IDENTIFIER_RE.match(key):
            raise ValueError(f"Invalid C++ identifier in English file: '{key}'")

    # Build translations dict, filling missing keys from English
    translations: Dict[str, List[str]] = {}
    for key in string_keys:
        row: List[str] = []
        for fname in ordered_files:
            data = parsed[fname]
            value = data.get(key, "")
            if not value.strip() and fname != english_file:
                value = english_data[key]
                lang_code = data.get("_language_code", fname)
                print(f"  INFO: '{key}' missing in {lang_code}, using English fallback")
            row.append(value)
        translations[key] = row

    # Warn about extra keys in non-English files
    for fname in ordered_files:
        if fname == english_file:
            continue
        data = parsed[fname]
        extra = [k for k in data if not k.startswith("_") and k not in english_data]
        if extra:
            lang_code = data.get("_language_code", fname)
            print(f"  WARNING: {lang_code} has keys not in English: {', '.join(extra)}")

    print(f"Loaded {len(language_codes)} languages, {len(string_keys)} string keys")
    return language_codes, language_names, string_keys, translations


# ---------------------------------------------------------------------------
# Language abbreviations
# ---------------------------------------------------------------------------

# Keys MUST be lower-case: get_lang_abbreviation() lower-cases the language
# name before looking it up.
LANG_ABBREVIATIONS = {
    "english": "EN",
    "español": "ES", "espanol": "ES",
    "italiano": "IT",
    "svenska": "SV",
    "français": "FR", "francais": "FR",
    "deutsch": "DE", "german": "DE",
    "polski": "PL",
    "português": "PT", "portugues": "PT", "português (brasil)": "PO",
    "中文": "ZH", "chinese": "ZH",
    "日本語": "JA", "japanese": "JA",
    "한국어": "KO", "korean": "KO",
    "русский": "RU", "russian": "RU",
    "العربية": "AR", "arabic": "AR",
    "עברית": "HE", "hebrew": "HE",
    "فارسی": "FA", "persian": "FA",
    "čeština": "CS",
    "türkçe": "TR", "turkish": "TR",
    "қазақша": "KK", "kazakh": "KK",
}


def get_lang_abbreviation(lang_code: str, lang_name: str) -> str:
    """
    Return a 2-letter abbreviation for a language.

    The lower-cased *lang_name* is looked up in LANG_ABBREVIATIONS; when it is
    not listed, the first two characters of *lang_code* are upper-cased.
    """
    known = LANG_ABBREVIATIONS.get(lang_name.lower())
    if known is not None:
        return known
    return lang_code[:2].upper()


# ---------------------------------------------------------------------------
# C++ string escaping
# ---------------------------------------------------------------------------

def escape_cpp_string(s: str) -> List[str]:
    r"""
    Convert *s* into one or more C++ string literal segments.

    Non-ASCII characters are emitted as \xNN hex sequences.  After each
    hex escape a new segment is started so the compiler doesn't merge
    subsequent hex digits into the escape.

    A backslash followed by one of ``n t r " \`` (or by ``x`` and two more
    characters) is treated as an escape that is already in C++ form and is
    passed through unchanged; any other backslash is doubled.

    Returns a list of string segments (without quotes).  For simple ASCII
    strings this is a single-element list.  A string that ends in a hex
    escape yields a trailing empty segment.
    """
    if not s:
        return [""]

    s = s.replace("\n", "\\n")

    segments: List[str] = []
    current: List[str] = []

    def _flush() -> None:
        segments.append("".join(current))
        current.clear()

    i = 0
    n = len(s)
    while i < n:
        ch = s[i]

        if ch == "\\":
            # Empty when the backslash is the last character of the string.
            nxt = s[i + 1] if i + 1 < n else ""
            if nxt and nxt in "ntr\"\\":
                current.append(ch + nxt)
                i += 2
            elif nxt == "x" and i + 3 < n:
                current.append(s[i : i + 4])
                _flush()  # segment break after hex
                i += 4
            else:
                # Unknown escape or lone trailing backslash: emit a literal
                # backslash so it cannot swallow the next character or the
                # closing quote of the C++ literal.
                current.append("\\\\")
                i += 1
        elif ch == '"':
            current.append('\\"')
            i += 1
        elif ord(ch) < 128:
            current.append(ch)
            i += 1
        else:
            for byte in ch.encode("utf-8"):
                current.append(f"\\x{byte:02X}")
                _flush()  # segment break after hex
            i += 1

    # Flush remaining content
    _flush()

    return segments


def _find_split_point(text: str, limit: int) -> int:
    """
    Return the index at which an over-long escaped segment should be split.

    Prefers the position just after the last space found at index <= *limit*.
    Without a space, returns the largest position <= *limit* that does not
    fall inside an escape sequence (a two-character escape, or ``\\x`` plus
    its hex digits).
    """
    last_space = -1
    last_boundary = 0
    idx = 0
    n = len(text)
    while idx <= limit and idx < n:
        # idx always sits at the start of a token here, so cutting is safe.
        last_boundary = idx
        ch = text[idx]
        if ch == " ":
            last_space = idx

        if ch == "\\":
            end = idx + 2
            if text[idx + 1 : idx + 2] == "x":
                # A C++ hex escape consumes every hex digit that follows.
                while end < n and text[end] in _HEX_DIGITS:
                    end += 1
            idx = end
        else:
            idx += 1

    if last_space != -1:
        # Include the space in the first line
        return last_space + 1
    if last_boundary > 0:
        return last_boundary
    # A single escape longer than the limit: cut anyway so the caller's loop
    # always makes progress.
    return max(limit, 1)


def format_cpp_string_literal(
    segments: List[str], indent: str = "    ", max_content_len: int = 113
) -> List[str]:
    """
    Format string segments (from escape_cpp_string) as indented C++ string
    literal lines, each wrapped in quotes.

    Segments longer than *max_content_len* are wrapped, preferably after a
    space and never in the middle of an escape sequence.

    Args:
        segments: Escaped literal contents, without surrounding quotes.
        indent: Prefix placed before every emitted line.
        max_content_len: Longest content allowed between the quotes.  The
            default is 120 - 4 (indent) - 2 (quotes) - 1 (comma/safety) = 113.

    Returns:
        One quoted, indented line per segment or wrapped piece.
    """
    lines: List[str] = []

    for seg in segments:
        current = seg
        while len(current) > max_content_len:
            cut = _find_split_point(current, max_content_len)
            lines.append(f'{indent}"{current[:cut]}"')
            current = current[cut:]

        # Short segments (including empty ones) are always emitted; after a
        # wrap, only a non-empty remainder is.
        if current or current is seg:
            lines.append(f'{indent}"{current}"')

    return lines


# ---------------------------------------------------------------------------
# Character-set computation
# ---------------------------------------------------------------------------

def compute_character_set(translations: Dict[str, List[str]], lang_index: int) -> str:
    """Return a sorted string of every unique character used in a language."""
    chars = set()
    for values in translations.values():
        chars.update(values[lang_index])
    # Sorting single-character strings orders them by code point.
    return "".join(sorted(chars))


# ---------------------------------------------------------------------------
# Code generators
# ---------------------------------------------------------------------------

def generate_keys_header(
    languages: List[str],
    language_names: List[str],
    string_keys: List[str],
    output_path: str,
) -> None:
    """
    Generate I18nKeys.h.

    Args:
        languages: Language enum names, in enum order.
        language_names: Display names, parallel to *languages*.
        string_keys: StrId enum names, in enum order.
        output_path: File to write.

    Raises:
        ValueError: If no entry of *languages* is "EN" (case-insensitive).
    """
    abbrevs = [
        get_lang_abbreviation(code, name)
        for code, name in zip(languages, language_names)
    ]

    lines: List[str] = [
        "#pragma once",
        "#include <cstdint>",
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
        "// Forward declaration for string arrays",
        "namespace i18n_strings {",
    ]
    lines.extend(f"extern const char* const STRINGS_{abbrev}[];" for abbrev in abbrevs)
    lines.append("}  // namespace i18n_strings")
    lines.append("")

    # Language enum
    lines.append("// Language enum")
    lines.append("enum class Language : uint8_t {")
    lines.extend(f"  {lang} = {i}," for i, lang in enumerate(languages))
    lines.append("  _COUNT")
    lines.append("};")
    lines.append("")

    # Extern declarations
    lines.append("// Language display names (defined in I18nStrings.cpp)")
    lines.append("extern const char* const LANGUAGE_NAMES[];")
    lines.append("")
    lines.append("// Character sets for each language (defined in I18nStrings.cpp)")
    lines.append("extern const char* const CHARACTER_SETS[];")
    lines.append("")

    # StrId enum
    lines.append("// String IDs")
    lines.append("enum class StrId : uint16_t {")
    lines.extend(f"  {key}," for key in string_keys)
    lines.append("  // Sentinel - must be last")
    lines.append("  _COUNT")
    lines.append("};")
    lines.append("")

    # getStringArray helper
    lines.append("// Helper function to get string array for a language")
    lines.append("inline const char* const* getStringArray(Language lang) {")
    lines.append("  switch (lang) {")
    for code, abbrev in zip(languages, abbrevs):
        lines.append(f"    case Language::{code}:")
        lines.append(f"      return i18n_strings::STRINGS_{abbrev};")
    first_abbrev = get_lang_abbreviation(languages[0], language_names[0])
    lines.append("    default:")
    lines.append(f"      return i18n_strings::STRINGS_{first_abbrev};")
    lines.append("  }")
    lines.append("}")
    lines.append("")

    # getLanguageCount helper (single line to match checked-in format)
    lines.append("// Helper function to get language count")
    lines.append(
        "constexpr uint8_t getLanguageCount() "
        "{ return static_cast<uint8_t>(Language::_COUNT); }"
    )
    lines.append("")

    # Sorted language indices for display order
    # (English first, then by language code alphabetically).
    # load_translations() recognises English case-insensitively, so the
    # lookup here has to as well.
    english_idx = next(
        (i for i, code in enumerate(languages) if code.upper() == "EN"), None
    )
    if english_idx is None:
        raise ValueError("No language with code 'EN' in languages")
    rest = sorted(
        (i for i in range(len(languages)) if i != english_idx),
        key=lambda i: languages[i],
    )
    sorted_indices = [english_idx] + rest
    comment_names = ", ".join(language_names[i] for i in sorted_indices)
    lines.append("// Sorted language indices by code (auto-generated by gen_i18n.py)")
    lines.append(f"// Order: {comment_names}")
    lines.append(
        "constexpr uint8_t SORTED_LANGUAGE_INDICES[] = {"
        f"{', '.join(str(i) for i in sorted_indices)}"
        "};"
    )
    lines.append("")
    lines.append(
        "static_assert(sizeof(SORTED_LANGUAGE_INDICES) / sizeof(SORTED_LANGUAGE_INDICES[0]) == getLanguageCount(),"
    )
    lines.append(
        '              "SORTED_LANGUAGE_INDICES size mismatch");'
    )

    _write_file(output_path, lines)


def generate_strings_header(
    languages: List[str],
    language_names: List[str],
    output_path: str,
) -> None:
    """Generate I18nStrings.h: one extern STRINGS_<abbrev> declaration per language."""
    lines: List[str] = [
        "#pragma once",
        '#include <string>',
        "",
        '#include "I18nKeys.h"',
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
        "namespace i18n_strings {",
        "",
    ]

    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f"extern const char* const STRINGS_{abbrev}[];")

    lines.append("")
    lines.append("}  // namespace i18n_strings")
    _write_file(output_path, lines)


def generate_strings_cpp(
    languages: List[str],
    language_names: List[str],
    string_keys: List[str],
    translations: Dict[str, List[str]],
    output_path: str,
) -> None:
    """
    Generate I18nStrings.cpp.

    Emits LANGUAGE_NAMES, CHARACTER_SETS, one STRINGS_<abbrev> array per
    language (entries in *string_keys* order) and a static_assert per array
    that checks its length against StrId::_COUNT.
    """
    abbrevs = [
        get_lang_abbreviation(code, name)
        for code, name in zip(languages, language_names)
    ]

    lines: List[str] = [
        '#include "I18nStrings.h"',
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
    ]

    # LANGUAGE_NAMES array
    lines.append("// Language display names")
    lines.append("const char* const LANGUAGE_NAMES[] = {")
    for name in language_names:
        _append_string_entry(lines, name)
    lines.append("};")
    lines.append("")

    # CHARACTER_SETS array
    lines.append("// Character sets for each language")
    lines.append("const char* const CHARACTER_SETS[] = {")
    for lang_idx, name in enumerate(language_names):
        charset = compute_character_set(translations, lang_idx)
        _append_string_entry(lines, charset, comment=name)
    lines.append("};")
    lines.append("")

    # Per-language string arrays
    lines.append("namespace i18n_strings {")
    lines.append("")

    for lang_idx, abbrev in enumerate(abbrevs):
        lines.append(f"const char* const STRINGS_{abbrev}[] = {{")
        for key in string_keys:
            _append_string_entry(lines, translations[key][lang_idx])
        lines.append("};")
        lines.append("")

    lines.append("}  // namespace i18n_strings")
    lines.append("")

    # Compile-time size checks
    lines.append("// Compile-time validation of array sizes")
    for abbrev in abbrevs:
        lines.append(
            f"static_assert(sizeof(i18n_strings::STRINGS_{abbrev}) "
            f"/ sizeof(i18n_strings::STRINGS_{abbrev}[0]) =="
        )
        lines.append("                  static_cast<size_t>(StrId::_COUNT),")
        lines.append(f'              "STRINGS_{abbrev} size mismatch");')

    _write_file(output_path, lines)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _append_string_entry(
    lines: List[str], text: str, comment: str = ""
) -> None:
    """Escape *text*, format as indented C++ lines, append comma (and optional comment)."""
    formatted = format_cpp_string_literal(escape_cpp_string(text))
    suffix = f",  // {comment}" if comment else ","
    # The comma belongs after the last piece of the (possibly multi-line) literal.
    formatted[-1] += suffix
    lines.extend(formatted)


def _write_file(path: str, lines: List[str]) -> None:
    """Write *lines* to *path* as UTF-8 with LF line endings and a final newline."""
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(lines) + "\n")
    print(f"Generated: {path}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main(translations_dir=None, output_dir=None) -> None:
    """
    Run the generator.

    Args:
        translations_dir: Directory of per-language YAML files.
        output_dir: Existing directory that receives the generated files.

    An argument left as ``None`` is taken from the command line when exactly
    two arguments were given, otherwise from the project defaults.  An
    argument that was passed explicitly is never overridden.

    Exits with status 1 if a directory is missing or generation fails.
    """
    # Default paths (relative to project root)
    default_translations_dir = "lib/I18n/translations"
    default_output_dir = "lib/I18n/"

    if translations_dir is None or output_dir is None:
        if len(sys.argv) == 3:
            fallback_translations, fallback_output = sys.argv[1], sys.argv[2]
        else:
            # Default for no arguments or weird arguments (e.g. SCons)
            fallback_translations = default_translations_dir
            fallback_output = default_output_dir
        if translations_dir is None:
            translations_dir = fallback_translations
        if output_dir is None:
            output_dir = fallback_output

    if not os.path.isdir(translations_dir):
        print(f"Error: Translations directory not found: {translations_dir}")
        sys.exit(1)

    if not os.path.isdir(output_dir):
        print(f"Error: Output directory not found: {output_dir}")
        sys.exit(1)

    print(f"Reading translations from: {translations_dir}")
    print(f"Output directory: {output_dir}")
    print()

    try:
        languages, language_names, string_keys, translations = load_translations(
            translations_dir
        )

        out = Path(output_dir)
        generate_keys_header(languages, language_names, string_keys, str(out / "I18nKeys.h"))
        generate_strings_header(languages, language_names, str(out / "I18nStrings.h"))
        generate_strings_cpp(
            languages, language_names, string_keys, translations, str(out / "I18nStrings.cpp")
        )

        print()
        print("✓ Code generation complete!")
        print(f"  Languages: {len(languages)}")
        print(f"  String keys: {len(string_keys)}")

    except Exception as e:
        # Top-level boundary: report the problem and fail the build.
        print(f"\nError: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
else:
    # Import() is only defined when this file is loaded as a build script.
    # Keep the try body to that single probe so a NameError raised during
    # generation is not silently swallowed.
    try:
        Import("env")  # noqa: F821
    except NameError:
        pass
    else:
        print("Running i18n generation script from PlatformIO...")
        main()