# A fork of https://github.com/crosspoint-reader/crosspoint-reader
1#!/usr/bin/env python3
2"""
3Generate I18n C++ files from per-language YAML translations.
4
5Reads YAML files from a translations directory (one file per language) and generates:
6- I18nKeys.h: Language enum, StrId enum, helper functions
7- I18nStrings.h: String array declarations
8- I18nStrings.cpp: String array definitions with all translations
9
10Each YAML file must contain:
11 _language_name: "Native Name" (e.g. "Español")
12 _language_code: "ENUM_NAME" (e.g. "ES")
13 STR_KEY: "translation text"
14
15The English file is the reference. Missing keys in other languages are
16automatically filled from English, with a warning.
17
18Usage:
19 python gen_i18n.py <translations_dir> <output_dir>
20
21Example:
22 python gen_i18n.py lib/I18n/translations lib/I18n/
23"""
24
25import sys
26import os
27import re
28from pathlib import Path
29from typing import List, Dict, Tuple
30
31
32# ---------------------------------------------------------------------------
33# YAML file reading (simple key: "value" format, no PyYAML dependency)
34# ---------------------------------------------------------------------------
35
36def _unescape_yaml_value(raw: str, filepath: str = "", line_num: int = 0) -> str:
37 """
38 Process escape sequences in a YAML value string.
39
40 Recognized escapes: \\\\ → \\ \\" → " \\n → newline
41 """
42 result: List[str] = []
43 i = 0
44 while i < len(raw):
45 if raw[i] == "\\" and i + 1 < len(raw):
46 nxt = raw[i + 1]
47 if nxt == "\\":
48 result.append("\\")
49 elif nxt == '"':
50 result.append('"')
51 elif nxt == "n":
52 result.append("\n")
53 else:
54 raise ValueError(
55 f"{filepath}:{line_num}: unknown escape '\\{nxt}'"
56 )
57 i += 2
58 else:
59 result.append(raw[i])
60 i += 1
61 return "".join(result)
62
63
def parse_yaml_file(filepath: str) -> Dict[str, str]:
    """
    Read a minimal YAML file consisting solely of lines of the form:
        KEY: "value"

    Blank lines are skipped; any other line (including comments) is a
    fatal format error. Escape sequences in values are expanded via
    _unescape_yaml_value, and duplicate keys are rejected.
    """
    pattern = re.compile(r'^([A-Za-z_][A-Za-z0-9_]*)\s*:\s*"(.*)"$')
    entries: Dict[str, str] = {}

    with open(filepath, "r", encoding="utf-8") as handle:
        for line_num, raw_line in enumerate(handle, start=1):
            stripped = raw_line.rstrip("\n\r")
            if not stripped.strip():
                continue

            m = pattern.match(stripped)
            if m is None:
                raise ValueError(
                    f"{filepath}:{line_num}: bad format: {stripped!r}\n"
                    f' Expected: KEY: "value"'
                )

            key, raw_value = m.group(1), m.group(2)
            # Expand \\, \" and \n before the duplicate check so malformed
            # escapes are reported first, as before.
            value = _unescape_yaml_value(raw_value, filepath, line_num)

            if key in entries:
                raise ValueError(f"{filepath}:{line_num}: duplicate key '{key}'")

            entries[key] = value

    return entries
100
101
102# ---------------------------------------------------------------------------
103# Load all languages from a directory of YAML files
104# ---------------------------------------------------------------------------
105
def load_translations(
    translations_dir: str,
) -> Tuple[List[str], List[str], List[str], Dict[str, List[str]]]:
    """
    Read every YAML file in *translations_dir* and return:
        language_codes   e.g. ["EN", "ES", ...]
        language_names   e.g. ["English", "Español", ...]
        string_keys      ordered list of STR_* keys (from English)
        translations     {key: [translation_per_language]}

    English is always first; the remaining languages are ordered by their
    optional _order metadata value, then by filename.

    Raises FileNotFoundError if the directory or its .yaml files are
    missing, and ValueError for malformed metadata or no English file.
    """
    yaml_dir = Path(translations_dir)
    if not yaml_dir.is_dir():
        raise FileNotFoundError(f"Translations directory not found: {translations_dir}")

    yaml_files = sorted(yaml_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No .yaml files found in {translations_dir}")

    # Parse every file; keyed by bare filename (used for ordering below).
    parsed: Dict[str, Dict[str, str]] = {}
    for yf in yaml_files:
        parsed[yf.name] = parse_yaml_file(str(yf))

    # Identify the English file (must exist) -- it is the reference that
    # defines the key set and supplies fallback text for other languages.
    # Note: the code comparison is case-insensitive here.
    english_file = None
    for name, data in parsed.items():
        if data.get("_language_code", "").upper() == "EN":
            english_file = name
            break

    if english_file is None:
        raise ValueError("No YAML file with _language_code: EN found")

    # Order: English first, then by _order metadata (falls back to filename)
    def sort_key(fname: str) -> Tuple[int, int, str]:
        """English always first (0), then by _order, then by filename."""
        if fname == english_file:
            return (0, 0, fname)
        order = parsed[fname].get("_order", "999")
        try:
            order_int = int(order)
        except ValueError:
            order_int = 999  # non-numeric _order sorts last, with the default
        return (1, order_int, fname)

    ordered_files = sorted(parsed, key=sort_key)

    # Extract metadata; both _language_code and _language_name are mandatory.
    language_codes: List[str] = []
    language_names: List[str] = []
    for fname in ordered_files:
        data = parsed[fname]
        code = data.get("_language_code")
        name = data.get("_language_name")
        if not code or not name:
            raise ValueError(f"{fname}: missing _language_code or _language_name")
        language_codes.append(code)
        language_names.append(name)

    # String keys come from English; insertion order matters because it
    # fixes the StrId enum ordering in the generated C++. Keys starting
    # with "_" are metadata, not translatable strings.
    english_data = parsed[english_file]
    string_keys = [k for k in english_data if not k.startswith("_")]

    # Validate all keys are valid C++ identifiers
    for key in string_keys:
        if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", key):
            raise ValueError(f"Invalid C++ identifier in English file: '{key}'")

    # Build translations dict, filling missing keys from English.
    # A missing OR whitespace-only entry in a non-English file falls back
    # to the English text; the English file itself is never overridden.
    translations: Dict[str, List[str]] = {}
    for key in string_keys:
        row: List[str] = []
        for fname in ordered_files:
            data = parsed[fname]
            value = data.get(key, "")
            if not value.strip() and fname != english_file:
                value = english_data[key]
                lang_code = parsed[fname].get("_language_code", fname)
                print(f" INFO: '{key}' missing in {lang_code}, using English fallback")
            row.append(value)
        translations[key] = row

    # Warn about extra keys in non-English files (they are ignored in the
    # generated output because the key set comes from English alone).
    for fname in ordered_files:
        if fname == english_file:
            continue
        data = parsed[fname]
        extra = [k for k in data if not k.startswith("_") and k not in english_data]
        if extra:
            lang_code = data.get("_language_code", fname)
            print(f" WARNING: {lang_code} has keys not in English: {', '.join(extra)}")

    print(f"Loaded {len(language_codes)} languages, {len(string_keys)} string keys")
    return language_codes, language_names, string_keys, translations
202
203
204# ---------------------------------------------------------------------------
# Language abbreviations and C++ string escaping
206# ---------------------------------------------------------------------------
207
# Native/English language names mapped to the two-letter abbreviations used
# to name the generated C++ arrays (STRINGS_<ABBREV>). Keys MUST be lowercase
# because get_lang_abbreviation() lowercases the name before the lookup.
LANG_ABBREVIATIONS = {
    "english": "EN",
    "español": "ES", "espanol": "ES",
    "italiano": "IT",
    "svenska": "SV",
    "français": "FR", "francais": "FR",
    "deutsch": "DE", "german": "DE",
    "polski": "PL",
    # "PO" keeps Brazilian Portuguese distinct from European "PT".
    "português": "PT", "portugues": "PT", "português (brasil)": "PO",
    "中文": "ZH", "chinese": "ZH",
    "日本語": "JA", "japanese": "JA",
    "한국어": "KO", "korean": "KO",
    "русский": "RU", "russian": "RU",
    "العربية": "AR", "arabic": "AR",
    "עברית": "HE", "hebrew": "HE",
    "فارسی": "FA", "persian": "FA",
    "čeština": "CS",
    "türkçe": "TR", "turkish": "TR",
    # BUG FIX: key was "Қазақша" (capital Қ), which could never match the
    # lowercased lookup in get_lang_abbreviation().
    "қазақша": "KK", "kazakh": "KK",
}


def get_lang_abbreviation(lang_code: str, lang_name: str) -> str:
    """
    Return a 2-letter abbreviation for a language.

    Looks up the native/English name (case-insensitively) in
    LANG_ABBREVIATIONS; unknown names fall back to the first two
    characters of *lang_code*, uppercased.
    """
    lower = lang_name.lower()
    if lower in LANG_ABBREVIATIONS:
        return LANG_ABBREVIATIONS[lower]
    return lang_code[:2].upper()
236
237
def escape_cpp_string(s: str) -> List[str]:
    r"""
    Convert *s* into one or more C++ string literal segments.

    Non-ASCII characters are emitted as \xNN hex sequences (one per UTF-8
    byte). After each hex escape a new segment is started so the compiler
    doesn't merge subsequent hex digits into the escape (adjacent literals
    concatenate, which terminates hex-digit munching).

    Returns a list of string segments (without quotes). For simple ASCII
    strings this is a single-element list.
    """
    if not s:
        return [""]

    # Real newlines become the two-character escape "\n".
    s = s.replace("\n", "\\n")

    segments: List[str] = []
    current: List[str] = []
    i = 0

    def _flush() -> None:
        segments.append("".join(current))
        current.clear()

    while i < len(s):
        ch = s[i]

        if ch == "\\":
            nxt = s[i + 1] if i + 1 < len(s) else ""
            if nxt in ("n", "t", "r", '"', "\\"):
                # Already a valid two-character escape - keep verbatim.
                current.append(ch + nxt)
                i += 2
            elif nxt == "x" and i + 3 < len(s):
                # Pre-existing \xNN escape - copy it and break the segment.
                current.append(s[i : i + 4])
                _flush()
                i += 4
            else:
                # Unrecognized or lone backslash (BUG FIX: a backslash at
                # the very end of the string used to be emitted unescaped,
                # producing an invalid C++ literal). Escape it.
                current.append("\\\\")
                i += 1
        elif ch == '"':
            current.append('\\"')
            i += 1
        elif ord(ch) < 128:
            current.append(ch)
            i += 1
        else:
            # Encode as UTF-8, one \xNN escape per byte.
            for byte in ch.encode("utf-8"):
                current.append(f"\\x{byte:02X}")
            _flush()  # segment break after hex
            i += 1

    # Flush any trailing non-hex content. BUG FIX: previously a string
    # ending on a hex escape produced a useless trailing "" segment.
    if current:
        _flush()

    return segments
296
297
def format_cpp_string_literal(segments: List[str], indent: str = " ") -> List[str]:
    r"""
    Format string segments (from escape_cpp_string) as indented C++ string
    literal lines, each wrapped in quotes.

    Long segments are wrapped to respect the ~120 column limit, preferring
    to break after a space. BUG FIX: forced breaks (no space available) now
    happen only on escape-sequence boundaries, so a two-character escape
    like \n, \\ or the tail of \xNN is never split across two literals
    (splitting left a dangling backslash that corrupted the C++ string).
    """
    # Effective limit for content: 120 - 4 (indent) - 2 (quotes) - 1 (comma/safety) = 113
    # Using 113 to match clang-format exactly (120 - 4 - 2 - 1)
    MAX_CONTENT_LEN = 113

    lines: List[str] = []

    for seg in segments:
        # Short segment (e.g. hex escape or short text) fits on one line.
        if len(seg) <= MAX_CONTENT_LEN:
            lines.append(f'{indent}"{seg}"')
            continue

        # Long segment - wrap it
        current = seg
        while len(current) > MAX_CONTENT_LEN:
            # Scan forward token by token (a token is one character or a
            # backslash escape pair), remembering the last space and the
            # last token boundary that still fits within the limit.
            last_space = -1
            last_boundary = 0
            idx = 0
            while idx <= MAX_CONTENT_LEN and idx < len(current):
                if idx > 0:
                    last_boundary = idx
                if current[idx] == ' ':
                    last_space = idx

                # Step over escape pairs so cut candidates never land in
                # the middle of one.
                idx = idx + 2 if current[idx] == '\\' else idx + 1

            if last_space != -1:
                # Prefer breaking after a space (the space stays on this line).
                split_point = last_space + 1
            elif last_boundary > 0:
                # No space: forced break at the last safe token boundary.
                split_point = last_boundary
            else:
                # Degenerate fallback; unreachable with 1-2 char tokens,
                # kept to guarantee forward progress.
                split_point = MAX_CONTENT_LEN

            lines.append(f'{indent}"{current[:split_point]}"')
            current = current[split_point:]

        if current:
            lines.append(f'{indent}"{current}"')

    return lines
353
354
355# ---------------------------------------------------------------------------
356# Character-set computation
357# ---------------------------------------------------------------------------
358
def compute_character_set(translations: Dict[str, List[str]], lang_index: int) -> str:
    """Return every unique character used by one language, sorted by code point."""
    used_codepoints = {
        ord(ch)
        for per_language in translations.values()
        for ch in per_language[lang_index]
    }
    return "".join(map(chr, sorted(used_codepoints)))
366
367
368# ---------------------------------------------------------------------------
369# Code generators
370# ---------------------------------------------------------------------------
371
def generate_keys_header(
    languages: List[str],
    language_names: List[str],
    string_keys: List[str],
    output_path: str,
) -> None:
    """
    Generate I18nKeys.h: the Language enum, the StrId enum, extern
    declarations, inline helpers (getStringArray, getLanguageCount) and the
    display-order table SORTED_LANGUAGE_INDICES.

    Args:
        languages: Language enum codes, English first (from load_translations).
        language_names: Native display names, parallel to *languages*.
        string_keys: StrId enum entries, in English-file order.
        output_path: Destination path for I18nKeys.h.
    """
    lines: List[str] = [
        "#pragma once",
        "#include <cstdint>",
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
        "// Forward declaration for string arrays",
        "namespace i18n_strings {",
    ]

    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f"extern const char* const STRINGS_{abbrev}[];")

    lines.append("} // namespace i18n_strings")
    lines.append("")

    # Language enum
    lines.append("// Language enum")
    lines.append("enum class Language : uint8_t {")
    for i, lang in enumerate(languages):
        lines.append(f" {lang} = {i},")
    lines.append(" _COUNT")
    lines.append("};")
    lines.append("")

    # Extern declarations
    lines.append("// Language display names (defined in I18nStrings.cpp)")
    lines.append("extern const char* const LANGUAGE_NAMES[];")
    lines.append("")
    lines.append("// Character sets for each language (defined in I18nStrings.cpp)")
    lines.append("extern const char* const CHARACTER_SETS[];")
    lines.append("")

    # StrId enum
    lines.append("// String IDs")
    lines.append("enum class StrId : uint16_t {")
    for key in string_keys:
        lines.append(f" {key},")
    lines.append(" // Sentinel - must be last")
    lines.append(" _COUNT")
    lines.append("};")
    lines.append("")

    # getStringArray helper
    lines.append("// Helper function to get string array for a language")
    lines.append("inline const char* const* getStringArray(Language lang) {")
    lines.append(" switch (lang) {")
    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f" case Language::{code}:")
        lines.append(f" return i18n_strings::STRINGS_{abbrev};")
    first_abbrev = get_lang_abbreviation(languages[0], language_names[0])
    lines.append(" default:")
    lines.append(f" return i18n_strings::STRINGS_{first_abbrev};")
    lines.append(" }")
    lines.append("}")
    lines.append("")

    # getLanguageCount helper (single line to match checked-in format)
    lines.append("// Helper function to get language count")
    lines.append(
        "constexpr uint8_t getLanguageCount() "
        "{ return static_cast<uint8_t>(Language::_COUNT); }"
    )
    lines.append("")

    # Sorted language indices for display order
    # (English first, then by language code alphabetically)
    # ROBUSTNESS FIX: match the English code case-insensitively, consistent
    # with load_translations, and fall back to index 0 (English is always
    # loaded first) instead of raising ValueError on a non-uppercase code.
    english_idx = next(
        (i for i, code in enumerate(languages) if code.upper() == "EN"), 0
    )
    rest = sorted(
        (i for i in range(len(languages)) if i != english_idx),
        key=lambda i: languages[i],
    )
    sorted_indices = [english_idx] + rest
    comment_names = ", ".join(language_names[i] for i in sorted_indices)
    lines.append("// Sorted language indices by code (auto-generated by gen_i18n.py)")
    lines.append(f"// Order: {comment_names}")
    lines.append(
        "constexpr uint8_t SORTED_LANGUAGE_INDICES[] = {"
        f"{', '.join(str(i) for i in sorted_indices)}"
        "};"
    )
    lines.append("")
    lines.append(
        "static_assert(sizeof(SORTED_LANGUAGE_INDICES) / sizeof(SORTED_LANGUAGE_INDICES[0]) == getLanguageCount(),"
    )
    lines.append(
        ' "SORTED_LANGUAGE_INDICES size mismatch");'
    )

    _write_file(output_path, lines)
471
472
def generate_strings_header(
    languages: List[str],
    language_names: List[str],
    output_path: str,
) -> None:
    """Generate I18nStrings.h: extern declarations for each language's string array."""
    header: List[str] = [
        "#pragma once",
        '#include <string>',
        "",
        '#include "I18nKeys.h"',
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
        "namespace i18n_strings {",
        "",
    ]

    # One extern array per language, named by its two-letter abbreviation.
    header.extend(
        f"extern const char* const STRINGS_{get_lang_abbreviation(code, name)}[];"
        for code, name in zip(languages, language_names)
    )

    header.append("")
    header.append("} // namespace i18n_strings")
    _write_file(output_path, header)
498
499
def generate_strings_cpp(
    languages: List[str],
    language_names: List[str],
    string_keys: List[str],
    translations: Dict[str, List[str]],
    output_path: str,
) -> None:
    """
    Generate I18nStrings.cpp: definitions of LANGUAGE_NAMES, CHARACTER_SETS
    and one STRINGS_<ABBREV> array per language, plus static_asserts that
    every array has exactly StrId::_COUNT entries.

    Args:
        languages: Language enum codes, English first.
        language_names: Native display names, parallel to *languages*.
        string_keys: StrId keys in enum order (defines array element order).
        translations: {key: [text per language]}, indices matching *languages*.
        output_path: Destination path for I18nStrings.cpp.
    """
    lines: List[str] = [
        '#include "I18nStrings.h"',
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
    ]

    # LANGUAGE_NAMES array -- indexed by the Language enum.
    lines.append("// Language display names")
    lines.append("const char* const LANGUAGE_NAMES[] = {")
    for name in language_names:
        _append_string_entry(lines, name)
    lines.append("};")
    lines.append("")

    # CHARACTER_SETS array -- every unique character a language's strings
    # use, one entry per language, annotated with the language name.
    lines.append("// Character sets for each language")
    lines.append("const char* const CHARACTER_SETS[] = {")
    for lang_idx, name in enumerate(language_names):
        charset = compute_character_set(translations, lang_idx)
        _append_string_entry(lines, charset, comment=name)
    lines.append("};")
    lines.append("")

    # Per-language string arrays, element order matching the StrId enum.
    lines.append("namespace i18n_strings {")
    lines.append("")

    for lang_idx, (code, name) in enumerate(zip(languages, language_names)):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f"const char* const STRINGS_{abbrev}[] = {{")

        for key in string_keys:
            text = translations[key][lang_idx]
            _append_string_entry(lines, text)

        lines.append("};")
        lines.append("")

    lines.append("} // namespace i18n_strings")
    lines.append("")

    # Compile-time size checks
    lines.append("// Compile-time validation of array sizes")
    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(
            f"static_assert(sizeof(i18n_strings::STRINGS_{abbrev}) "
            f"/ sizeof(i18n_strings::STRINGS_{abbrev}[0]) =="
        )
        lines.append(" static_cast<size_t>(StrId::_COUNT),")
        lines.append(f' "STRINGS_{abbrev} size mismatch");')

    _write_file(output_path, lines)
562
563
564# ---------------------------------------------------------------------------
565# Helpers
566# ---------------------------------------------------------------------------
567
def _append_string_entry(
    lines: List[str], text: str, comment: str = ""
) -> None:
    """Escape *text*, format as indented C++ lines, append comma (and optional comment)."""
    rendered = format_cpp_string_literal(escape_cpp_string(text))
    # The comma (plus any trailing comment) goes on the last line only.
    if comment:
        rendered[-1] = f"{rendered[-1]}, // {comment}"
    else:
        rendered[-1] = rendered[-1] + ","
    lines.extend(rendered)
577
578
579def _write_file(path: str, lines: List[str]) -> None:
580 with open(path, "w", encoding="utf-8", newline="\n") as f:
581 f.write("\n".join(lines))
582 f.write("\n")
583 print(f"Generated: {path}")
584
585
586# ---------------------------------------------------------------------------
587# Main
588# ---------------------------------------------------------------------------
589
def main(translations_dir=None, output_dir=None) -> None:
    """
    Drive the full generation: load translations, then emit I18nKeys.h,
    I18nStrings.h and I18nStrings.cpp into *output_dir*.

    Paths may be passed by the caller (e.g. PlatformIO), taken from
    sys.argv, or fall back to the default project-relative locations.
    """
    # Resolve input/output locations when not supplied by the caller.
    if translations_dir is None or output_dir is None:
        if len(sys.argv) == 3:
            translations_dir, output_dir = sys.argv[1], sys.argv[2]
        else:
            # Default for no arguments or weird arguments (e.g. SCons)
            translations_dir = "lib/I18n/translations"
            output_dir = "lib/I18n/"

    if not os.path.isdir(translations_dir):
        print(f"Error: Translations directory not found: {translations_dir}")
        sys.exit(1)

    if not os.path.isdir(output_dir):
        print(f"Error: Output directory not found: {output_dir}")
        sys.exit(1)

    print(f"Reading translations from: {translations_dir}")
    print(f"Output directory: {output_dir}")
    print()

    try:
        languages, language_names, string_keys, translations = load_translations(
            translations_dir
        )

        out_dir = Path(output_dir)
        generate_keys_header(
            languages, language_names, string_keys, str(out_dir / "I18nKeys.h")
        )
        generate_strings_header(
            languages, language_names, str(out_dir / "I18nStrings.h")
        )
        generate_strings_cpp(
            languages,
            language_names,
            string_keys,
            translations,
            str(out_dir / "I18nStrings.cpp"),
        )

        print()
        print("✓ Code generation complete!")
        print(f" Languages: {len(languages)}")
        print(f" String keys: {len(string_keys)}")

    except Exception as e:
        # Deliberately broad: any failure aborts generation with a message.
        print(f"\nError: {e}")
        sys.exit(1)
637
638
if __name__ == "__main__":
    # Normal CLI invocation: python gen_i18n.py <translations_dir> <output_dir>
    main()
else:
    # When imported by PlatformIO's SCons build (via extra_scripts), SCons
    # injects a global Import() function into the script's namespace.
    # Outside SCons that name does not exist, so a NameError simply means
    # "not a PlatformIO build" and the import becomes a no-op.
    try:
        Import("env")
        print("Running i18n generation script from PlatformIO...")
        main()
    except NameError:
        pass