# A fork of https://github.com/crosspoint-reader/crosspoint-reader
1#!/usr/bin/env python3
2"""
3Generate I18n C++ files from per-language YAML translations.
4
5Reads YAML files from a translations directory (one file per language) and generates:
6- I18nKeys.h: Language enum, StrId enum, helper functions
7- I18nStrings.h: String array declarations
8- I18nStrings.cpp: String array definitions with all translations
9
10Each YAML file must contain:
11 _language_name: "Native Name" (e.g. "Español")
12 _language_code: "ENUM_NAME" (e.g. "ES")
13 STR_KEY: "translation text"
14
15The English file is the reference. Missing keys in other languages are
16automatically filled from English, with a warning.
17
18Usage:
19 python gen_i18n.py <translations_dir> <output_dir>
20
21Example:
22 python gen_i18n.py lib/I18n/translations lib/I18n/
23"""
24
25import sys
26import os
27import re
28from pathlib import Path
29from typing import List, Dict, Tuple
30
31
32# ---------------------------------------------------------------------------
33# YAML file reading (simple key: "value" format, no PyYAML dependency)
34# ---------------------------------------------------------------------------
35
36def _unescape_yaml_value(raw: str, filepath: str = "", line_num: int = 0) -> str:
37 """
38 Process escape sequences in a YAML value string.
39
40 Recognized escapes: \\\\ → \\ \\" → " \\n → newline
41 """
42 result: List[str] = []
43 i = 0
44 while i < len(raw):
45 if raw[i] == "\\" and i + 1 < len(raw):
46 nxt = raw[i + 1]
47 if nxt == "\\":
48 result.append("\\")
49 elif nxt == '"':
50 result.append('"')
51 elif nxt == "n":
52 result.append("\n")
53 else:
54 raise ValueError(
55 f"{filepath}:{line_num}: unknown escape '\\{nxt}'"
56 )
57 i += 2
58 else:
59 result.append(raw[i])
60 i += 1
61 return "".join(result)
62
63
def parse_yaml_file(filepath: str) -> Dict[str, str]:
    """
    Read a minimal YAML file consisting solely of lines of the form:
        KEY: "value"

    Blank lines are skipped; any other line (including comments) is a
    fatal format error. Escape sequences in values are expanded via
    _unescape_yaml_value, and duplicate keys are rejected.
    """
    pattern = re.compile(r'^([A-Za-z_][A-Za-z0-9_]*)\s*:\s*"(.*)"$')
    entries: Dict[str, str] = {}

    with open(filepath, "r", encoding="utf-8") as handle:
        for line_num, raw_line in enumerate(handle, start=1):
            stripped = raw_line.rstrip("\n\r")
            if not stripped.strip():
                continue

            m = pattern.match(stripped)
            if m is None:
                raise ValueError(
                    f"{filepath}:{line_num}: bad format: {stripped!r}\n"
                    f' Expected: KEY: "value"'
                )

            key, raw_value = m.group(1), m.group(2)
            # Expand \\, \" and \n before the duplicate check so malformed
            # escapes are reported first, as before.
            value = _unescape_yaml_value(raw_value, filepath, line_num)

            if key in entries:
                raise ValueError(f"{filepath}:{line_num}: duplicate key '{key}'")

            entries[key] = value

    return entries
100
101
102# ---------------------------------------------------------------------------
103# Load all languages from a directory of YAML files
104# ---------------------------------------------------------------------------
105
def load_translations(
    translations_dir: str,
) -> Tuple[List[str], List[str], List[str], Dict[str, List[str]]]:
    """
    Read every YAML file in *translations_dir* and return:
        language_codes   e.g. ["EN", "ES", ...]
        language_names   e.g. ["English", "Español", ...]
        string_keys      ordered list of STR_* keys (from English)
        translations     {key: [translation_per_language]}

    English is always first; the remaining languages are ordered by their
    optional _order metadata value, then by filename.

    Raises FileNotFoundError if the directory or its .yaml files are
    missing, and ValueError for malformed metadata or no English file.
    """
    yaml_dir = Path(translations_dir)
    if not yaml_dir.is_dir():
        raise FileNotFoundError(f"Translations directory not found: {translations_dir}")

    yaml_files = sorted(yaml_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No .yaml files found in {translations_dir}")

    # Parse every file; keyed by bare filename (used for ordering below).
    parsed: Dict[str, Dict[str, str]] = {}
    for yf in yaml_files:
        parsed[yf.name] = parse_yaml_file(str(yf))

    # Identify the English file (must exist) -- it is the reference that
    # defines the key set and supplies fallback text for other languages.
    # Note: the code comparison is case-insensitive here.
    english_file = None
    for name, data in parsed.items():
        if data.get("_language_code", "").upper() == "EN":
            english_file = name
            break

    if english_file is None:
        raise ValueError("No YAML file with _language_code: EN found")

    # Order: English first, then by _order metadata (falls back to filename)
    def sort_key(fname: str) -> Tuple[int, int, str]:
        """English always first (0), then by _order, then by filename."""
        if fname == english_file:
            return (0, 0, fname)
        order = parsed[fname].get("_order", "999")
        try:
            order_int = int(order)
        except ValueError:
            order_int = 999  # non-numeric _order sorts last, with the default
        return (1, order_int, fname)

    ordered_files = sorted(parsed, key=sort_key)

    # Extract metadata; both _language_code and _language_name are mandatory.
    language_codes: List[str] = []
    language_names: List[str] = []
    for fname in ordered_files:
        data = parsed[fname]
        code = data.get("_language_code")
        name = data.get("_language_name")
        if not code or not name:
            raise ValueError(f"{fname}: missing _language_code or _language_name")
        language_codes.append(code)
        language_names.append(name)

    # String keys come from English; insertion order matters because it
    # fixes the StrId enum ordering in the generated C++. Keys starting
    # with "_" are metadata, not translatable strings.
    english_data = parsed[english_file]
    string_keys = [k for k in english_data if not k.startswith("_")]

    # Validate all keys are valid C++ identifiers
    for key in string_keys:
        if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", key):
            raise ValueError(f"Invalid C++ identifier in English file: '{key}'")

    # Build translations dict, filling missing keys from English.
    # A missing OR whitespace-only entry in a non-English file falls back
    # to the English text; the English file itself is never overridden.
    translations: Dict[str, List[str]] = {}
    for key in string_keys:
        row: List[str] = []
        for fname in ordered_files:
            data = parsed[fname]
            value = data.get(key, "")
            if not value.strip() and fname != english_file:
                value = english_data[key]
                lang_code = parsed[fname].get("_language_code", fname)
                print(f" INFO: '{key}' missing in {lang_code}, using English fallback")
            row.append(value)
        translations[key] = row

    # Warn about extra keys in non-English files (they are ignored in the
    # generated output because the key set comes from English alone).
    for fname in ordered_files:
        if fname == english_file:
            continue
        data = parsed[fname]
        extra = [k for k in data if not k.startswith("_") and k not in english_data]
        if extra:
            lang_code = data.get("_language_code", fname)
            print(f" WARNING: {lang_code} has keys not in English: {', '.join(extra)}")

    print(f"Loaded {len(language_codes)} languages, {len(string_keys)} string keys")
    return language_codes, language_names, string_keys, translations
202
203
204# ---------------------------------------------------------------------------
# Language abbreviations and C++ string escaping
206# ---------------------------------------------------------------------------
207
# Native/English language names mapped to the two-letter abbreviations used
# to name the generated C++ arrays (STRINGS_<ABBREV>). Keys MUST be lowercase
# because get_lang_abbreviation() lowercases the name before the lookup.
LANG_ABBREVIATIONS = {
    "english": "EN",
    "español": "ES", "espanol": "ES",
    "italiano": "IT",
    "svenska": "SV",
    "français": "FR", "francais": "FR",
    "deutsch": "DE", "german": "DE",
    "polski": "PL",
    # "PO" keeps Brazilian Portuguese distinct from European "PT".
    "português": "PT", "portugues": "PT", "português (brasil)": "PO",
    "中文": "ZH", "chinese": "ZH",
    "日本語": "JA", "japanese": "JA",
    "한국어": "KO", "korean": "KO",
    "русский": "RU", "russian": "RU",
    "العربية": "AR", "arabic": "AR",
    "עברית": "HE", "hebrew": "HE",
    "فارسی": "FA", "persian": "FA",
    "čeština": "CS",
    "türkçe": "TR", "turkish": "TR",
    # BUG FIX: key was "Қазақша" (capital Қ), which could never match the
    # lowercased lookup in get_lang_abbreviation().
    "қазақша": "KK", "kazakh": "KK",
}


def get_lang_abbreviation(lang_code: str, lang_name: str) -> str:
    """
    Return a 2-letter abbreviation for a language.

    Looks up the native/English name (case-insensitively) in
    LANG_ABBREVIATIONS; unknown names fall back to the first two
    characters of *lang_code*, uppercased.
    """
    lower = lang_name.lower()
    if lower in LANG_ABBREVIATIONS:
        return LANG_ABBREVIATIONS[lower]
    return lang_code[:2].upper()
236
237
def escape_cpp_string(s: str) -> List[str]:
    r"""
    Convert *s* into one or more C++ string literal segments.

    Non-ASCII characters are emitted as \xNN hex sequences (one per UTF-8
    byte). After each hex escape a new segment is started so the compiler
    doesn't merge subsequent hex digits into the escape (adjacent literals
    concatenate, which terminates hex-digit munching).

    Returns a list of string segments (without quotes). For simple ASCII
    strings this is a single-element list.
    """
    if not s:
        return [""]

    # Real newlines become the two-character escape "\n".
    s = s.replace("\n", "\\n")

    segments: List[str] = []
    current: List[str] = []
    i = 0

    def _flush() -> None:
        segments.append("".join(current))
        current.clear()

    while i < len(s):
        ch = s[i]

        if ch == "\\":
            nxt = s[i + 1] if i + 1 < len(s) else ""
            if nxt in ("n", "t", "r", '"', "\\"):
                # Already a valid two-character escape - keep verbatim.
                current.append(ch + nxt)
                i += 2
            elif nxt == "x" and i + 3 < len(s):
                # Pre-existing \xNN escape - copy it and break the segment.
                current.append(s[i : i + 4])
                _flush()
                i += 4
            else:
                # Unrecognized or lone backslash (BUG FIX: a backslash at
                # the very end of the string used to be emitted unescaped,
                # producing an invalid C++ literal). Escape it.
                current.append("\\\\")
                i += 1
        elif ch == '"':
            current.append('\\"')
            i += 1
        elif ord(ch) < 128:
            current.append(ch)
            i += 1
        else:
            # Encode as UTF-8, one \xNN escape per byte.
            for byte in ch.encode("utf-8"):
                current.append(f"\\x{byte:02X}")
            _flush()  # segment break after hex
            i += 1

    # Flush any trailing non-hex content. BUG FIX: previously a string
    # ending on a hex escape produced a useless trailing "" segment.
    if current:
        _flush()

    return segments
296
297
def format_cpp_string_literal(segments: List[str], indent: str = " ") -> List[str]:
    r"""
    Format string segments (from escape_cpp_string) as indented C++ string
    literal lines, each wrapped in quotes.

    Long segments are wrapped to respect the ~120 column limit, preferring
    to break after a space. BUG FIX: forced breaks (no space available) now
    happen only on escape-sequence boundaries, so a two-character escape
    like \n, \\ or the tail of \xNN is never split across two literals
    (splitting left a dangling backslash that corrupted the C++ string).
    """
    # Effective limit for content: 120 - 4 (indent) - 2 (quotes) - 1 (comma/safety) = 113
    # Using 113 to match clang-format exactly (120 - 4 - 2 - 1)
    MAX_CONTENT_LEN = 113

    lines: List[str] = []

    for seg in segments:
        # Short segment (e.g. hex escape or short text) fits on one line.
        if len(seg) <= MAX_CONTENT_LEN:
            lines.append(f'{indent}"{seg}"')
            continue

        # Long segment - wrap it
        current = seg
        while len(current) > MAX_CONTENT_LEN:
            # Scan forward token by token (a token is one character or a
            # backslash escape pair), remembering the last space and the
            # last token boundary that still fits within the limit.
            last_space = -1
            last_boundary = 0
            idx = 0
            while idx <= MAX_CONTENT_LEN and idx < len(current):
                if idx > 0:
                    last_boundary = idx
                if current[idx] == ' ':
                    last_space = idx

                # Step over escape pairs so cut candidates never land in
                # the middle of one.
                idx = idx + 2 if current[idx] == '\\' else idx + 1

            if last_space != -1:
                # Prefer breaking after a space (the space stays on this line).
                split_point = last_space + 1
            elif last_boundary > 0:
                # No space: forced break at the last safe token boundary.
                split_point = last_boundary
            else:
                # Degenerate fallback; unreachable with 1-2 char tokens,
                # kept to guarantee forward progress.
                split_point = MAX_CONTENT_LEN

            lines.append(f'{indent}"{current[:split_point]}"')
            current = current[split_point:]

        if current:
            lines.append(f'{indent}"{current}"')

    return lines
353
354
355# ---------------------------------------------------------------------------
356# Character-set computation
357# ---------------------------------------------------------------------------
358
def compute_character_set(translations: Dict[str, List[str]], lang_index: int) -> str:
    """Return every unique character used by one language, sorted by code point."""
    used_codepoints = {
        ord(ch)
        for per_language in translations.values()
        for ch in per_language[lang_index]
    }
    return "".join(map(chr, sorted(used_codepoints)))
366
367
368# ---------------------------------------------------------------------------
369# Code generators
370# ---------------------------------------------------------------------------
371
def generate_keys_header(
    languages: List[str],
    language_names: List[str],
    string_keys: List[str],
    output_path: str,
) -> None:
    """
    Generate I18nKeys.h: the Language enum, the StrId enum, extern
    declarations, inline helpers (getStringArray, getLanguageCount) and the
    display-order table SORTED_LANGUAGE_INDICES.

    Args:
        languages: Language enum codes, English first (from load_translations).
        language_names: Native display names, parallel to *languages*.
        string_keys: StrId enum entries, in English-file order.
        output_path: Destination path for I18nKeys.h.
    """
    lines: List[str] = [
        "#pragma once",
        "#include <cstdint>",
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
        "// Forward declaration for string arrays",
        "namespace i18n_strings {",
    ]

    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f"extern const char* const STRINGS_{abbrev}[];")

    lines.append("} // namespace i18n_strings")
    lines.append("")

    # Language enum
    lines.append("// Language enum")
    lines.append("enum class Language : uint8_t {")
    for i, lang in enumerate(languages):
        lines.append(f" {lang} = {i},")
    lines.append(" _COUNT")
    lines.append("};")
    lines.append("")

    # Extern declarations
    lines.append("// Language display names (defined in I18nStrings.cpp)")
    lines.append("extern const char* const LANGUAGE_NAMES[];")
    lines.append("")
    lines.append("// Character sets for each language (defined in I18nStrings.cpp)")
    lines.append("extern const char* const CHARACTER_SETS[];")
    lines.append("")

    # StrId enum
    lines.append("// String IDs")
    lines.append("enum class StrId : uint16_t {")
    for key in string_keys:
        lines.append(f" {key},")
    lines.append(" // Sentinel - must be last")
    lines.append(" _COUNT")
    lines.append("};")
    lines.append("")

    # getStringArray helper
    lines.append("// Helper function to get string array for a language")
    lines.append("inline const char* const* getStringArray(Language lang) {")
    lines.append(" switch (lang) {")
    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f" case Language::{code}:")
        lines.append(f" return i18n_strings::STRINGS_{abbrev};")
    first_abbrev = get_lang_abbreviation(languages[0], language_names[0])
    lines.append(" default:")
    lines.append(f" return i18n_strings::STRINGS_{first_abbrev};")
    lines.append(" }")
    lines.append("}")
    lines.append("")

    # getLanguageCount helper (single line to match checked-in format)
    lines.append("// Helper function to get language count")
    lines.append(
        "constexpr uint8_t getLanguageCount() "
        "{ return static_cast<uint8_t>(Language::_COUNT); }"
    )
    lines.append("")

    # Sorted language indices for display order
    # (English first, then by language code alphabetically)
    # ROBUSTNESS FIX: match the English code case-insensitively, consistent
    # with load_translations, and fall back to index 0 (English is always
    # loaded first) instead of raising ValueError on a non-uppercase code.
    english_idx = next(
        (i for i, code in enumerate(languages) if code.upper() == "EN"), 0
    )
    rest = sorted(
        (i for i in range(len(languages)) if i != english_idx),
        key=lambda i: languages[i],
    )
    sorted_indices = [english_idx] + rest
    comment_names = ", ".join(language_names[i] for i in sorted_indices)
    lines.append("// Sorted language indices by code (auto-generated by gen_i18n.py)")
    lines.append(f"// Order: {comment_names}")
    lines.append(
        "constexpr uint8_t SORTED_LANGUAGE_INDICES[] = {"
        f"{', '.join(str(i) for i in sorted_indices)}"
        "};"
    )
    lines.append("")
    lines.append(
        "static_assert(sizeof(SORTED_LANGUAGE_INDICES) / sizeof(SORTED_LANGUAGE_INDICES[0]) == getLanguageCount(),"
    )
    lines.append(
        ' "SORTED_LANGUAGE_INDICES size mismatch");'
    )

    _write_file(output_path, lines)
471
472
def generate_strings_header(
    languages: List[str],
    language_names: List[str],
    output_path: str,
) -> None:
    """Generate I18nStrings.h: extern declarations for each language's string array."""
    header: List[str] = [
        "#pragma once",
        '#include <string>',
        "",
        '#include "I18nKeys.h"',
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
        "namespace i18n_strings {",
        "",
    ]

    # One extern array per language, named by its two-letter abbreviation.
    header.extend(
        f"extern const char* const STRINGS_{get_lang_abbreviation(code, name)}[];"
        for code, name in zip(languages, language_names)
    )

    header.append("")
    header.append("} // namespace i18n_strings")
    _write_file(output_path, header)
498
499
def generate_strings_cpp(
    languages: List[str],
    language_names: List[str],
    string_keys: List[str],
    translations: Dict[str, List[str]],
    output_path: str,
) -> None:
    """
    Generate I18nStrings.cpp: definitions of LANGUAGE_NAMES, CHARACTER_SETS
    and one STRINGS_<ABBREV> array per language, plus static_asserts that
    every array has exactly StrId::_COUNT entries.

    Args:
        languages: Language enum codes, English first.
        language_names: Native display names, parallel to *languages*.
        string_keys: StrId keys in enum order (defines array element order).
        translations: {key: [text per language]}, indices matching *languages*.
        output_path: Destination path for I18nStrings.cpp.
    """
    lines: List[str] = [
        '#include "I18nStrings.h"',
        "",
        "// THIS FILE IS AUTO-GENERATED BY gen_i18n.py. DO NOT EDIT.",
        "",
    ]

    # LANGUAGE_NAMES array -- indexed by the Language enum.
    lines.append("// Language display names")
    lines.append("const char* const LANGUAGE_NAMES[] = {")
    for name in language_names:
        _append_string_entry(lines, name)
    lines.append("};")
    lines.append("")

    # CHARACTER_SETS array -- every unique character a language's strings
    # use, one entry per language, annotated with the language name.
    lines.append("// Character sets for each language")
    lines.append("const char* const CHARACTER_SETS[] = {")
    for lang_idx, name in enumerate(language_names):
        charset = compute_character_set(translations, lang_idx)
        _append_string_entry(lines, charset, comment=name)
    lines.append("};")
    lines.append("")

    # Per-language string arrays, element order matching the StrId enum.
    lines.append("namespace i18n_strings {")
    lines.append("")

    for lang_idx, (code, name) in enumerate(zip(languages, language_names)):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(f"const char* const STRINGS_{abbrev}[] = {{")

        for key in string_keys:
            text = translations[key][lang_idx]
            _append_string_entry(lines, text)

        lines.append("};")
        lines.append("")

    lines.append("} // namespace i18n_strings")
    lines.append("")

    # Compile-time size checks
    lines.append("// Compile-time validation of array sizes")
    for code, name in zip(languages, language_names):
        abbrev = get_lang_abbreviation(code, name)
        lines.append(
            f"static_assert(sizeof(i18n_strings::STRINGS_{abbrev}) "
            f"/ sizeof(i18n_strings::STRINGS_{abbrev}[0]) =="
        )
        lines.append(" static_cast<size_t>(StrId::_COUNT),")
        lines.append(f' "STRINGS_{abbrev} size mismatch");')

    _write_file(output_path, lines)
562
563
564# ---------------------------------------------------------------------------
565# Helpers
566# ---------------------------------------------------------------------------
567
def _append_string_entry(
    lines: List[str], text: str, comment: str = ""
) -> None:
    """Escape *text*, format as indented C++ lines, append comma (and optional comment)."""
    rendered = format_cpp_string_literal(escape_cpp_string(text))
    # The comma (plus any trailing comment) goes on the last line only.
    if comment:
        rendered[-1] = f"{rendered[-1]}, // {comment}"
    else:
        rendered[-1] = rendered[-1] + ","
    lines.extend(rendered)
577
578
579def _write_file(path: str, lines: List[str]) -> None:
580 with open(path, "w", encoding="utf-8", newline="\n") as f:
581 f.write("\n".join(lines))
582 f.write("\n")
583 print(f"Generated: {path}")
584
585
586# ---------------------------------------------------------------------------
587# Main
588# ---------------------------------------------------------------------------
589
def main(translations_dir=None, output_dir=None) -> None:
    """
    Drive the full generation: load translations, then emit I18nKeys.h,
    I18nStrings.h and I18nStrings.cpp into *output_dir*.

    Paths may be passed by the caller (e.g. PlatformIO), taken from
    sys.argv, or fall back to the default project-relative locations.
    """
    # Resolve input/output locations when not supplied by the caller.
    if translations_dir is None or output_dir is None:
        if len(sys.argv) == 3:
            translations_dir, output_dir = sys.argv[1], sys.argv[2]
        else:
            # Default for no arguments or weird arguments (e.g. SCons)
            translations_dir = "lib/I18n/translations"
            output_dir = "lib/I18n/"

    if not os.path.isdir(translations_dir):
        print(f"Error: Translations directory not found: {translations_dir}")
        sys.exit(1)

    if not os.path.isdir(output_dir):
        print(f"Error: Output directory not found: {output_dir}")
        sys.exit(1)

    print(f"Reading translations from: {translations_dir}")
    print(f"Output directory: {output_dir}")
    print()

    try:
        languages, language_names, string_keys, translations = load_translations(
            translations_dir
        )

        out_dir = Path(output_dir)
        generate_keys_header(
            languages, language_names, string_keys, str(out_dir / "I18nKeys.h")
        )
        generate_strings_header(
            languages, language_names, str(out_dir / "I18nStrings.h")
        )
        generate_strings_cpp(
            languages,
            language_names,
            string_keys,
            translations,
            str(out_dir / "I18nStrings.cpp"),
        )

        print()
        print("✓ Code generation complete!")
        print(f" Languages: {len(languages)}")
        print(f" String keys: {len(string_keys)}")

    except Exception as e:
        # Deliberately broad: any failure aborts generation with a message.
        print(f"\nError: {e}")
        sys.exit(1)
637
638
if __name__ == "__main__":
    # Normal CLI invocation: python gen_i18n.py <translations_dir> <output_dir>
    main()
else:
    # When imported by PlatformIO's SCons build (via extra_scripts), SCons
    # injects a global Import() function into the script's namespace.
    # Outside SCons that name does not exist, so a NameError simply means
    # "not a PlatformIO build" and the import becomes a no-op.
    try:
        Import("env")
        print("Running i18n generation script from PlatformIO...")
        main()
    except NameError:
        pass