mirror of Walter-Sparrow / lunar-tear
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add UTF-8 handling for non-ASCII characters in path processing

+61
+61
server/internal/service/listbin.go
··· 7 7 "strconv" 8 8 "strings" 9 9 "sync" 10 + "unicode/utf8" 10 11 ) 11 12 12 13 // listBinEntry holds path (')' as segment separator) and size from list.bin; Size is 0 when not present. ··· 318 319 IsLocaleFallback bool 319 320 } 320 321 322 + // utf8ToMojibake re-encodes non-ASCII runes as if each of their UTF-8 bytes 323 + // were a Latin-1 codepoint. This matches filenames extracted by tools that 324 + // misinterpret UTF-8 paths as Latin-1 (double-encoding). For example, 325 + // U+FF12 (fullwidth 2, bytes EF BC 92) becomes U+00EF U+00BC U+0092 326 + // (bytes C3 AF C2 BC C2 92). 327 + func utf8ToMojibake(s string) string { 328 + var b strings.Builder 329 + changed := false 330 + for _, r := range s { 331 + if r >= 0x80 { 332 + var buf [4]byte 333 + n := utf8.EncodeRune(buf[:], r) 334 + for i := 0; i < n; i++ { 335 + b.WriteRune(rune(buf[i])) 336 + } 337 + changed = true 338 + } else { 339 + b.WriteRune(r) 340 + } 341 + } 342 + if !changed { 343 + return s 344 + } 345 + return b.String() 346 + } 347 + 348 + // normalizeFullwidth replaces fullwidth Unicode characters (U+FF01–U+FF5E) 349 + // with their ASCII equivalents (U+0021–U+007E). 350 + func normalizeFullwidth(s string) string { 351 + var b strings.Builder 352 + changed := false 353 + for _, r := range s { 354 + if r >= 0xFF01 && r <= 0xFF5E { 355 + b.WriteByte(byte(r - 0xFF01 + 0x21)) 356 + changed = true 357 + } else { 358 + b.WriteRune(r) 359 + } 360 + } 361 + if !changed { 362 + return s 363 + } 364 + return b.String() 365 + } 366 + 367 + func hasNonASCII(s string) bool { 368 + for _, r := range s { 369 + if r >= 0x80 { 370 + return true 371 + } 372 + } 373 + return false 374 + } 375 + 321 376 // pathStrToFullPaths converts a list.bin path string (using ')' separators) into filesystem 322 377 // candidates. The original locale path is returned first; if the path contains ja or ko, 323 378 // an en locale fallback is appended (marked IsLocaleFallback so callers can skip MD5 validation). 379 + // For paths with non-ASCII characters, mojibake (double-encoded) and fullwidth-to-ASCII 380 + // variants are also tried. 324 381 func pathStrToFullPaths(revision, assetType, pathStr string) []pathCandidate { 325 382 fsPath := strings.ReplaceAll(pathStr, ")", "/") 326 383 if strings.Contains(fsPath, "..") || filepath.IsAbs(fsPath) || strings.HasPrefix(fsPath, "/") { ··· 335 392 fallback bool 336 393 } 337 394 entries := []tagged{{pathStr, false}} 395 + if hasNonASCII(pathStr) { 396 + entries = append(entries, tagged{utf8ToMojibake(pathStr), false}) 397 + entries = append(entries, tagged{normalizeFullwidth(pathStr), false}) 398 + } 338 399 if strings.Contains(pathStr, ")ja)") { 339 400 entries = append(entries, tagged{strings.ReplaceAll(pathStr, ")ja)", ")en)"), true}) 340 401 }