unicode: Unify check for UTF-16 BOM · tsiry-sandratraina.com/rockbox-zig@a23002c

+7 -8

apps/cuesheet.c

··· 231 231 char_enc = CHAR_ENC_UTF_8; 232 232 bom_read = BOM_UTF_8_SIZE; 233 233 } 234 - else if(!memcmp(line, BOM_UTF_16_LE, BOM_UTF_16_SIZE)) 234 + else 235 235 { 236 - char_enc = CHAR_ENC_UTF_16_LE; 237 - bom_read = BOM_UTF_16_SIZE; 238 - } 239 - else if(!memcmp(line, BOM_UTF_16_BE, BOM_UTF_16_SIZE)) 240 - { 241 - char_enc = CHAR_ENC_UTF_16_BE; 242 - bom_read = BOM_UTF_16_SIZE; 236 + bool le; 237 + if (utf16_has_bom(line, &le)) 238 + { 239 + char_enc = le ? CHAR_ENC_UTF_16_LE : CHAR_ENC_UTF_16_BE; 240 + bom_read = BOM_UTF_16_SIZE; 241 + } 243 242 } 244 243 } 245 244

+22

firmware/common/unicode.c

/* Check whether a UTF-16 byte sequence starts with a byte order mark.
 *
 * utf16: the first two bytes are read unconditionally, so at least two
 *        bytes must be readable.
 * le:    receives the byte order: true for little endian, false for big
 *        endian. It is written on every path; when no BOM is present the
 *        value is only a guess (see below). May be a null pointer if the
 *        caller only needs the return value.
 *
 * Returns true if a BOM (U+FEFF in either byte order) was found, false
 * otherwise. The input pointer is not advanced; skipping the BOM is left
 * to the caller.
 */
bool utf16_has_bom(const unsigned char *utf16, bool *le)
{
    /* Assemble the first code unit as if it were big endian. The cast keeps
       the shift in unsigned arithmetic regardless of the width of int. */
    unsigned long ucs = ((unsigned long)utf16[0] << 8) | utf16[1];
    bool has_bom = true;
    bool little_endian;

    if (ucs == 0xFEFF)      /* bytes FE FF: big endian BOM */
    {
        little_endian = false;
    }
    else if (ucs == 0xFFFE) /* bytes FF FE: little endian BOM */
    {
        little_endian = true;
    }
    else
    {
        /* If there is no BOM let's try to guess it. If one of the bytes is
           0x00, it is probably the most significant one. */
        has_bom = false;
        little_endian = (utf16[1] == 0);
    }

    if (le)
        *le = little_endian;

    return has_bom;
}

+2

firmware/include/rbunicode.h

··· 29 29 #define _RBUNICODE_H_ 30 30 31 31 #include "config.h" 32 + #include <stdbool.h> 32 33 33 34 #define MASK 0xC0 /* 11000000 */ 34 35 #define COMP 0x80 /* 10x */ ··· 58 59 unsigned char* iso_decode(const unsigned char *latin1, unsigned char *utf8, int cp, int count); 59 60 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count); 60 61 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count); 62 + bool utf16_has_bom(const unsigned char *utf16, bool *le); 61 63 unsigned long utf8length(const unsigned char *utf8); 62 64 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs); 63 65 void set_codepage(int cp);

+12 -25

lib/rbcodec/metadata/id3tags.c

··· 570 570 string. If it is, we convert it to a UTF-8 string. If it's not unicode, 571 571 we convert from the default codepage */ 572 572 static void unicode_munge(char* string, char* utf8buf, int *len) { 573 - long tmp; 574 - bool le = false; 575 573 int i = 0; 576 574 unsigned char *str = (unsigned char *)string; 577 575 int templen = 0; ··· 590 588 case 0x02: 591 589 (*len)--; 592 590 str++; 591 + bool le; 592 + 593 593 594 594 /* Handle frames with more than one string 595 595 (needed for TXXX frames).*/ 596 596 do { 597 - tmp = bytes2int(0, 0, str[0], str[1]); 598 - 599 - /* Now check if there is a BOM 600 - (zero-width non-breaking space, 0xfeff) 601 - and if it is in little or big endian format */ 602 - if(tmp == 0xfffe) { /* Little endian? */ 603 - le = true; 604 - str += 2; 605 - (*len)-=2; 606 - } else if(tmp == 0xfeff) { /* Big endian? */ 607 - str += 2; 608 - (*len)-=2; 609 - } else 610 - /* If there is no BOM (which is a specification violation), 611 - let's try to guess it. If one of the bytes is 0x00, it is 612 - probably the most significant one. */ 613 - if(str[1] == 0) 614 - le = true; 597 + if (utf16_has_bom(str, &le)) 598 + { 599 + str += BOM_UTF_16_SIZE; 600 + *len -= BOM_UTF_16_SIZE; 601 + } 615 602 616 603 while ((i < *len) && (str[0] || str[1])) { 617 604 if(le) ··· 734 721 switch (*(tag++)) 735 722 { 736 723 case 0x01: 737 - if (!memcmp(tag, BOM_UTF_16_BE, BOM_UTF_16_SIZE)) 738 - *char_enc = CHAR_ENC_UTF_16_BE; 739 - else if (!memcmp(tag, BOM_UTF_16_LE, BOM_UTF_16_SIZE)) 740 - *char_enc = CHAR_ENC_UTF_16_LE; 741 - else 724 + { 725 + bool le; 726 + if (!utf16_has_bom(tag, &le)) 742 727 return false; 743 728 729 + *char_enc = le ? CHAR_ENC_UTF_16_LE: CHAR_ENC_UTF_16_BE; 744 730 tag+= BOM_UTF_16_SIZE; 745 731 /* \1 + BOM(2) + C0U0E0S0H0E0E0T000 = 21 */ 746 732 *cuesheet_offset = 21; 747 733 break; 734 + } 748 735 749 736 case 0x02: 750 737 *char_enc = CHAR_ENC_UTF_16_BE;

Configure Feed

Configure Feed