fix: Account for `&nbsp;` entity as non-breaking space (#757) · edouard.paris/crosspoint-reader@6e51afb

+3

lib/Epub/Epub/ParsedText.cpp

··· 32 32 // Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen. 33 33 uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word, 34 34 const EpdFontFamily::Style style, const bool appendHyphen = false) { 35 + if (word.size() == 1 && word[0] == ' ' && !appendHyphen) { 36 + return renderer.getSpaceWidth(fontId); 37 + } 35 38 const bool hasSoftHyphen = containsSoftHyphen(word); 36 39 if (!hasSoftHyphen && !appendHyphen) { 37 40 return renderer.getTextWidth(fontId, word.c_str(), style);

+76

lib/Epub/Epub/htmlEntities.cpp

// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp

// The definition below does not depend on the declaration in the header, so the
// include is skipped when the header is not on the include path. This keeps the
// translation unit compilable on its own (e.g. in a host-side test build).
#if defined(__has_include)
#if __has_include("htmlEntities.h")
#include "htmlEntities.h"
#endif
#else
#include "htmlEntities.h"
#endif

#include <cstring>

// One row of the lookup table: the full entity reference as it appears in the
// document (leading '&' and trailing ';' included) and its UTF-8 replacement.
struct EntityPair {
  const char* key;
  const char* value;
};

// Named HTML character entity references and their UTF-8 encodings.
// Keys are matched byte-for-byte, so they are case-sensitive ("&Agrave;" and
// "&agrave;" are different entries).
// Whitespace and invisible replacement characters are written as explicit
// UTF-8 byte escapes so that they stay visible and survive editors and
// copy/paste: &nbsp; U+00A0, &shy; U+00AD, &ensp; U+2002, &emsp; U+2003,
// &thinsp; U+2009, &zwnj; U+200C, &zwj; U+200D, &lrm; U+200E, &rlm; U+200F.
static const EntityPair ENTITY_LOOKUP[] = {
    {"&quot;", "\""}, {"&frasl;", "⁄"}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
    {"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
    {"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
    {"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
    {"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
    {"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
    {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
    {"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
    {"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
    {"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
    {"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
    {"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
    {"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
    {"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", "\xC2\xA0"}, {"&iexcl;", "¡"}, {"&cent;", "¢"},
    {"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
    {"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
    {"&shy;", "\xC2\xAD"}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
    {"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", "¶"},
    {"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
    {"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
    {"&forall;", "∀"}, {"&part;", "∂"}, {"&exist;", "∃"}, {"&empty;", "∅"}, {"&nabla;", "∇"},
    {"&isin;", "∈"}, {"&notin;", "∉"}, {"&ni;", "∋"}, {"&prod;", "∏"}, {"&sum;", "∑"},
    {"&minus;", "−"}, {"&lowast;", "∗"}, {"&radic;", "√"}, {"&prop;", "∝"}, {"&infin;", "∞"},
    {"&ang;", "∠"}, {"&and;", "∧"}, {"&or;", "∨"}, {"&cap;", "∩"}, {"&cup;", "∪"},
    {"&int;", "∫"}, {"&there4;", "∴"}, {"&sim;", "∼"}, {"&cong;", "≅"}, {"&asymp;", "≈"},
    {"&ne;", "≠"}, {"&equiv;", "≡"}, {"&le;", "≤"}, {"&ge;", "≥"}, {"&sub;", "⊂"},
    {"&sup;", "⊃"}, {"&nsub;", "⊄"}, {"&sube;", "⊆"}, {"&supe;", "⊇"}, {"&oplus;", "⊕"},
    {"&otimes;", "⊗"}, {"&perp;", "⊥"}, {"&sdot;", "⋅"}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
    {"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
    {"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
    {"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
    {"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
    {"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
    {"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
    {"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
    {"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
    {"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
    {"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
    {"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
    {"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, {"&ensp;", "\xE2\x80\x82"}, {"&emsp;", "\xE2\x80\x83"},
    {"&thinsp;", "\xE2\x80\x89"}, {"&zwnj;", "\xE2\x80\x8C"}, {"&zwj;", "\xE2\x80\x8D"}, {"&lrm;", "\xE2\x80\x8E"}, {"&rlm;", "\xE2\x80\x8F"},
    {"&ndash;", "–"}, {"&mdash;", "—"}, {"&lsquo;", "‘"}, {"&rsquo;", "’"}, {"&sbquo;", "‚"},
    {"&ldquo;", "“"}, {"&rdquo;", "”"}, {"&bdquo;", "„"}, {"&dagger;", "†"}, {"&Dagger;", "‡"},
    {"&bull;", "•"}, {"&hellip;", "…"}, {"&permil;", "‰"}, {"&prime;", "′"}, {"&Prime;", "″"},
    {"&lsaquo;", "‹"}, {"&rsaquo;", "›"}, {"&oline;", "‾"}, {"&euro;", "€"}, {"&trade;", "™"},
    {"&larr;", "←"}, {"&uarr;", "↑"}, {"&rarr;", "→"}, {"&darr;", "↓"}, {"&harr;", "↔"},
    {"&crarr;", "↵"}, {"&lceil;", "⌈"}, {"&rceil;", "⌉"}, {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
    {"&loz;", "◊"}, {"&spades;", "♠"}, {"&clubs;", "♣"}, {"&hearts;", "♥"}, {"&diams;", "♦"}};

// Lookup a single HTML entity and return its UTF-8 value.
//
// entity: start of the entity reference, including the leading '&' and the
//         trailing ';'. Only the first `len` bytes are read, so the buffer
//         does not have to be NUL-terminated.
// len:    number of bytes in the reference.
//
// Returns a pointer to a NUL-terminated UTF-8 string literal stored in the
// table above (never to be freed by the caller), or nullptr when the entity
// is unknown, `entity` is null, or `len` is too short to hold "&x;".
const char* lookupHtmlEntity(const char* entity, int len) {
  // Reject null input and non-positive/too-short lengths up front; this also
  // keeps a negative `len` from turning into a huge size_t below.
  if (entity == nullptr || len < 3) {
    return nullptr;
  }

  const std::size_t entityLen = static_cast<std::size_t>(len);
  for (const EntityPair& pair : ENTITY_LOOKUP) {
    // Length check first: memcmp must never read past the end of the key.
    if (std::strlen(pair.key) == entityLen && std::memcmp(entity, pair.key, entityLen) == 0) {
      return pair.value;
    }
  }

  return nullptr;  // Entity not found
}

+9

lib/Epub/Epub/htmlEntities.h

// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp

#pragma once
#include <string>

// Lookup a single HTML entity (including & and ;) and return its UTF-8 value.
//
// entity: start of the entity reference text, e.g. "&eacute;". Only the first
//         `len` bytes are compared, so it does not need to be NUL-terminated.
// len:    number of bytes in the reference, counting the '&' and the ';'.
//
// The match is an exact, case-sensitive byte comparison against the table in
// htmlEntities.cpp.
// Returns a NUL-terminated UTF-8 string stored in that table (the caller must
// not free or modify it), or nullptr if the entity is not found.
const char* lookupHtmlEntity(const char* entity, int len);

+43

lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp

··· 6 6 #include <expat.h> 7 7 8 8 #include "../Page.h" 9 + #include "../htmlEntities.h" 9 10 10 11 const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"}; 11 12 constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]); ··· 359 360 continue; 360 361 } 361 362 363 + // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0 364 + // Render a visible space without allowing a line break around it. 365 + if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) { 366 + // Flush any pending text so style is applied correctly. 367 + if (self->partWordBufferIndex > 0) { 368 + self->flushPartWordBuffer(); 369 + } 370 + 371 + // Add a standalone space that attaches to the previous word. 372 + self->partWordBuffer[0] = ' '; 373 + self->partWordBuffer[1] = '\0'; 374 + self->partWordBufferIndex = 1; 375 + self->nextWordContinues = true; // Attach space to previous word (no break). 376 + self->flushPartWordBuffer(); 377 + 378 + // Ensure the next real word attaches to this space (no break). 
379 + self->nextWordContinues = true; 380 + 381 + i++; // Skip the second byte (0xA0) 382 + continue; 383 + } 384 + 362 385 // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF 363 386 const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF); 364 387 const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB); ··· 391 414 self->renderer, self->fontId, self->viewportWidth, 392 415 [self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false); 393 416 } 417 + } 418 + 419 + void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) { 420 + // Check if this looks like an entity reference (&...;) 421 + if (len >= 3 && s[0] == '&' && s[len - 1] == ';') { 422 + const char* utf8Value = lookupHtmlEntity(s, len); 423 + if (utf8Value != nullptr) { 424 + // Known entity: expand to its UTF-8 value 425 + characterData(userData, utf8Value, strlen(utf8Value)); 426 + return; 427 + } 428 + // Unknown entity: preserve original &...; sequence 429 + characterData(userData, s, len); 430 + return; 431 + } 432 + // Not an entity we recognize - skip it 394 433 } 395 434 396 435 void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) { ··· 480 519 LOG_ERR("EHP", "Couldn't allocate memory for parser"); 481 520 return false; 482 521 } 522 + 523 + // Handle HTML entities (like  ) that aren't in XML spec or DTD 524 + // Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE 525 + XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand); 483 526 484 527 FsFile file; 485 528 if (!Storage.openFileForRead("EHP", filepath, file)) {

+1

lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h

··· 64 64 // XML callbacks 65 65 static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts); 66 66 static void XMLCALL characterData(void* userData, const XML_Char* s, int len); 67 + static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len); 67 68 static void XMLCALL endElement(void* userData, const XML_Char* name); 68 69 69 70 public:

Configure Feed

Configure Feed