A fork of https://github.com/crosspoint-reader/crosspoint-reader
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix: Account for `nbsp;` character as non-breaking space (#757)

## Summary

Closes #743.

**What is the goal of this PR?**

- Add back handling for HTML entities in expat. This was originally part
of the code that got removed
[here](https://github.com/crosspoint-reader/crosspoint-reader/pull/274)
- Handle `&nbsp;` characters to resolve issue #743

**What changes are included?**

- Brought back HTML entity table from previous commit and refactored it
to use a static const char * table with linear lookup to reduce heap
allocations.
- Used `XML_SetDefaultHandlerExpand` in expat to parse out the entities
correctly, without needing them defined in DOCTYPE
- Added handling for `&nbsp;` so that the text stays together and
doesn't break onto a new line between words separated by an `&nbsp;`

## Additional Context

- This supersedes [this
PR](https://github.com/crosspoint-reader/crosspoint-reader/pull/751)
that simply handled `&nbsp;` as whitespace. Instead, we want that
character to serve its true purpose and affect the line-breaking
algorithm.
- Updated my test EPUB [here](https://github.com/jdk2pq/css-test-epub)
with `&nbsp;` character examples at the end of the book

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code

authored by

Jake Kenneally and committed by
GitHub
6e51afb9 cb249474

+132
+3
lib/Epub/Epub/ParsedText.cpp
··· 32 32 // Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen. 33 33 uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word, 34 34 const EpdFontFamily::Style style, const bool appendHyphen = false) { 35 + if (word.size() == 1 && word[0] == ' ' && !appendHyphen) { 36 + return renderer.getSpaceWidth(fontId); 37 + } 35 38 const bool hasSoftHyphen = containsSoftHyphen(word); 36 39 if (!hasSoftHyphen && !appendHyphen) { 37 40 return renderer.getTextWidth(fontId, word.c_str(), style);
+76
lib/Epub/Epub/htmlEntities.cpp
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp

// Own header: keeps this definition checked against its declaration. The guard only
// lets the file compile on its own (e.g. in a host-side test) when the header is not
// on the include path.
#if __has_include("htmlEntities.h")
#include "htmlEntities.h"
#endif

#include <cstring>

// One row of the lookup table: the entity exactly as it appears in the document
// (leading '&' and trailing ';' included) and the UTF-8 bytes it expands to.
struct EntityPair {
  const char* key;
  const char* value;
};

// Static table of named HTML entities. It is scanned linearly, so no heap allocation
// is needed to build or query it.
//
// Values that are whitespace or zero-width code points are written as explicit UTF-8
// byte escapes instead of raw glyphs: a raw invisible character is indistinguishable
// from a plain space (or from nothing at all) in an editor and is easily altered by
// tooling, which would silently change what the entity expands to.
static const EntityPair ENTITY_LOOKUP[] = {
    {"&quot;", "\""}, {"&frasl;", "⁄"}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
    {"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
    {"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
    {"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
    {"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
    {"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
    {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
    {"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
    {"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
    {"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
    {"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
    {"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
    {"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
    {"&thorn;", "þ"}, {"&yuml;", "ÿ"},
    {"&nbsp;", "\xC2\xA0"},  // U+00A0 NO-BREAK SPACE
    {"&iexcl;", "¡"}, {"&cent;", "¢"},
    {"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
    {"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
    {"&shy;", "\xC2\xAD"},  // U+00AD SOFT HYPHEN
    {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
    {"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", "¶"},
    {"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
    {"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
    {"&forall;", "∀"}, {"&part;", "∂"}, {"&exist;", "∃"}, {"&empty;", "∅"}, {"&nabla;", "∇"},
    {"&isin;", "∈"}, {"&notin;", "∉"}, {"&ni;", "∋"}, {"&prod;", "∏"}, {"&sum;", "∑"},
    {"&minus;", "−"}, {"&lowast;", "∗"}, {"&radic;", "√"}, {"&prop;", "∝"}, {"&infin;", "∞"},
    {"&ang;", "∠"}, {"&and;", "∧"}, {"&or;", "∨"}, {"&cap;", "∩"}, {"&cup;", "∪"},
    {"&int;", "∫"}, {"&there4;", "∴"}, {"&sim;", "∼"}, {"&cong;", "≅"}, {"&asymp;", "≈"},
    {"&ne;", "≠"}, {"&equiv;", "≡"}, {"&le;", "≤"}, {"&ge;", "≥"}, {"&sub;", "⊂"},
    {"&sup;", "⊃"}, {"&nsub;", "⊄"}, {"&sube;", "⊆"}, {"&supe;", "⊇"}, {"&oplus;", "⊕"},
    {"&otimes;", "⊗"}, {"&perp;", "⊥"}, {"&sdot;", "⋅"}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
    {"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
    {"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
    {"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
    {"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
    {"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
    {"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
    {"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
    {"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
    {"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
    {"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
    {"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
    {"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"},
    {"&ensp;", "\xE2\x80\x82"},    // U+2002 EN SPACE
    {"&emsp;", "\xE2\x80\x83"},    // U+2003 EM SPACE
    {"&thinsp;", "\xE2\x80\x89"},  // U+2009 THIN SPACE
    {"&zwnj;", "\xE2\x80\x8C"},    // U+200C ZERO WIDTH NON-JOINER
    {"&zwj;", "\xE2\x80\x8D"},     // U+200D ZERO WIDTH JOINER
    {"&lrm;", "\xE2\x80\x8E"},     // U+200E LEFT-TO-RIGHT MARK
    {"&rlm;", "\xE2\x80\x8F"},     // U+200F RIGHT-TO-LEFT MARK
    {"&ndash;", "–"}, {"&mdash;", "—"}, {"&lsquo;", "‘"}, {"&rsquo;", "’"}, {"&sbquo;", "‚"},
    {"&ldquo;", "“"}, {"&rdquo;", "”"}, {"&bdquo;", "„"}, {"&dagger;", "†"}, {"&Dagger;", "‡"},
    {"&bull;", "•"}, {"&hellip;", "…"}, {"&permil;", "‰"}, {"&prime;", "′"}, {"&Prime;", "″"},
    {"&lsaquo;", "‹"}, {"&rsaquo;", "›"}, {"&oline;", "‾"}, {"&euro;", "€"}, {"&trade;", "™"},
    {"&larr;", "←"}, {"&uarr;", "↑"}, {"&rarr;", "→"}, {"&darr;", "↓"}, {"&harr;", "↔"},
    {"&crarr;", "↵"}, {"&lceil;", "⌈"}, {"&rceil;", "⌉"}, {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
    {"&loz;", "◊"}, {"&spades;", "♠"}, {"&clubs;", "♣"}, {"&hearts;", "♥"}, {"&diams;", "♦"}};

// Look up a single HTML entity and return its UTF-8 value.
//
//   entity  Entity text including the leading '&' and trailing ';'. Only the first
//           `len` bytes are read, so it does not have to be NUL-terminated.
//   len     Number of bytes of `entity` that make up the entity.
//
// Returns a pointer to a string literal from the table above (never to be freed),
// or nullptr when `entity` is null, `len` is not positive, or the entity is unknown.
// Matching is exact and case-sensitive ("&Alpha;" and "&alpha;" are distinct).
const char* lookupHtmlEntity(const char* entity, int len) {
  // Reject unusable input before the signed length is converted to size_t.
  if (entity == nullptr || len <= 0) {
    return nullptr;
  }

  const std::size_t entityLen = static_cast<std::size_t>(len);
  for (const EntityPair& pair : ENTITY_LOOKUP) {
    // Compare lengths first so memcmp never reads past the end of a shorter key.
    if (std::strlen(pair.key) == entityLen && std::memcmp(entity, pair.key, entityLen) == 0) {
      return pair.value;
    }
  }

  return nullptr;  // Entity not found
}
+9
lib/Epub/Epub/htmlEntities.h
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp

#pragma once
#include <string>

// Lookup a single HTML entity (including & and ;) and return its UTF-8 value.
//   entity: the entity text, e.g. "&nbsp;"; only the first `len` characters are compared
//   len:    length of the entity text, counting the leading '&' and trailing ';'
// Returns nullptr if entity is not found.
const char* lookupHtmlEntity(const char* entity, int len);
+43
lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
··· 6 6 #include <expat.h> 7 7 8 8 #include "../Page.h" 9 + #include "../htmlEntities.h" 9 10 10 11 const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"}; 11 12 constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]); ··· 359 360 continue; 360 361 } 361 362 363 + // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0 364 + // Render a visible space without allowing a line break around it. 365 + if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) { 366 + // Flush any pending text so style is applied correctly. 367 + if (self->partWordBufferIndex > 0) { 368 + self->flushPartWordBuffer(); 369 + } 370 + 371 + // Add a standalone space that attaches to the previous word. 372 + self->partWordBuffer[0] = ' '; 373 + self->partWordBuffer[1] = '\0'; 374 + self->partWordBufferIndex = 1; 375 + self->nextWordContinues = true; // Attach space to previous word (no break). 376 + self->flushPartWordBuffer(); 377 + 378 + // Ensure the next real word attaches to this space (no break). 
379 + self->nextWordContinues = true; 380 + 381 + i++; // Skip the second byte (0xA0) 382 + continue; 383 + } 384 + 362 385 // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF 363 386 const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF); 364 387 const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB); ··· 391 414 self->renderer, self->fontId, self->viewportWidth, 392 415 [self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false); 393 416 } 417 + } 418 + 419 + void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) { 420 + // Check if this looks like an entity reference (&...;) 421 + if (len >= 3 && s[0] == '&' && s[len - 1] == ';') { 422 + const char* utf8Value = lookupHtmlEntity(s, len); 423 + if (utf8Value != nullptr) { 424 + // Known entity: expand to its UTF-8 value 425 + characterData(userData, utf8Value, strlen(utf8Value)); 426 + return; 427 + } 428 + // Unknown entity: preserve original &...; sequence 429 + characterData(userData, s, len); 430 + return; 431 + } 432 + // Not an entity we recognize - skip it 394 433 } 395 434 396 435 void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) { ··· 480 519 LOG_ERR("EHP", "Couldn't allocate memory for parser"); 481 520 return false; 482 521 } 522 + 523 + // Handle HTML entities (like &nbsp;) that aren't in XML spec or DTD 524 + // Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE 525 + XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand); 483 526 484 527 FsFile file; 485 528 if (!Storage.openFileForRead("EHP", filepath, file)) {
+1
lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
··· 64 64 // XML callbacks 65 65 static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts); 66 66 static void XMLCALL characterData(void* userData, const XML_Char* s, int len); 67 + static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len); 67 68 static void XMLCALL endElement(void* userData, const XML_Char* name); 68 69 69 70 public: