A fork of https://github.com/crosspoint-reader/crosspoint-reader
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

perf: Optimize HTML entities lookup to O(log(n)) (#1194)

## Summary

**What is the goal of this PR?** Replace the linear scan in
`lookupHtmlEntity` with a simple binary search over the sorted
`ENTITY_LOOKUP` table to improve lookup performance.

**What changes are included?**
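`lib/Epub/Epub/htmlEntities.cpp`:
- Sorted the `ENTITY_LOOKUP` array.
- Added a compile-time assertion to guarantee the array remains sorted.
- Rewrote `lookupHtmlEntity` to use a binary search.
- Changed the `len` parameter of `lookupHtmlEntity` from `int` to `size_t`
  (declaration in `htmlEntities.h` and the call site in
  `ChapterHtmlSlimParser.cpp` updated to match).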

## Additional Context

Benchmarked on my x64 laptop (results will probably differ on RISC-V):
```
=== Benchmark (53 entities x 10000 iterations) ===

Version Total time Avg per lookup
----------------------------------------------
linear 236.97 ms total 447.11 ns/lookup
binary search 22.09 ms total 41.68 ns/lookup

=== Summary ===

Binary search is 10.73x faster than linear scan.
```

This is a simplified alternative to #1180, focused on keeping the
implementation clean and maintainable.

### AI Usage


Did you use AI tools to help write this code? _**< NO >**_

---------

Co-authored-by: Zach Nelson <zach@zdnelson.com>

authored by

Uri Tauber
Zach Nelson
and committed by
GitHub
1abe307f f7814cd1

+98 -60
+95 -57
lib/Epub/Epub/htmlEntities.cpp
··· 1 - // from 1 + // based on 2 2 // https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp 3 3 4 4 #include "htmlEntities.h" ··· 10 10 const char* value; 11 11 }; 12 12 13 - static const EntityPair ENTITY_LOOKUP[] = { 14 - {"&quot;", "\""}, {"&frasl;", "⁄"}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"}, 15 - {"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"}, 16 - {"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"}, 17 - {"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"}, 18 - {"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"}, 19 - {"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"}, 20 - {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"}, 21 - {"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"}, 22 - {"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"}, 23 - {"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"}, 24 - {"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"}, 25 - {"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"}, 26 - {"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"}, 27 - {"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", "\xC2\xA0"}, {"&iexcl;", "¡"}, {"&cent;", "¢"}, 28 - {"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"}, 29 - {"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"}, 30 - {"&shy;", "­"}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"}, 31 - {"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, 
{"&para;", "¶"}, 32 - {"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"}, 33 - {"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"}, 34 - {"&forall;", "∀"}, {"&part;", "∂"}, {"&exist;", "∃"}, {"&empty;", "∅"}, {"&nabla;", "∇"}, 35 - {"&isin;", "∈"}, {"&notin;", "∉"}, {"&ni;", "∋"}, {"&prod;", "∏"}, {"&sum;", "∑"}, 36 - {"&minus;", "−"}, {"&lowast;", "∗"}, {"&radic;", "√"}, {"&prop;", "∝"}, {"&infin;", "∞"}, 37 - {"&ang;", "∠"}, {"&and;", "∧"}, {"&or;", "∨"}, {"&cap;", "∩"}, {"&cup;", "∪"}, 38 - {"&int;", "∫"}, {"&there4;", "∴"}, {"&sim;", "∼"}, {"&cong;", "≅"}, {"&asymp;", "≈"}, 39 - {"&ne;", "≠"}, {"&equiv;", "≡"}, {"&le;", "≤"}, {"&ge;", "≥"}, {"&sub;", "⊂"}, 40 - {"&sup;", "⊃"}, {"&nsub;", "⊄"}, {"&sube;", "⊆"}, {"&supe;", "⊇"}, {"&oplus;", "⊕"}, 41 - {"&otimes;", "⊗"}, {"&perp;", "⊥"}, {"&sdot;", "⋅"}, {"&Alpha;", "Α"}, {"&Beta;", "Β"}, 42 - {"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"}, 43 - {"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"}, 44 - {"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"}, 45 - {"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"}, 46 - {"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"}, 47 - {"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"}, 48 - {"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"}, 49 - {"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"}, 50 - {"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"}, 51 - {"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"}, 52 - {"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"}, 53 - {"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, 
{"&ensp;", " "}, {"&emsp;", " "}, 54 - {"&thinsp;", " "}, {"&zwnj;", "‌"}, {"&zwj;", "‍"}, {"&lrm;", "‎"}, {"&rlm;", "‏"}, 55 - {"&ndash;", "–"}, {"&mdash;", "—"}, {"&lsquo;", "‘"}, {"&rsquo;", "’"}, {"&sbquo;", "‚"}, 56 - {"&ldquo;", "“"}, {"&rdquo;", "”"}, {"&bdquo;", "„"}, {"&dagger;", "†"}, {"&Dagger;", "‡"}, 57 - {"&bull;", "•"}, {"&hellip;", "…"}, {"&permil;", "‰"}, {"&prime;", "′"}, {"&Prime;", "″"}, 58 - {"&lsaquo;", "‹"}, {"&rsaquo;", "›"}, {"&oline;", "‾"}, {"&euro;", "€"}, {"&trade;", "™"}, 59 - {"&larr;", "←"}, {"&uarr;", "↑"}, {"&rarr;", "→"}, {"&darr;", "↓"}, {"&harr;", "↔"}, 60 - {"&crarr;", "↵"}, {"&lceil;", "⌈"}, {"&rceil;", "⌉"}, {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"}, 61 - {"&loz;", "◊"}, {"&spades;", "♠"}, {"&clubs;", "♣"}, {"&hearts;", "♥"}, {"&diams;", "♦"}}; 13 + // Sorted lexicographically by key to allow binary search. 14 + static constexpr EntityPair ENTITY_LOOKUP[] = { 15 + {"&AElig;", "Æ"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Agrave;", "À"}, {"&Alpha;", "Α"}, 16 + {"&Aring;", "Å"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"}, {"&Beta;", "Β"}, {"&Ccedil;", "Ç"}, 17 + {"&Chi;", "Χ"}, {"&Dagger;", "‡"}, {"&Delta;", "Δ"}, {"&ETH;", "Ð"}, {"&Eacute;", "É"}, 18 + {"&Ecirc;", "Ê"}, {"&Egrave;", "È"}, {"&Epsilon;", "Ε"}, {"&Eta;", "Η"}, {"&Euml;", "Ë"}, 19 + {"&Gamma;", "Γ"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"}, {"&Igrave;", "Ì"}, {"&Iota;", "Ι"}, 20 + {"&Iuml;", "Ï"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"}, {"&Ntilde;", "Ñ"}, 21 + {"&Nu;", "Ν"}, {"&OElig;", "Œ"}, {"&Oacute;", "Ó"}, {"&Ocirc;", "Ô"}, {"&Ograve;", "Ò"}, 22 + {"&Omega;", "Ω"}, {"&Omicron;", "Ο"}, {"&Oslash;", "Ø"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, 23 + {"&Phi;", "Φ"}, {"&Pi;", "Π"}, {"&Prime;", "″"}, {"&Psi;", "Ψ"}, {"&Rho;", "Ρ"}, 24 + {"&Scaron;", "Š"}, {"&Sigma;", "Σ"}, {"&THORN;", "Þ"}, {"&Tau;", "Τ"}, {"&Theta;", "Θ"}, 25 + {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Ugrave;", "Ù"}, {"&Upsilon;", "Υ"}, {"&Uuml;", "Ü"}, 26 + {"&Xi;", "Ξ"}, {"&Yacute;", "Ý"}, {"&Yuml;", 
"Ÿ"}, {"&Zeta;", "Ζ"}, {"&aacute;", "á"}, 27 + {"&acirc;", "â"}, {"&acute;", "´"}, {"&aelig;", "æ"}, {"&agrave;", "à"}, {"&alpha;", "α"}, 28 + {"&amp;", "&"}, {"&and;", "∧"}, {"&ang;", "∠"}, {"&aring;", "å"}, {"&asymp;", "≈"}, 29 + {"&atilde;", "ã"}, {"&auml;", "ä"}, {"&bdquo;", "„"}, {"&beta;", "β"}, {"&brvbar;", "¦"}, 30 + {"&bull;", "•"}, {"&cap;", "∩"}, {"&ccedil;", "ç"}, {"&cedil;", "¸"}, {"&cent;", "¢"}, 31 + {"&chi;", "χ"}, {"&circ;", "ˆ"}, {"&clubs;", "♣"}, {"&cong;", "≅"}, {"&copy;", "©"}, 32 + {"&crarr;", "↵"}, {"&cup;", "∪"}, {"&curren;", "¤"}, {"&dagger;", "†"}, {"&darr;", "↓"}, 33 + {"&deg;", "°"}, {"&delta;", "δ"}, {"&diams;", "♦"}, {"&divide;", "÷"}, {"&eacute;", "é"}, 34 + {"&ecirc;", "ê"}, {"&egrave;", "è"}, {"&empty;", "∅"}, {"&emsp;", " "}, {"&ensp;", " "}, 35 + {"&epsilon;", "ε"}, {"&equiv;", "≡"}, {"&eta;", "η"}, {"&eth;", "ð"}, {"&euml;", "ë"}, 36 + {"&euro;", "€"}, {"&exist;", "∃"}, {"&fnof;", "ƒ"}, {"&forall;", "∀"}, {"&frac12;", "½"}, 37 + {"&frac14;", "¼"}, {"&frac34;", "¾"}, {"&frasl;", "⁄"}, {"&gamma;", "γ"}, {"&ge;", "≥"}, 38 + {"&gt;", ">"}, {"&harr;", "↔"}, {"&hearts;", "♥"}, {"&hellip;", "…"}, {"&iacute;", "í"}, 39 + {"&icirc;", "î"}, {"&iexcl;", "¡"}, {"&igrave;", "ì"}, {"&infin;", "∞"}, {"&int;", "∫"}, 40 + {"&iota;", "ι"}, {"&iquest;", "¿"}, {"&isin;", "∈"}, {"&iuml;", "ï"}, {"&kappa;", "κ"}, 41 + {"&lambda;", "λ"}, {"&laquo;", "«"}, {"&larr;", "←"}, {"&lceil;", "⌈"}, {"&ldquo;", "\u201C"}, 42 + {"&le;", "≤"}, {"&lfloor;", "⌊"}, {"&lowast;", "∗"}, {"&loz;", "◊"}, {"&lrm;", "\u200E"}, 43 + {"&lsaquo;", "‹"}, {"&lsquo;", "\u2018"}, {"&lt;", "<"}, {"&macr;", "¯"}, {"&mdash;", "—"}, 44 + {"&micro;", "µ"}, {"&minus;", "−"}, {"&mu;", "μ"}, {"&nabla;", "∇"}, {"&nbsp;", "\xC2\xA0"}, 45 + {"&ndash;", "–"}, {"&ne;", "≠"}, {"&ni;", "∋"}, {"&not;", "¬"}, {"&notin;", "∉"}, 46 + {"&nsub;", "⊄"}, {"&ntilde;", "ñ"}, {"&nu;", "ν"}, {"&oacute;", "ó"}, {"&ocirc;", "ô"}, 47 + {"&oelig;", "œ"}, {"&ograve;", "ò"}, {"&oline;", "‾"}, {"&omega;", "ω"}, 
{"&omicron;", "ο"}, 48 + {"&oplus;", "⊕"}, {"&or;", "∨"}, {"&ordf;", "ª"}, {"&ordm;", "º"}, {"&oslash;", "ø"}, 49 + {"&otilde;", "õ"}, {"&otimes;", "⊗"}, {"&ouml;", "ö"}, {"&para;", "¶"}, {"&part;", "∂"}, 50 + {"&permil;", "‰"}, {"&perp;", "⊥"}, {"&phi;", "φ"}, {"&pi;", "π"}, {"&piv;", "ϖ"}, 51 + {"&plusmn;", "±"}, {"&pound;", "£"}, {"&prime;", "′"}, {"&prod;", "∏"}, {"&prop;", "∝"}, 52 + {"&psi;", "ψ"}, {"&quot;", "\""}, {"&radic;", "√"}, {"&raquo;", "»"}, {"&rarr;", "→"}, 53 + {"&rceil;", "⌉"}, {"&rdquo;", "\u201D"}, {"&reg;", "®"}, {"&rfloor;", "⌋"}, {"&rho;", "ρ"}, 54 + {"&rlm;", "\u200F"}, {"&rsaquo;", "›"}, {"&rsquo;", "\u2019"}, {"&sbquo;", "‚"}, {"&scaron;", "š"}, 55 + {"&sdot;", "⋅"}, {"&sect;", "§"}, {"&shy;", "\xC2\xAD"}, {"&sigma;", "σ"}, {"&sigmaf;", "ς"}, 56 + {"&sim;", "∼"}, {"&spades;", "♠"}, {"&sub;", "⊂"}, {"&sube;", "⊆"}, {"&sum;", "∑"}, 57 + {"&sup1;", "¹"}, {"&sup2;", "²"}, {"&sup3;", "³"}, {"&sup;", "⊃"}, {"&supe;", "⊇"}, 58 + {"&szlig;", "ß"}, {"&tau;", "τ"}, {"&there4;", "∴"}, {"&theta;", "θ"}, {"&thetasym;", "ϑ"}, 59 + {"&thinsp;", " "}, {"&thorn;", "þ"}, {"&tilde;", "˜"}, {"&times;", "×"}, {"&trade;", "™"}, 60 + {"&uacute;", "ú"}, {"&uarr;", "↑"}, {"&ucirc;", "û"}, {"&ugrave;", "ù"}, {"&uml;", "¨"}, 61 + {"&upsih;", "ϒ"}, {"&upsilon;", "υ"}, {"&uuml;", "ü"}, {"&xi;", "ξ"}, {"&yacute;", "ý"}, 62 + {"&yen;", "¥"}, {"&yuml;", "ÿ"}, {"&zeta;", "ζ"}, {"&zwj;", "\u200D"}, {"&zwnj;", "\u200C"}, 63 + }; 62 64 63 65 static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]); 64 66 65 - // Lookup a single HTML entity and return its UTF-8 value 66 - const char* lookupHtmlEntity(const char* entity, int len) { 67 - for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) { 68 - const char* key = ENTITY_LOOKUP[i].key; 67 + // Verify the table is sorted at compile time. 
68 + static constexpr int constexprStrcmp(const char* a, const char* b) { 69 + for (size_t i = 0;; i++) { 70 + if (a[i] != b[i]) return (unsigned char)a[i] < (unsigned char)b[i] ? -1 : 1; 71 + if (a[i] == '\0') return 0; 72 + } 73 + } 74 + 75 + static constexpr bool isTableSorted() { 76 + for (size_t i = 1; i < ENTITY_LOOKUP_COUNT; i++) { 77 + if (constexprStrcmp(ENTITY_LOOKUP[i - 1].key, ENTITY_LOOKUP[i].key) >= 0) return false; 78 + } 79 + return true; 80 + } 81 + static_assert(isTableSorted(), "ENTITY_LOOKUP must be sorted lexicographically by key"); 82 + 83 + // Lookup a single HTML entity and return its UTF-8 value. 84 + const char* lookupHtmlEntity(const char* entity, size_t len) { 85 + if (entity == nullptr || len == 0) return nullptr; 86 + 87 + size_t lo = 0; 88 + size_t hi = ENTITY_LOOKUP_COUNT; 89 + 90 + while (lo < hi) { 91 + const size_t mid = lo + (hi - lo) / 2; 92 + const char* key = ENTITY_LOOKUP[mid].key; 69 93 const size_t keyLen = strlen(key); 70 - if (static_cast<size_t>(len) == keyLen && memcmp(entity, key, keyLen) == 0) { 71 - return ENTITY_LOOKUP[i].value; 94 + const size_t cmpLen = (len < keyLen) ? len : keyLen; 95 + int cmp = memcmp(entity, key, cmpLen); 96 + if (cmp == 0) { 97 + // safety net: if prefix equal, shorter string is considered smaller 98 + if (len < keyLen) 99 + cmp = -1; 100 + else if (len > keyLen) 101 + cmp = 1; 102 + else 103 + cmp = 0; 72 104 } 105 + 106 + if (cmp == 0) return ENTITY_LOOKUP[mid].value; 107 + if (cmp < 0) 108 + hi = mid; 109 + else 110 + lo = mid + 1; 73 111 } 74 112 75 - return nullptr; // Entity not found 113 + return nullptr; 76 114 }
+2 -2
lib/Epub/Epub/htmlEntities.h
··· 1 - // from 1 + // based on 2 2 // https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp 3 3 4 4 #pragma once ··· 6 6 7 7 // Lookup a single HTML entity (including & and ;) and return its UTF-8 value 8 8 // Returns nullptr if entity is not found 9 - const char* lookupHtmlEntity(const char* entity, int len); 9 + const char* lookupHtmlEntity(const char* entity, size_t len);
+1 -1
lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
··· 761 761 void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) { 762 762 // Check if this looks like an entity reference (&...;) 763 763 if (len >= 3 && s[0] == '&' && s[len - 1] == ';') { 764 - const char* utf8Value = lookupHtmlEntity(s, len); 764 + const char* utf8Value = lookupHtmlEntity(s, static_cast<size_t>(len)); 765 765 if (utf8Value != nullptr) { 766 766 // Known entity: expand to its UTF-8 value 767 767 characterData(userData, utf8Value, strlen(utf8Value));