A fork of https://github.com/crosspoint-reader/crosspoint-reader
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix: Add special handling for apostrophe hyphenation (#1318)

## Summary

* **What is the goal of this PR?** Fixing / extending the hyphenation
logic to deal with words containing an apostrophe, as raised in #1186
* **What changes are included?**

## Additional Context

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**PARTIALLY**_ (as the
user provided a thorough analysis that I followed)

authored by

jpirnay and committed by
GitHub
3dabd302 f1e9dc7f

+142 -25
+11
lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
··· 107 107 108 108 bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; } 109 109 110 + bool isApostrophe(const uint32_t cp) { 111 + switch (cp) { 112 + case '\'': 113 + case 0x2018: // left single quotation mark 114 + case 0x2019: // right single quotation mark 115 + return true; 116 + default: 117 + return false; 118 + } 119 + } 120 + 110 121 bool isExplicitHyphen(const uint32_t cp) { 111 122 switch (cp) { 112 123 case '-':
+1
lib/Epub/Epub/hyphenation/HyphenationCommon.h
··· 19 19 bool isAlphabetic(uint32_t cp); 20 20 bool isPunctuation(uint32_t cp); 21 21 bool isAsciiDigit(uint32_t cp); 22 + bool isApostrophe(uint32_t cp); 22 23 bool isExplicitHyphen(uint32_t cp); 23 24 bool isSoftHyphen(uint32_t cp); 24 25 void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps);
+120 -21
lib/Epub/Epub/hyphenation/Hyphenator.cpp
··· 1 1 #include "Hyphenator.h" 2 2 3 3 #include <algorithm> 4 + #include <cassert> 4 5 #include <vector> 5 6 6 7 #include "HyphenationCommon.h" ··· 59 60 return breaks; 60 61 } 61 62 63 + bool isSegmentSeparator(const uint32_t cp) { return isExplicitHyphen(cp) || isApostrophe(cp); } 64 + 65 + void appendSegmentPatternBreaks(const std::vector<CodepointInfo>& cps, const LanguageHyphenator& hyphenator, 66 + const bool includeFallback, std::vector<Hyphenator::BreakInfo>& outBreaks) { 67 + size_t segStart = 0; 68 + 69 + for (size_t i = 0; i <= cps.size(); ++i) { 70 + const bool atEnd = i == cps.size(); 71 + const bool atSeparator = !atEnd && isSegmentSeparator(cps[i].value); 72 + if (!atEnd && !atSeparator) { 73 + continue; 74 + } 75 + 76 + if (i > segStart) { 77 + std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i); 78 + auto segIndexes = hyphenator.breakIndexes(segment); 79 + 80 + if (includeFallback && segIndexes.empty()) { 81 + const size_t minPrefix = hyphenator.minPrefix(); 82 + const size_t minSuffix = hyphenator.minSuffix(); 83 + for (size_t idx = minPrefix; idx + minSuffix <= segment.size(); ++idx) { 84 + segIndexes.push_back(idx); 85 + } 86 + } 87 + 88 + for (const size_t idx : segIndexes) { 89 + assert(idx > 0 && idx < segment.size()); 90 + if (idx == 0 || idx >= segment.size()) continue; 91 + const size_t cpIdx = segStart + idx; 92 + if (cpIdx < cps.size()) { 93 + outBreaks.push_back({cps[cpIdx].byteOffset, true}); 94 + } 95 + } 96 + } 97 + 98 + segStart = i + 1; 99 + } 100 + } 101 + 102 + void appendApostropheContractionBreaks(const std::vector<CodepointInfo>& cps, 103 + std::vector<Hyphenator::BreakInfo>& outBreaks) { 104 + constexpr size_t kMinLeftSegmentLen = 3; 105 + constexpr size_t kMinRightSegmentLen = 2; 106 + size_t segmentStart = 0; 107 + 108 + for (size_t i = 0; i < cps.size(); ++i) { 109 + if (isSegmentSeparator(cps[i].value)) { 110 + if (isApostrophe(cps[i].value) && i > 0 && i + 1 < cps.size() && isAlphabetic(cps[i - 
1].value) && 111 + isAlphabetic(cps[i + 1].value)) { 112 + size_t leftPrefixLen = 0; 113 + for (size_t j = segmentStart; j < i; ++j) { 114 + if (isAlphabetic(cps[j].value)) { 115 + ++leftPrefixLen; 116 + } 117 + } 118 + 119 + size_t rightSuffixLen = 0; 120 + for (size_t j = i + 1; j < cps.size() && !isSegmentSeparator(cps[j].value); ++j) { 121 + if (isAlphabetic(cps[j].value)) { 122 + ++rightSuffixLen; 123 + } 124 + } 125 + 126 + // Avoid stranding short clitics like "l'"/"d'" or tiny suffixes like "'t". 127 + if (leftPrefixLen >= kMinLeftSegmentLen && rightSuffixLen >= kMinRightSegmentLen) { 128 + outBreaks.push_back({cps[i + 1].byteOffset, false}); 129 + } 130 + } 131 + segmentStart = i + 1; 132 + } 133 + } 134 + } 135 + 136 + void sortAndDedupeBreakInfos(std::vector<Hyphenator::BreakInfo>& infos) { 137 + std::sort(infos.begin(), infos.end(), [](const Hyphenator::BreakInfo& a, const Hyphenator::BreakInfo& b) { 138 + if (a.byteOffset != b.byteOffset) { 139 + return a.byteOffset < b.byteOffset; 140 + } 141 + return a.requiresInsertedHyphen < b.requiresInsertedHyphen; 142 + }); 143 + 144 + infos.erase(std::unique(infos.begin(), infos.end(), 145 + [](const Hyphenator::BreakInfo& a, const Hyphenator::BreakInfo& b) { 146 + return a.byteOffset == b.byteOffset; 147 + }), 148 + infos.end()); 149 + } 150 + 62 151 } // namespace 63 152 64 153 std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) { ··· 71 160 trimSurroundingPunctuationAndFootnote(cps); 72 161 const auto* hyphenator = cachedHyphenator_; 73 162 163 + // Detect apostrophe-like separators early; used by both branches below. 164 + bool hasApostropheLikeSeparator = false; 165 + for (const auto& cp : cps) { 166 + if (isApostrophe(cp.value)) { 167 + hasApostropheLikeSeparator = true; 168 + break; 169 + } 170 + } 171 + 74 172 // Explicit hyphen markers (soft or hard) take precedence over language breaks. 
75 173 auto explicitBreakInfos = buildExplicitBreakInfos(cps); 76 174 if (!explicitBreakInfos.empty()) { ··· 89 187 // @16 Satellitensys|tems (+hyphen) 90 188 // Result: 6 sorted break points; the line-breaker picks the widest prefix that fits. 91 189 if (hyphenator) { 92 - size_t segStart = 0; 93 - for (size_t i = 0; i <= cps.size(); ++i) { 94 - const bool atEnd = (i == cps.size()); 95 - const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value); 96 - if (atEnd || atHyphen) { 97 - if (i > segStart) { 98 - std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i); 99 - auto segIndexes = hyphenator->breakIndexes(segment); 100 - for (const size_t idx : segIndexes) { 101 - const size_t cpIdx = segStart + idx; 102 - if (cpIdx < cps.size()) { 103 - explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true}); 104 - } 105 - } 106 - } 107 - segStart = i + 1; 108 - } 109 - } 110 - // Merge explicit and pattern breaks into ascending byte-offset order. 111 - std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(), 112 - [](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; }); 190 + appendSegmentPatternBreaks(cps, *hyphenator, /*includeFallback=*/false, explicitBreakInfos); 191 + } 192 + // Also add apostrophe contraction breaks when present (e.g. "l'état-major" 193 + // has both an explicit hyphen and an apostrophe that can independently break). 194 + if (hasApostropheLikeSeparator) { 195 + appendApostropheContractionBreaks(cps, explicitBreakInfos); 113 196 } 197 + // Merge all break points into ascending byte-offset order. 198 + sortAndDedupeBreakInfos(explicitBreakInfos); 114 199 return explicitBreakInfos; 200 + } 201 + 202 + // Apostrophe-like separators split compounds into alphabetic segments; run Liang on each segment. 203 + // This allows words like "all'improvviso" to hyphenate within "improvviso" instead of becoming 204 + // completely unsplittable due to the apostrophe punctuation. 
Apostrophe contraction breaks are 205 + // applied regardless of whether a language hyphenator is available. 206 + if (hasApostropheLikeSeparator) { 207 + std::vector<BreakInfo> segmentedBreaks; 208 + if (hyphenator) { 209 + appendSegmentPatternBreaks(cps, *hyphenator, includeFallback, segmentedBreaks); 210 + } 211 + appendApostropheContractionBreaks(cps, segmentedBreaks); 212 + sortAndDedupeBreakInfos(segmentedBreaks); 213 + return segmentedBreaks; 115 214 } 116 215 117 216 // Ask language hyphenator for legal break points.
+10 -4
lib/Epub/Epub/hyphenation/Hyphenator.h
··· 11 11 struct BreakInfo { 12 12 size_t byteOffset; // Byte position inside the UTF-8 word where a break may occur. 13 13 bool requiresInsertedHyphen; // true = a visible '-' must be rendered at the break (pattern/fallback breaks). 14 - // false = the word already contains a hyphen at this position (explicit '-'). 14 + // false = break occurs at an existing visible separator boundary 15 + // (explicit '-' or eligible apostrophe contraction boundary). 15 16 }; 16 17 17 18 // Returns byte offsets where the word may be hyphenated. ··· 19 20 // Break sources (in priority order): 20 21 // 1. Explicit hyphens already present in the word (e.g. '-' or soft-hyphen U+00AD). 21 22 // When found, language patterns are additionally run on each alphabetic segment 22 - // between hyphens so compound words can break within their parts. 23 + // between separators so compound words can break within their parts. 23 24 // Example: "US-Satellitensystems" yields breaks after "US-" (no inserted hyphen) 24 25 // plus pattern breaks inside "Satellitensystems" (Sa|tel|li|ten|sys|tems). 25 - // 2. Language-specific Liang patterns (e.g. German de_patterns). 26 + // 2. Apostrophe contractions between letters (e.g. all'improvviso). 27 + // Liang patterns are run per alphabetic segment around apostrophes. 28 + // A direct break at the apostrophe boundary is allowed only when the left 29 + // segment has at least 3 letters and the right segment has at least 2 letters, 30 + // avoiding short clitics (e.g. l', d') and short contraction tails (e.g. can't). 31 + // 3. Language-specific Liang patterns (e.g. German de_patterns). 26 32 // Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter. 27 - // 3. Fallback every-N-chars splitting (only when includeFallback is true AND no 33 + // 4. Fallback every-N-chars splitting (only when includeFallback is true AND no 28 34 // pattern breaks were found). Used as a last resort to prevent a single oversized 29 35 // word from overflowing the page width. 
30 36 static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);