A fork of https://github.com/crosspoint-reader/crosspoint-reader
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix: Remove separations after style changes (#720)

Closes #182. Closes #710. Closes #711.

## Summary

**What is the goal of this PR?**
- A longer-term, more robust fix for the issue with spurious spaces
appearing after style changes. Replaces solution from #694.

**What changes are included?**
- Add continuation flags to determine whether a word is preceded by a space
or attaches directly to the previous word. Replaces the simple solution that
only considered ending punctuation.
- Fixed an issue with the greedy line-breaking algorithm where punctuation
could appear on the next line, separated from its word, if there was a
style change between the word and the punctuation.

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code

authored by

Jake Kenneally and committed by
GitHub
9f78fd33 bd8132a2

+108 -70
+68 -64
lib/Epub/Epub/ParsedText.cpp
··· 19 19 constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD"; 20 20 constexpr size_t SOFT_HYPHEN_BYTES = 2; 21 21 22 - // Known attaching punctuation (including UTF-8 sequences) 23 - const std::vector<std::string> punctuation = { 24 - ".", 25 - ",", 26 - "!", 27 - "?", 28 - ";", 29 - ":", 30 - "\"", 31 - "'", 32 - "\xE2\x80\x99", // ’ (U+2019 right single quote) 33 - "\xE2\x80\x9D" // ” (U+201D right double quote) 34 - }; 35 - 36 - bool isAttachingPunctuationWord(const std::string& word) { 37 - if (word.empty()) return false; 38 - 39 - size_t pos = 0; 40 - while (pos < word.size()) { 41 - bool matched = false; 42 - for (const auto& p : punctuation) { 43 - if (word.compare(pos, p.size(), p) == 0) { 44 - pos += p.size(); 45 - matched = true; 46 - break; 47 - } 48 - } 49 - if (!matched) return false; 50 - } 51 - return true; 52 - } 53 - 54 22 bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; } 55 23 56 24 // Removes every soft hyphen in-place so rendered glyphs match measured widths. 
··· 81 49 82 50 } // namespace 83 51 84 - void ParsedText::addWord(std::string word, const EpdFontFamily::Style style, const bool underline) { 52 + void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline, 53 + const bool attachToPrevious) { 85 54 if (word.empty()) return; 86 55 87 56 words.push_back(std::move(word)); 88 - EpdFontFamily::Style combinedStyle = style; 57 + EpdFontFamily::Style combinedStyle = fontStyle; 89 58 if (underline) { 90 59 combinedStyle = static_cast<EpdFontFamily::Style>(combinedStyle | EpdFontFamily::UNDERLINE); 91 60 } 92 61 wordStyles.push_back(combinedStyle); 62 + wordContinues.push_back(attachToPrevious); 93 63 } 94 64 95 65 // Consumes data to minimize memory usage ··· 106 76 const int pageWidth = viewportWidth; 107 77 const int spaceWidth = renderer.getSpaceWidth(fontId); 108 78 auto wordWidths = calculateWordWidths(renderer, fontId); 79 + 80 + // Build indexed continues vector from the parallel list for O(1) access during layout 81 + std::vector<bool> continuesVec(wordContinues.begin(), wordContinues.end()); 82 + 109 83 std::vector<size_t> lineBreakIndices; 110 84 if (hyphenationEnabled) { 111 85 // Use greedy layout that can split words mid-loop when a hyphenated prefix fits. 112 - lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); 86 + lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec); 113 87 } else { 114 - lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); 88 + lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec); 115 89 } 116 90 const size_t lineCount = includeLastLine ? 
lineBreakIndices.size() : lineBreakIndices.size() - 1; 117 91 118 92 for (size_t i = 0; i < lineCount; ++i) { 119 - extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine); 93 + extractLine(i, pageWidth, spaceWidth, wordWidths, continuesVec, lineBreakIndices, processLine); 120 94 } 121 95 } 122 96 ··· 140 114 } 141 115 142 116 std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth, 143 - const int spaceWidth, std::vector<uint16_t>& wordWidths) { 117 + const int spaceWidth, std::vector<uint16_t>& wordWidths, 118 + std::vector<bool>& continuesVec) { 144 119 if (words.empty()) { 145 120 return {}; 146 121 } ··· 157 132 // First word needs to fit in reduced width if there's an indent 158 133 const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; 159 134 while (wordWidths[i] > effectiveWidth) { 160 - if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) { 135 + if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true, 136 + &continuesVec)) { 161 137 break; 162 138 } 163 139 } ··· 175 151 ans[totalWordCount - 1] = totalWordCount - 1; 176 152 177 153 for (int i = totalWordCount - 2; i >= 0; --i) { 178 - int currlen = -spaceWidth; 154 + int currlen = 0; 179 155 dp[i] = MAX_COST; 180 156 181 157 // First line has reduced width due to text-indent 182 158 const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; 183 159 184 160 for (size_t j = i; j < totalWordCount; ++j) { 185 - // Current line length: previous width + space + current word width 186 - currlen += wordWidths[j] + spaceWidth; 161 + // Add space before word j, unless it's the first word on the line or a continuation 162 + const int gap = j > static_cast<size_t>(i) && !continuesVec[j] ? 
spaceWidth : 0; 163 + currlen += wordWidths[j] + gap; 187 164 188 165 if (currlen > effectivePageWidth) { 189 166 break; 167 + } 168 + 169 + // Cannot break after word j if the next word attaches to it (continuation group) 170 + if (j + 1 < totalWordCount && continuesVec[j + 1]) { 171 + continue; 190 172 } 191 173 192 174 int cost; ··· 260 242 // Builds break indices while opportunistically splitting the word that would overflow the current line. 261 243 std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId, 262 244 const int pageWidth, const int spaceWidth, 263 - std::vector<uint16_t>& wordWidths) { 245 + std::vector<uint16_t>& wordWidths, 246 + std::vector<bool>& continuesVec) { 264 247 // Calculate first line indent (only for left/justified text without extra paragraph spacing) 265 248 const int firstLineIndent = 266 249 blockStyle.textIndent > 0 && !extraParagraphSpacing && ··· 282 265 // Consume as many words as possible for current line, splitting when prefixes fit 283 266 while (currentIndex < wordWidths.size()) { 284 267 const bool isFirstWord = currentIndex == lineStart; 285 - const int spacing = isFirstWord ? 0 : spaceWidth; 268 + const int spacing = isFirstWord || continuesVec[currentIndex] ? 
0 : spaceWidth; 286 269 const int candidateWidth = spacing + wordWidths[currentIndex]; 287 270 288 271 // Word fits on current line ··· 296 279 const int availableWidth = effectivePageWidth - lineWidth - spacing; 297 280 const bool allowFallbackBreaks = isFirstWord; // Only for first word on line 298 281 299 - if (availableWidth > 0 && 300 - hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) { 282 + if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, 283 + allowFallbackBreaks, &continuesVec)) { 301 284 // Prefix now fits; append it to this line and move to next line 302 285 lineWidth += spacing + wordWidths[currentIndex]; 303 286 ++currentIndex; ··· 312 295 break; 313 296 } 314 297 298 + // Don't break before a continuation word (e.g., orphaned "?" after "question"). 299 + // Backtrack to the start of the continuation group so the whole group moves to the next line. 300 + while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) { 301 + --currentIndex; 302 + } 303 + 315 304 lineBreakIndices.push_back(currentIndex); 316 305 isFirstLine = false; 317 306 } ··· 323 312 // available width. 324 313 bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer, 325 314 const int fontId, std::vector<uint16_t>& wordWidths, 326 - const bool allowFallbackBreaks) { 315 + const bool allowFallbackBreaks, std::vector<bool>* continuesVec) { 327 316 // Guard against invalid indices or zero available width before attempting to split. 328 317 if (availableWidth <= 0 || wordIndex >= words.size()) { 329 318 return false; ··· 378 367 wordIt->push_back('-'); 379 368 } 380 369 381 - // Insert the remainder word (with matching style) directly after the prefix. 370 + // Insert the remainder word (with matching style and continuation flag) directly after the prefix. 
382 371 auto insertWordIt = std::next(wordIt); 383 372 auto insertStyleIt = std::next(styleIt); 384 373 words.insert(insertWordIt, remainder); 385 374 wordStyles.insert(insertStyleIt, style); 386 375 376 + // The remainder inherits whatever continuation status the original word had with the word after it. 377 + // Find the continues entry for the original word and insert the remainder's entry after it. 378 + auto continuesIt = wordContinues.begin(); 379 + std::advance(continuesIt, wordIndex); 380 + const bool originalContinuedToNext = *continuesIt; 381 + // The original word (now prefix) does NOT continue to remainder (hyphen separates them) 382 + *continuesIt = false; 383 + const auto insertContinuesIt = std::next(continuesIt); 384 + wordContinues.insert(insertContinuesIt, originalContinuedToNext); 385 + 386 + // Keep the indexed vector in sync if provided 387 + if (continuesVec) { 388 + (*continuesVec)[wordIndex] = false; 389 + continuesVec->insert(continuesVec->begin() + wordIndex + 1, originalContinuedToNext); 390 + } 391 + 387 392 // Update cached widths to reflect the new prefix/remainder pairing. 388 393 wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth); 389 394 const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style); ··· 392 397 } 393 398 394 399 void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth, 395 - const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices, 400 + const std::vector<uint16_t>& wordWidths, const std::vector<bool>& continuesVec, 401 + const std::vector<size_t>& lineBreakIndices, 396 402 const std::function<void(std::shared_ptr<TextBlock>)>& processLine) { 397 403 const size_t lineBreak = lineBreakIndices[breakIndex]; 398 404 const size_t lastBreakAt = breakIndex > 0 ? 
lineBreakIndices[breakIndex - 1] : 0; ··· 407 413 : 0; 408 414 409 415 // Calculate total word width for this line and count actual word gaps 410 - // (punctuation that attaches to previous word doesn't count as a gap) 411 - // Note: words list starts at the beginning because previous lines were spliced out 416 + // (continuation words attach to previous word with no gap) 412 417 int lineWordWidthSum = 0; 413 418 size_t actualGapCount = 0; 414 - auto countWordIt = words.begin(); 415 419 416 420 for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { 417 421 lineWordWidthSum += wordWidths[lastBreakAt + wordIdx]; 418 - // Count gaps: each word after the first creates a gap, unless it's attaching punctuation 419 - if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) { 422 + // Count gaps: each word after the first creates a gap, unless it's a continuation 423 + if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) { 420 424 actualGapCount++; 421 425 } 422 - ++countWordIt; 423 426 } 424 427 425 428 // Calculate spacing (account for indent reducing effective page width on first line) ··· 443 446 } 444 447 445 448 // Pre-calculate X positions for words 446 - // Punctuation that attaches to the previous word doesn't get space before it 447 - // Note: words list starts at the beginning because previous lines were spliced out 449 + // Continuation words attach to the previous word with no space before them 448 450 std::list<uint16_t> lineXPos; 449 - auto wordIt = words.begin(); 450 451 451 452 for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { 452 453 const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx]; 453 454 454 455 lineXPos.push_back(xpos); 455 456 456 - // Add spacing after this word, unless the next word is attaching punctuation 457 - auto nextWordIt = wordIt; 458 - ++nextWordIt; 459 - const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt); 457 + // Add spacing after 
this word, unless the next word is a continuation 458 + const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1]; 460 459 461 - xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing); 462 - ++wordIt; 460 + xpos += currentWordWidth + (nextIsContinuation ? 0 : spacing); 463 461 } 464 462 465 463 // Iterators always start at the beginning as we are moving content with splice below 466 464 auto wordEndIt = words.begin(); 467 465 auto wordStyleEndIt = wordStyles.begin(); 466 + auto wordContinuesEndIt = wordContinues.begin(); 468 467 std::advance(wordEndIt, lineWordCount); 469 468 std::advance(wordStyleEndIt, lineWordCount); 469 + std::advance(wordContinuesEndIt, lineWordCount); 470 470 471 471 // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** 472 472 std::list<std::string> lineWords; 473 473 lineWords.splice(lineWords.begin(), words, words.begin(), wordEndIt); 474 474 std::list<EpdFontFamily::Style> lineWordStyles; 475 475 lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt); 476 + 477 + // Consume continues flags (not passed to TextBlock, but must be consumed to stay in sync) 478 + std::list<bool> lineContinues; 479 + lineContinues.splice(lineContinues.begin(), wordContinues, wordContinues.begin(), wordContinuesEndIt); 476 480 477 481 for (auto& word : lineWords) { 478 482 if (containsSoftHyphen(word)) {
+8 -5
lib/Epub/Epub/ParsedText.h
··· 16 16 class ParsedText { 17 17 std::list<std::string> words; 18 18 std::list<EpdFontFamily::Style> wordStyles; 19 + std::list<bool> wordContinues; // true = word attaches to previous (no space before it) 19 20 BlockStyle blockStyle; 20 21 bool extraParagraphSpacing; 21 22 bool hyphenationEnabled; 22 23 23 24 void applyParagraphIndent(); 24 25 std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth, 25 - std::vector<uint16_t>& wordWidths); 26 + std::vector<uint16_t>& wordWidths, std::vector<bool>& continuesVec); 26 27 std::vector<size_t> computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, 27 - int spaceWidth, std::vector<uint16_t>& wordWidths); 28 + int spaceWidth, std::vector<uint16_t>& wordWidths, 29 + std::vector<bool>& continuesVec); 28 30 bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId, 29 - std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks); 31 + std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks, 32 + std::vector<bool>* continuesVec = nullptr); 30 33 void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths, 31 - const std::vector<size_t>& lineBreakIndices, 34 + const std::vector<bool>& continuesVec, const std::vector<size_t>& lineBreakIndices, 32 35 const std::function<void(std::shared_ptr<TextBlock>)>& processLine); 33 36 std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId); 34 37 ··· 38 41 : blockStyle(blockStyle), extraParagraphSpacing(extraParagraphSpacing), hyphenationEnabled(hyphenationEnabled) {} 39 42 ~ParsedText() = default; 40 43 41 - void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false); 44 + void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false, bool attachToPrevious = false); 42 45 void setBlockStyle(const BlockStyle& blockStyle) { 
this->blockStyle = blockStyle; } 43 46 BlockStyle& getBlockStyle() { return blockStyle; } 44 47 size_t size() const { return words.size(); }
+31 -1
lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
··· 90 90 91 91 // flush the buffer 92 92 partWordBuffer[partWordBufferIndex] = '\0'; 93 - currentTextBlock->addWord(partWordBuffer, fontStyle); 93 + currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues); 94 94 partWordBufferIndex = 0; 95 + nextWordContinues = false; 95 96 } 96 97 97 98 // start a new text block if needed 98 99 void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) { 100 + nextWordContinues = false; // New block = new paragraph, no continuation 99 101 if (currentTextBlock) { 100 102 // already have a text block running and it is empty - just reuse it 101 103 if (currentTextBlock->isEmpty()) { ··· 241 243 } 242 244 } 243 245 } else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) { 246 + // Flush buffer before style change so preceding text gets current style 247 + if (self->partWordBufferIndex > 0) { 248 + self->flushPartWordBuffer(); 249 + self->nextWordContinues = true; 250 + } 244 251 self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth); 245 252 // Push inline style entry for underline tag 246 253 StyleStackEntry entry; ··· 258 265 self->inlineStyleStack.push_back(entry); 259 266 self->updateEffectiveInlineStyle(); 260 267 } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) { 268 + // Flush buffer before style change so preceding text gets current style 269 + if (self->partWordBufferIndex > 0) { 270 + self->flushPartWordBuffer(); 271 + self->nextWordContinues = true; 272 + } 261 273 self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth); 262 274 // Push inline style entry for bold tag 263 275 StyleStackEntry entry; ··· 275 287 self->inlineStyleStack.push_back(entry); 276 288 self->updateEffectiveInlineStyle(); 277 289 } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) { 290 + // Flush buffer before style change so preceding text gets current style 291 + if (self->partWordBufferIndex > 0) { 292 + self->flushPartWordBuffer(); 293 + self->nextWordContinues 
= true; 294 + } 278 295 self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth); 279 296 // Push inline style entry for italic tag 280 297 StyleStackEntry entry; ··· 294 311 } else if (strcmp(name, "span") == 0 || !isHeaderOrBlock(name)) { 295 312 // Handle span and other inline elements for CSS styling 296 313 if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) { 314 + // Flush buffer before style change so preceding text gets current style 315 + if (self->partWordBufferIndex > 0) { 316 + self->flushPartWordBuffer(); 317 + self->nextWordContinues = true; 318 + } 297 319 StyleStackEntry entry; 298 320 entry.depth = self->depth; // Track depth for matching pop 299 321 if (cssStyle.hasFontWeight()) { ··· 331 353 if (self->partWordBufferIndex > 0) { 332 354 self->flushPartWordBuffer(); 333 355 } 356 + // Whitespace is a real word boundary — reset continuation state 357 + self->nextWordContinues = false; 334 358 // Skip the whitespace char 335 359 continue; 336 360 } ··· 387 411 // Flush buffer with current style BEFORE any style changes 388 412 if (self->partWordBufferIndex > 0) { 389 413 // Flush if style will change OR if we're closing a block/structural element 414 + const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 && 415 + !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1; 390 416 const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || 391 417 matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || 392 418 matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 || ··· 394 420 395 421 if (shouldFlush) { 396 422 self->flushPartWordBuffer(); 423 + // If closing an inline element, the next word fragment continues the same visual word 424 + if (isInlineTag) { 425 + self->nextWordContinues = true; 426 + } 397 427 } 398 428 } 399 429
+1
lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
··· 30 30 // leave one char at end for null pointer 31 31 char partWordBuffer[MAX_WORD_SIZE + 1] = {}; 32 32 int partWordBufferIndex = 0; 33 + bool nextWordContinues = false; // true when next flushed word attaches to previous (inline element boundary) 33 34 std::unique_ptr<ParsedText> currentTextBlock = nullptr; 34 35 std::unique_ptr<Page> currentPage = nullptr; 35 36 int16_t currentPageNextY = 0;