fix: Remove separations after style changes (#720) · edouard.paris/crosspoint-reader@9f78fd3

+68 -64

lib/Epub/Epub/ParsedText.cpp

··· 19 19 constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD"; 20 20 constexpr size_t SOFT_HYPHEN_BYTES = 2; 21 21 22 - // Known attaching punctuation (including UTF-8 sequences) 23 - const std::vector<std::string> punctuation = { 24 - ".", 25 - ",", 26 - "!", 27 - "?", 28 - ";", 29 - ":", 30 - "\"", 31 - "'", 32 - "\xE2\x80\x99", // ’ (U+2019 right single quote) 33 - "\xE2\x80\x9D" // ” (U+201D right double quote) 34 - }; 35 - 36 - bool isAttachingPunctuationWord(const std::string& word) { 37 - if (word.empty()) return false; 38 - 39 - size_t pos = 0; 40 - while (pos < word.size()) { 41 - bool matched = false; 42 - for (const auto& p : punctuation) { 43 - if (word.compare(pos, p.size(), p) == 0) { 44 - pos += p.size(); 45 - matched = true; 46 - break; 47 - } 48 - } 49 - if (!matched) return false; 50 - } 51 - return true; 52 - } 53 - 54 22 bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; } 55 23 56 24 // Removes every soft hyphen in-place so rendered glyphs match measured widths. 
··· 81 49 82 50 } // namespace 83 51 84 - void ParsedText::addWord(std::string word, const EpdFontFamily::Style style, const bool underline) { 52 + void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline, 53 + const bool attachToPrevious) { 85 54 if (word.empty()) return; 86 55 87 56 words.push_back(std::move(word)); 88 - EpdFontFamily::Style combinedStyle = style; 57 + EpdFontFamily::Style combinedStyle = fontStyle; 89 58 if (underline) { 90 59 combinedStyle = static_cast<EpdFontFamily::Style>(combinedStyle | EpdFontFamily::UNDERLINE); 91 60 } 92 61 wordStyles.push_back(combinedStyle); 62 + wordContinues.push_back(attachToPrevious); 93 63 } 94 64 95 65 // Consumes data to minimize memory usage ··· 106 76 const int pageWidth = viewportWidth; 107 77 const int spaceWidth = renderer.getSpaceWidth(fontId); 108 78 auto wordWidths = calculateWordWidths(renderer, fontId); 79 + 80 + // Build indexed continues vector from the parallel list for O(1) access during layout 81 + std::vector<bool> continuesVec(wordContinues.begin(), wordContinues.end()); 82 + 109 83 std::vector<size_t> lineBreakIndices; 110 84 if (hyphenationEnabled) { 111 85 // Use greedy layout that can split words mid-loop when a hyphenated prefix fits. 112 - lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); 86 + lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec); 113 87 } else { 114 - lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); 88 + lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec); 115 89 } 116 90 const size_t lineCount = includeLastLine ? 
lineBreakIndices.size() : lineBreakIndices.size() - 1; 117 91 118 92 for (size_t i = 0; i < lineCount; ++i) { 119 - extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine); 93 + extractLine(i, pageWidth, spaceWidth, wordWidths, continuesVec, lineBreakIndices, processLine); 120 94 } 121 95 } 122 96 ··· 140 114 } 141 115 142 116 std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth, 143 - const int spaceWidth, std::vector<uint16_t>& wordWidths) { 117 + const int spaceWidth, std::vector<uint16_t>& wordWidths, 118 + std::vector<bool>& continuesVec) { 144 119 if (words.empty()) { 145 120 return {}; 146 121 } ··· 157 132 // First word needs to fit in reduced width if there's an indent 158 133 const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; 159 134 while (wordWidths[i] > effectiveWidth) { 160 - if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) { 135 + if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true, 136 + &continuesVec)) { 161 137 break; 162 138 } 163 139 } ··· 175 151 ans[totalWordCount - 1] = totalWordCount - 1; 176 152 177 153 for (int i = totalWordCount - 2; i >= 0; --i) { 178 - int currlen = -spaceWidth; 154 + int currlen = 0; 179 155 dp[i] = MAX_COST; 180 156 181 157 // First line has reduced width due to text-indent 182 158 const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; 183 159 184 160 for (size_t j = i; j < totalWordCount; ++j) { 185 - // Current line length: previous width + space + current word width 186 - currlen += wordWidths[j] + spaceWidth; 161 + // Add space before word j, unless it's the first word on the line or a continuation 162 + const int gap = j > static_cast<size_t>(i) && !continuesVec[j] ? 
spaceWidth : 0; 163 + currlen += wordWidths[j] + gap; 187 164 188 165 if (currlen > effectivePageWidth) { 189 166 break; 167 + } 168 + 169 + // Cannot break after word j if the next word attaches to it (continuation group) 170 + if (j + 1 < totalWordCount && continuesVec[j + 1]) { 171 + continue; 190 172 } 191 173 192 174 int cost; ··· 260 242 // Builds break indices while opportunistically splitting the word that would overflow the current line. 261 243 std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId, 262 244 const int pageWidth, const int spaceWidth, 263 - std::vector<uint16_t>& wordWidths) { 245 + std::vector<uint16_t>& wordWidths, 246 + std::vector<bool>& continuesVec) { 264 247 // Calculate first line indent (only for left/justified text without extra paragraph spacing) 265 248 const int firstLineIndent = 266 249 blockStyle.textIndent > 0 && !extraParagraphSpacing && ··· 282 265 // Consume as many words as possible for current line, splitting when prefixes fit 283 266 while (currentIndex < wordWidths.size()) { 284 267 const bool isFirstWord = currentIndex == lineStart; 285 - const int spacing = isFirstWord ? 0 : spaceWidth; 268 + const int spacing = isFirstWord || continuesVec[currentIndex] ? 
0 : spaceWidth; 286 269 const int candidateWidth = spacing + wordWidths[currentIndex]; 287 270 288 271 // Word fits on current line ··· 296 279 const int availableWidth = effectivePageWidth - lineWidth - spacing; 297 280 const bool allowFallbackBreaks = isFirstWord; // Only for first word on line 298 281 299 - if (availableWidth > 0 && 300 - hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) { 282 + if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, 283 + allowFallbackBreaks, &continuesVec)) { 301 284 // Prefix now fits; append it to this line and move to next line 302 285 lineWidth += spacing + wordWidths[currentIndex]; 303 286 ++currentIndex; ··· 312 295 break; 313 296 } 314 297 298 + // Don't break before a continuation word (e.g., orphaned "?" after "question"). 299 + // Backtrack to the start of the continuation group so the whole group moves to the next line. 300 + while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) { 301 + --currentIndex; 302 + } 303 + 315 304 lineBreakIndices.push_back(currentIndex); 316 305 isFirstLine = false; 317 306 } ··· 323 312 // available width. 324 313 bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer, 325 314 const int fontId, std::vector<uint16_t>& wordWidths, 326 - const bool allowFallbackBreaks) { 315 + const bool allowFallbackBreaks, std::vector<bool>* continuesVec) { 327 316 // Guard against invalid indices or zero available width before attempting to split. 328 317 if (availableWidth <= 0 || wordIndex >= words.size()) { 329 318 return false; ··· 378 367 wordIt->push_back('-'); 379 368 } 380 369 381 - // Insert the remainder word (with matching style) directly after the prefix. 370 + // Insert the remainder word (with matching style and continuation flag) directly after the prefix. 
382 371 auto insertWordIt = std::next(wordIt); 383 372 auto insertStyleIt = std::next(styleIt); 384 373 words.insert(insertWordIt, remainder); 385 374 wordStyles.insert(insertStyleIt, style); 386 375 376 + // NOTE(review): a wordContinues entry is true when that word attaches to the word BEFORE it, so the value read below is the original word's attach-to-previous flag, not its relation to the word after it. 377 + // Find the continues entry for the original word and insert the remainder's entry after it. 378 + auto continuesIt = wordContinues.begin(); 379 + std::advance(continuesIt, wordIndex); 380 + const bool originalContinuedToNext = *continuesIt; 381 + // Clear the entry of the original word (now the prefix) and give the saved flag to the remainder. NOTE(review): this drops the prefix's attach-to-previous status and may mark the remainder as attached to the prefix - confirm this is intended. 382 + *continuesIt = false; 383 + const auto insertContinuesIt = std::next(continuesIt); 384 + wordContinues.insert(insertContinuesIt, originalContinuedToNext); 385 + 386 + // Keep the indexed vector in sync if provided 387 + if (continuesVec) { 388 + (*continuesVec)[wordIndex] = false; 389 + continuesVec->insert(continuesVec->begin() + wordIndex + 1, originalContinuedToNext); 390 + } 391 + 387 392 // Update cached widths to reflect the new prefix/remainder pairing. 388 393 wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth); 389 394 const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style); ··· 392 397 } 393 398 394 399 void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth, 395 - const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices, 400 + const std::vector<uint16_t>& wordWidths, const std::vector<bool>& continuesVec, 401 + const std::vector<size_t>& lineBreakIndices, 396 402 const std::function<void(std::shared_ptr<TextBlock>)>& processLine) { 397 403 const size_t lineBreak = lineBreakIndices[breakIndex]; 398 404 const size_t lastBreakAt = breakIndex > 0 ? 
lineBreakIndices[breakIndex - 1] : 0; ··· 407 413 : 0; 408 414 409 415 // Calculate total word width for this line and count actual word gaps 410 - // (punctuation that attaches to previous word doesn't count as a gap) 411 - // Note: words list starts at the beginning because previous lines were spliced out 416 + // (continuation words attach to previous word with no gap) 412 417 int lineWordWidthSum = 0; 413 418 size_t actualGapCount = 0; 414 - auto countWordIt = words.begin(); 415 419 416 420 for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { 417 421 lineWordWidthSum += wordWidths[lastBreakAt + wordIdx]; 418 - // Count gaps: each word after the first creates a gap, unless it's attaching punctuation 419 - if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) { 422 + // Count gaps: each word after the first creates a gap, unless it's a continuation 423 + if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) { 420 424 actualGapCount++; 421 425 } 422 - ++countWordIt; 423 426 } 424 427 425 428 // Calculate spacing (account for indent reducing effective page width on first line) ··· 443 446 } 444 447 445 448 // Pre-calculate X positions for words 446 - // Punctuation that attaches to the previous word doesn't get space before it 447 - // Note: words list starts at the beginning because previous lines were spliced out 449 + // Continuation words attach to the previous word with no space before them 448 450 std::list<uint16_t> lineXPos; 449 - auto wordIt = words.begin(); 450 451 451 452 for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { 452 453 const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx]; 453 454 454 455 lineXPos.push_back(xpos); 455 456 456 - // Add spacing after this word, unless the next word is attaching punctuation 457 - auto nextWordIt = wordIt; 458 - ++nextWordIt; 459 - const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt); 457 + // Add spacing after 
this word, unless the next word is a continuation 458 + const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1]; 460 459 461 - xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing); 462 - ++wordIt; 460 + xpos += currentWordWidth + (nextIsContinuation ? 0 : spacing); 463 461 } 464 462 465 463 // Iterators always start at the beginning as we are moving content with splice below 466 464 auto wordEndIt = words.begin(); 467 465 auto wordStyleEndIt = wordStyles.begin(); 466 + auto wordContinuesEndIt = wordContinues.begin(); 468 467 std::advance(wordEndIt, lineWordCount); 469 468 std::advance(wordStyleEndIt, lineWordCount); 469 + std::advance(wordContinuesEndIt, lineWordCount); 470 470 471 471 // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** 472 472 std::list<std::string> lineWords; 473 473 lineWords.splice(lineWords.begin(), words, words.begin(), wordEndIt); 474 474 std::list<EpdFontFamily::Style> lineWordStyles; 475 475 lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt); 476 + 477 + // Consume continues flags (not passed to TextBlock, but must be consumed to stay in sync) 478 + std::list<bool> lineContinues; 479 + lineContinues.splice(lineContinues.begin(), wordContinues, wordContinues.begin(), wordContinuesEndIt); 476 480 477 481 for (auto& word : lineWords) { 478 482 if (containsSoftHyphen(word)) {

+8 -5

lib/Epub/Epub/ParsedText.h

··· 16 16 class ParsedText { 17 17 std::list<std::string> words; 18 18 std::list<EpdFontFamily::Style> wordStyles; 19 + std::list<bool> wordContinues; // true = word attaches to previous (no space before it) 19 20 BlockStyle blockStyle; 20 21 bool extraParagraphSpacing; 21 22 bool hyphenationEnabled; 22 23 23 24 void applyParagraphIndent(); 24 25 std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth, 25 - std::vector<uint16_t>& wordWidths); 26 + std::vector<uint16_t>& wordWidths, std::vector<bool>& continuesVec); 26 27 std::vector<size_t> computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, 27 - int spaceWidth, std::vector<uint16_t>& wordWidths); 28 + int spaceWidth, std::vector<uint16_t>& wordWidths, 29 + std::vector<bool>& continuesVec); 28 30 bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId, 29 - std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks); 31 + std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks, 32 + std::vector<bool>* continuesVec = nullptr); 30 33 void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths, 31 - const std::vector<size_t>& lineBreakIndices, 34 + const std::vector<bool>& continuesVec, const std::vector<size_t>& lineBreakIndices, 32 35 const std::function<void(std::shared_ptr<TextBlock>)>& processLine); 33 36 std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId); 34 37 ··· 38 41 : blockStyle(blockStyle), extraParagraphSpacing(extraParagraphSpacing), hyphenationEnabled(hyphenationEnabled) {} 39 42 ~ParsedText() = default; 40 43 41 - void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false); 44 + void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false, bool attachToPrevious = false); 42 45 void setBlockStyle(const BlockStyle& blockStyle) { 
this->blockStyle = blockStyle; } 43 46 BlockStyle& getBlockStyle() { return blockStyle; } 44 47 size_t size() const { return words.size(); }

+31 -1

lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp

··· 90 90 91 91 // flush the buffer 92 92 partWordBuffer[partWordBufferIndex] = '\0'; 93 - currentTextBlock->addWord(partWordBuffer, fontStyle); 93 + currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues); 94 94 partWordBufferIndex = 0; 95 + nextWordContinues = false; 95 96 } 96 97 97 98 // start a new text block if needed 98 99 void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) { 100 + nextWordContinues = false; // New block = new paragraph, no continuation 99 101 if (currentTextBlock) { 100 102 // already have a text block running and it is empty - just reuse it 101 103 if (currentTextBlock->isEmpty()) { ··· 241 243 } 242 244 } 243 245 } else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) { 246 + // Flush buffer before style change so preceding text gets current style 247 + if (self->partWordBufferIndex > 0) { 248 + self->flushPartWordBuffer(); 249 + self->nextWordContinues = true; 250 + } 244 251 self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth); 245 252 // Push inline style entry for underline tag 246 253 StyleStackEntry entry; ··· 258 265 self->inlineStyleStack.push_back(entry); 259 266 self->updateEffectiveInlineStyle(); 260 267 } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) { 268 + // Flush buffer before style change so preceding text gets current style 269 + if (self->partWordBufferIndex > 0) { 270 + self->flushPartWordBuffer(); 271 + self->nextWordContinues = true; 272 + } 261 273 self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth); 262 274 // Push inline style entry for bold tag 263 275 StyleStackEntry entry; ··· 275 287 self->inlineStyleStack.push_back(entry); 276 288 self->updateEffectiveInlineStyle(); 277 289 } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) { 290 + // Flush buffer before style change so preceding text gets current style 291 + if (self->partWordBufferIndex > 0) { 292 + self->flushPartWordBuffer(); 293 + self->nextWordContinues 
= true; 294 + } 278 295 self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth); 279 296 // Push inline style entry for italic tag 280 297 StyleStackEntry entry; ··· 294 311 } else if (strcmp(name, "span") == 0 || !isHeaderOrBlock(name)) { 295 312 // Handle span and other inline elements for CSS styling 296 313 if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) { 314 + // Flush buffer before style change so preceding text gets current style 315 + if (self->partWordBufferIndex > 0) { 316 + self->flushPartWordBuffer(); 317 + self->nextWordContinues = true; 318 + } 297 319 StyleStackEntry entry; 298 320 entry.depth = self->depth; // Track depth for matching pop 299 321 if (cssStyle.hasFontWeight()) { ··· 331 353 if (self->partWordBufferIndex > 0) { 332 354 self->flushPartWordBuffer(); 333 355 } 356 + // Whitespace is a real word boundary — reset continuation state 357 + self->nextWordContinues = false; 334 358 // Skip the whitespace char 335 359 continue; 336 360 } ··· 387 411 // Flush buffer with current style BEFORE any style changes 388 412 if (self->partWordBufferIndex > 0) { 389 413 // Flush if style will change OR if we're closing a block/structural element 414 + const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 && 415 + !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1; 390 416 const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || 391 417 matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || 392 418 matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 || ··· 394 420 395 421 if (shouldFlush) { 396 422 self->flushPartWordBuffer(); 423 + // If closing an inline element, the next word fragment continues the same visual word 424 + if (isInlineTag) { 425 + self->nextWordContinues = true; 426 + } 397 427 } 398 428 } 399 429

+1

lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h

··· 30 30 // leave one char at end for the null terminator 31 31 char partWordBuffer[MAX_WORD_SIZE + 1] = {}; 32 32 int partWordBufferIndex = 0; 33 + bool nextWordContinues = false; // true when next flushed word attaches to previous (inline element boundary) 33 34 std::unique_ptr<ParsedText> currentTextBlock = nullptr; 34 35 std::unique_ptr<Page> currentPage = nullptr; 35 36 int16_t currentPageNextY = 0;

Configure Feed

Configure Feed