@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Improve the performance of tab replacement in common cases

Summary:
See PHI1210. For certain large inputs, we spend more time than necessary replacing tabs with spaces. Add some fast paths:

- When a line only has tabs at the beginning of the line, we don't need to do as much work parsing the rest of the line.
- When a line has no Unicode (multi-byte) characters, we don't need to vectorize it to get the right result.

Test Plan:
- Added test coverage.
- Profiled this, got a ~60x performance increase on a 36,000-line, 3 MB text file.

Reviewers: amckinley

Reviewed By: amckinley

Differential Revision: https://secure.phabricator.com/D20477

+166 -33
+2
src/__phutil_library_map__.php
··· 662 662 'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php', 663 663 'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php', 664 664 'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php', 665 + 'DifferentialTabReplacementTestCase' => 'applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php', 665 666 'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php', 666 667 'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php', 667 668 'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php', ··· 6332 6333 'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField', 6333 6334 'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField', 6334 6335 'DifferentialSummaryField' => 'DifferentialCoreCustomField', 6336 + 'DifferentialTabReplacementTestCase' => 'PhabricatorTestCase', 6335 6337 'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField', 6336 6338 'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField', 6337 6339 'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField',
+108 -33
src/applications/differential/parser/DifferentialChangesetParser.php
··· 1456 1456 1457 1457 $line = phutil_string_cast($line); 1458 1458 1459 - if (strpos($line, "\t") !== false) { 1460 - $line = $this->replaceTabsWithSpaces($line); 1461 - } 1459 + // TODO: This should be flexible, eventually. 1460 + $tab_width = 2; 1461 + 1462 + $line = self::replaceTabsWithSpaces($line, $tab_width); 1462 1463 $line = str_replace($search, $replace, $line); 1463 1464 1464 1465 if ($is_html) { ··· 1543 1544 return $rules; 1544 1545 } 1545 1546 1546 - private function replaceTabsWithSpaces($line) { 1547 - // TODO: This should be flexible, eventually. 1548 - $tab_width = 2; 1549 - 1550 - static $tags; 1551 - if ($tags === null) { 1552 - $tags = array(); 1547 + public static function replaceTabsWithSpaces($line, $tab_width) { 1548 + static $tags = array(); 1549 + if (empty($tags[$tab_width])) { 1553 1550 for ($ii = 1; $ii <= $tab_width; $ii++) { 1554 1551 $tag = phutil_tag( 1555 1552 'span', ··· 1562 1559 } 1563 1560 } 1564 1561 1565 - // If the line is particularly long, don't try to vectorize it. Use a 1566 - // faster approximation of the correct tabstop expansion instead. This 1567 - // usually still arrives at the right result. 1568 - if (strlen($line) > 256) { 1569 - return str_replace("\t", $tags[$tab_width], $line); 1562 + // Expand all prefix tabs until we encounter any non-tab character. This 1563 + // is cheap and often immediately produces the correct result with no 1564 + // further work (and, particularly, no need to handle any unicode cases). 
1565 + 1566 + $len = strlen($line); 1567 + 1568 + $head = 0; 1569 + for ($head = 0; $head < $len; $head++) { 1570 + $char = $line[$head]; 1571 + if ($char !== "\t") { 1572 + break; 1573 + } 1570 1574 } 1571 1575 1572 - $line = phutil_utf8v_combined($line); 1576 + if ($head) { 1577 + if (empty($tags[$tab_width * $head])) { 1578 + $tags[$tab_width * $head] = str_repeat($tags[$tab_width], $head); 1579 + } 1580 + $prefix = $tags[$tab_width * $head]; 1581 + $line = substr($line, $head); 1582 + } else { 1583 + $prefix = ''; 1584 + } 1585 + 1586 + // If we have no remaining tabs elsewhere in the string after taking care 1587 + // of all the prefix tabs, we're done. 1588 + if (strpos($line, "\t") === false) { 1589 + return $prefix.$line; 1590 + } 1591 + 1592 + $len = strlen($line); 1593 + 1594 + // If the line is particularly long, don't try to do anything special with 1595 + // it. Use a faster approximation of the correct tabstop expansion instead. 1596 + // This usually still arrives at the right result. 1597 + if ($len > 256) { 1598 + return $prefix.str_replace("\t", $tags[$tab_width], $line); 1599 + } 1600 + 1573 1601 $in_tag = false; 1574 1602 $pos = 0; 1575 - foreach ($line as $key => $char) { 1576 - if ($char === '<') { 1577 - $in_tag = true; 1578 - continue; 1603 + 1604 + // See PHI1210. If the line only has single-byte characters, we don't need 1605 + // to vectorize it and can avoid an expensive UTF8 call. 
1606 + 1607 + $fast_path = preg_match('/^[\x01-\x7F]*\z/', $line); 1608 + if ($fast_path) { 1609 + $replace = array(); 1610 + for ($ii = 0; $ii < $len; $ii++) { 1611 + $char = $line[$ii]; 1612 + if ($char === '>') { 1613 + $in_tag = false; 1614 + continue; 1615 + } 1616 + 1617 + if ($in_tag) { 1618 + continue; 1619 + } 1620 + 1621 + if ($char === '<') { 1622 + $in_tag = true; 1623 + continue; 1624 + } 1625 + 1626 + if ($char === "\t") { 1627 + $count = $tab_width - ($pos % $tab_width); 1628 + $pos += $count; 1629 + $replace[$ii] = $tags[$count]; 1630 + continue; 1631 + } 1632 + 1633 + $pos++; 1579 1634 } 1580 1635 1581 - if ($char === '>') { 1582 - $in_tag = false; 1583 - continue; 1584 - } 1636 + if ($replace) { 1637 + // Apply replacements starting at the end of the string so they 1638 + // don't mess up the offsets for following replacements. 1639 + $replace = array_reverse($replace, true); 1585 1640 1586 - if ($in_tag) { 1587 - continue; 1641 + foreach ($replace as $replace_pos => $replacement) { 1642 + $line = substr_replace($line, $replacement, $replace_pos, 1); 1643 + } 1588 1644 } 1645 + } else { 1646 + $line = phutil_utf8v_combined($line); 1647 + foreach ($line as $key => $char) { 1648 + if ($char === '>') { 1649 + $in_tag = false; 1650 + continue; 1651 + } 1652 + 1653 + if ($in_tag) { 1654 + continue; 1655 + } 1656 + 1657 + if ($char === '<') { 1658 + $in_tag = true; 1659 + continue; 1660 + } 1589 1661 1590 - if ($char === "\t") { 1591 - $count = $tab_width - ($pos % $tab_width); 1592 - $pos += $count; 1593 - $line[$key] = $tags[$count]; 1594 - continue; 1662 + if ($char === "\t") { 1663 + $count = $tab_width - ($pos % $tab_width); 1664 + $pos += $count; 1665 + $line[$key] = $tags[$count]; 1666 + continue; 1667 + } 1668 + 1669 + $pos++; 1595 1670 } 1596 1671 1597 - $pos++; 1672 + $line = implode('', $line); 1598 1673 } 1599 1674 1600 - return implode('', $line); 1675 + return $prefix.$line; 1601 1676 } 1602 1677 1603 1678 }
+56
src/applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php
··· 1 + <?php 2 + 3 + final class DifferentialTabReplacementTestCase 4 + extends PhabricatorTestCase { 5 + 6 + public function testTabReplacement() { 7 + $tab1 = "<span data-copy-text=\"\t\"> </span>"; 8 + $tab2 = "<span data-copy-text=\"\t\"> </span>"; 9 + 10 + $cat = "\xF0\x9F\x90\xB1"; 11 + 12 + $cases = array( 13 + '' => '', 14 + 'x' => 'x', 15 + 16 + // Tabs inside HTML tags should not be replaced. 17 + "<\t>x" => "<\t>x", 18 + 19 + // Normal tabs should be replaced. These are all aligned to the tab 20 + // width, so they'll be replaced inline. 21 + "\tx" => "{$tab2}x", 22 + " \tx" => " {$tab2}x", 23 + "\t x" => "{$tab2} x", 24 + "aa\tx" => "aa{$tab2}x", 25 + "aa \tx" => "aa {$tab2}x", 26 + "aa\t x" => "aa{$tab2} x", 27 + 28 + // This tab is not tabstop-aligned, so it is replaced with fewer 29 + // spaces to bring us to the next tabstop. 30 + " \tx" => " {$tab1}x", 31 + 32 + // Text inside HTML tags should not count when aligning tabs with 33 + // tabstops. 34 + "<tag> </tag>\tx" => "<tag> </tag>{$tab1}x", 35 + "<tag2> </tag>\tx" => "<tag2> </tag>{$tab1}x", 36 + 37 + // The code has to take a slow path when inputs contain unicode, but 38 + // should produce the right results and align tabs to tabstops while 39 + // respecting UTF8 display character widths, not byte widths. 40 + "{$cat}\tx" => "{$cat}{$tab1}x", 41 + "{$cat}{$cat}\tx" => "{$cat}{$cat}{$tab2}x", 42 + ); 43 + 44 + foreach ($cases as $input => $expect) { 45 + $actual = DifferentialChangesetParser::replaceTabsWithSpaces( 46 + $input, 47 + 2); 48 + 49 + $this->assertEqual( 50 + $expect, 51 + $actual, 52 + pht('Tabs to Spaces: %s', $input)); 53 + } 54 + } 55 + 56 + }