Improve Remarkup parsing performance for certain large input blocks

+20 -14

src/infrastructure/markup/blockrule/PhutilRemarkupNoteBlockRule.php

··· 100 100 } 101 101 102 102 private function getRegEx() { 103 - $words = array( 104 - 'NOTE', 105 - 'IMPORTANT', 106 - 'WARNING', 107 - ); 103 + static $regex; 104 + 105 + if ($regex === null) { 106 + $words = array( 107 + 'NOTE', 108 + 'IMPORTANT', 109 + 'WARNING', 110 + ); 108 111 109 - foreach ($words as $k => $word) { 110 - $words[$k] = preg_quote($word, '/'); 112 + foreach ($words as $k => $word) { 113 + $words[$k] = preg_quote($word, '/'); 114 + } 115 + $words = implode('|', $words); 116 + 117 + $regex = 118 + '/^(?:'. 119 + '(?:$(?P<hideword>'.$words.')$)'. 120 + '|'. 121 + '(?:(?P<showword>'.$words.'):))\s*'. 122 + '/'; 111 123 } 112 - $words = implode('|', $words); 113 124 114 - return 115 - '/^(?:'. 116 - '(?:$(?P<hideword>'.$words.')$)'. 117 - '|'. 118 - '(?:(?P<showword>'.$words.'):))\s*'. 119 - '/'; 125 + return $regex; 120 126 } 121 127 }

+86 -36

src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php

··· 153 153 $block_rules = $this->blockRules; 154 154 $blocks = array(); 155 155 $cursor = 0; 156 - $prev_block = array(); 156 + 157 + $can_merge = array(); 158 + foreach ($block_rules as $key => $block_rule) { 159 + if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) { 160 + $can_merge[$key] = true; 161 + } 162 + } 163 + 164 + $last_block = null; 165 + $last_block_key = -1; 166 + 167 + // See T13487. For very large inputs, block separation can dominate 168 + // runtime. This is written somewhat clumsily to attempt to handle 169 + // very large inputs as gracefully as is practical. 157 170 158 171 while (isset($text[$cursor])) { 159 172 $starting_cursor = $cursor; 160 - foreach ($block_rules as $block_rule) { 173 + foreach ($block_rules as $block_key => $block_rule) { 161 174 $num_lines = $block_rule->getMatchingLineCount($text, $cursor); 162 175 163 176 if ($num_lines) { 164 - if ($blocks) { 165 - $prev_block = last($blocks); 166 - } 167 - 168 - $curr_block = array( 177 + $current_block = array( 169 178 'start' => $cursor, 170 179 'num_lines' => $num_lines, 171 180 'rule' => $block_rule, 172 - 'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines), 181 + 'empty' => self::isEmptyBlock($text, $cursor, $num_lines), 173 182 'children' => array(), 183 + 'merge' => isset($can_merge[$block_key]), 174 184 ); 175 185 176 - if ($prev_block 177 - && self::shouldMergeBlocks($text, $prev_block, $curr_block)) { 178 - $blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines']; 179 - $blocks[last_key($blocks)]['is_empty'] = 180 - $blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty']; 186 + $should_merge = self::shouldMergeParagraphBlocks( 187 + $text, 188 + $last_block, 189 + $current_block); 190 + 191 + if ($should_merge) { 192 + $last_block['num_lines'] = 193 + ($last_block['num_lines'] + $current_block['num_lines']); 194 + 195 + $last_block['empty'] = 196 + ($last_block['empty'] && $current_block['empty']); 197 + 198 + $blocks[$last_block_key] = $last_block; 181 199 } else { 182 - $blocks[] = $curr_block; 200 + $blocks[] = $current_block; 201 + 202 + $last_block = $current_block; 203 + $last_block_key++; 183 204 } 184 205 185 206 $cursor += $num_lines; ··· 192 213 } 193 214 } 194 215 216 + // See T13487. It's common for blocks to be small, and this loop seems to 217 + // measure as faster if we manually concatenate blocks than if we 218 + // "array_slice()" and "implode()" blocks. This is a bit muddy. 219 + 195 220 foreach ($blocks as $key => $block) { 196 - $lines = array_slice($text, $block['start'], $block['num_lines']); 197 - $blocks[$key]['text'] = implode('', $lines); 221 + $min = $block['start']; 222 + $max = $min + $block['num_lines']; 223 + 224 + $lines = ''; 225 + for ($ii = $min; $ii < $max; $ii++) { 226 + $lines .= $text[$ii]; 227 + } 228 + 229 + $blocks[$key]['text'] = $lines; 198 230 } 199 231 200 232 // Stop splitting child blocks apart if we get too deep. This arrests ··· 246 278 return $output; 247 279 } 248 280 249 - private static function shouldMergeBlocks($text, $prev_block, $curr_block) { 250 - $block_rules = ipull(array($prev_block, $curr_block), 'rule'); 281 + private static function shouldMergeParagraphBlocks( 282 + $text, 283 + $last_block, 284 + $current_block) { 251 285 252 - $default_rule = 'PhutilRemarkupDefaultBlockRule'; 253 - try { 254 - assert_instances_of($block_rules, $default_rule); 286 + // If we're at the beginning of the input, we can't merge. 287 + if ($last_block === null) { 288 + return false; 289 + } 255 290 256 - // If the last block was empty keep merging 257 - if ($prev_block['is_empty']) { 258 - return true; 259 - } 291 + // If the previous block wasn't a default block, we can't merge. 292 + if (!$last_block['merge']) { 293 + return false; 294 + } 260 295 261 - // If this line is blank keep merging 262 - if ($curr_block['is_empty']) { 263 - return true; 264 - } 296 + // If the current block isn't a default block, we can't merge. 297 + if (!$current_block['merge']) { 298 + return false; 299 + } 265 300 266 - // If the current line and the last line have content, keep merging 267 - if (strlen(trim($text[$curr_block['start'] - 1]))) { 268 - if (strlen(trim($text[$curr_block['start']]))) { 269 - return true; 270 - } 271 - } 272 - } catch (Exception $e) {} 301 + // If the last block was empty, we definitely want to merge. 302 + if ($last_block['empty']) { 303 + return true; 304 + } 305 + 306 + // If this block is empty, we definitely want to merge. 307 + if ($current_block['empty']) { 308 + return true; 309 + } 310 + 311 + // Check if the last line of the previous block or the first line of this 312 + // block have any non-whitespace text. If they both do, we're going to 313 + // merge. 314 + 315 + // If either of them are a blank line or a line with only whitespace, we 316 + // do not merge: this means we've found a paragraph break. 317 + 318 + $tail = $text[$current_block['start'] - 1]; 319 + $head = $text[$current_block['start']]; 320 + if (strlen(trim($tail)) && strlen(trim($head))) { 321 + return true; 322 + } 273 323 274 324 return false; 275 325 }

+36 -23

src/infrastructure/markup/rule/PhabricatorObjectRemarkupRule.php

··· 2 2 3 3 abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule { 4 4 5 + private $referencePattern; 6 + private $embedPattern; 7 + 5 8 const KEY_RULE_OBJECT = 'rule.object'; 6 9 const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned'; 7 10 ··· 192 195 } 193 196 194 197 private function getObjectEmbedPattern() { 195 - $prefix = $this->getObjectNamePrefix(); 196 - $prefix = preg_quote($prefix); 197 - $id = $this->getObjectIDPattern(); 198 + if ($this->embedPattern === null) { 199 + $prefix = $this->getObjectNamePrefix(); 200 + $prefix = preg_quote($prefix); 201 + $id = $this->getObjectIDPattern(); 198 202 199 - return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u'; 203 + $this->embedPattern = 204 + '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u'; 205 + } 206 + 207 + return $this->embedPattern; 200 208 } 201 209 202 210 private function getObjectReferencePattern() { 203 - $prefix = $this->getObjectNamePrefix(); 204 - $prefix = preg_quote($prefix); 211 + if ($this->referencePattern === null) { 212 + $prefix = $this->getObjectNamePrefix(); 213 + $prefix = preg_quote($prefix); 205 214 206 - $id = $this->getObjectIDPattern(); 215 + $id = $this->getObjectIDPattern(); 207 216 208 - // If the prefix starts with a word character (like "D"), we want to 209 - // require a word boundary so that we don't match "XD1" as "D1". If the 210 - // prefix does not start with a word character, we want to require no word 211 - // boundary for the same reasons. Test if the prefix starts with a word 212 - // character. 213 - if ($this->getObjectNamePrefixBeginsWithWordCharacter()) { 214 - $boundary = '\\b'; 215 - } else { 216 - $boundary = '\\B'; 217 - } 217 + // If the prefix starts with a word character (like "D"), we want to 218 + // require a word boundary so that we don't match "XD1" as "D1". If the 219 + // prefix does not start with a word character, we want to require no word 220 + // boundary for the same reasons. Test if the prefix starts with a word 221 + // character. 222 + if ($this->getObjectNamePrefixBeginsWithWordCharacter()) { 223 + $boundary = '\\b'; 224 + } else { 225 + $boundary = '\\B'; 226 + } 218 227 219 - // The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and 220 - // "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user) 221 - // (see T9479). 228 + // The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and 229 + // "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user) 230 + // (see T9479). 222 231 223 - // The "\b" allows us to link "(abcdef)" or similar without linking things 224 - // in the middle of words. 232 + // The "\b" allows us to link "(abcdef)" or similar without linking things 233 + // in the middle of words. 225 234 226 - return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u'; 235 + $this->referencePattern = 236 + '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u'; 237 + } 238 + 239 + return $this->referencePattern; 227 240 } 228 241 229 242

Configure Feed

Configure Feed