@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Move search query parser/compiler classes to Phabricator

Summary: Ref T13472. Ref T13395. These classes are only used by Phabricator and not likely to find much use in Arcanist.

Test Plan: Grepped libphutil and Arcanist for removed symbols.

Maniphest Tasks: T13472, T13395

Differential Revision: https://secure.phabricator.com/D20939

+1294
+20
externals/porter-stemmer/LICENSE
··· 1 + The MIT License (MIT) 2 + 3 + Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/) 4 + 5 + Permission is hereby granted, free of charge, to any person obtaining a copy of 6 + this software and associated documentation files (the "Software"), to deal in 7 + the Software without restriction, including without limitation the rights to 8 + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 + the Software, and to permit persons to whom the Software is furnished to do so, 10 + subject to the following conditions: 11 + 12 + The above copyright notice and this permission notice shall be included in all 13 + copies or substantial portions of the Software. 14 + 15 + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+42
externals/porter-stemmer/README.md
··· 1 + # Porter Stemmer by Richard Heyes 2 + 3 + # Installation (with composer) 4 + 5 + ```json 6 + { 7 + "require": { 8 + "camspiers/porter-stemmer": "1.0.0" 9 + } 10 + } 11 + ``` 12 + 13 + $ composer install 14 + 15 + # Usage 16 + 17 + ```php 18 + $stem = Porter::Stem($word); 19 + ``` 20 + 21 + # License 22 + 23 + The MIT License (MIT) 24 + 25 + Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/) 26 + 27 + Permission is hereby granted, free of charge, to any person obtaining a copy of 28 + this software and associated documentation files (the "Software"), to deal in 29 + the Software without restriction, including without limitation the rights to 30 + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 31 + the Software, and to permit persons to whom the Software is furnished to do so, 32 + subject to the following conditions: 33 + 34 + The above copyright notice and this permission notice shall be included in all 35 + copies or substantial portions of the Software. 36 + 37 + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 38 + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 39 + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 40 + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 41 + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 42 + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+426
externals/porter-stemmer/src/Porter.php
<?php

# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:

/**
 * Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
 *
 * Portions Copyright 2003-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * Originally available under the GPL 2 or greater. Relicensed with permission
 * of original authors under the MIT License in 2016.
 *
 * All rights reserved.
 *
 * @package PorterStemmer
 * @author Richard Heyes
 * @author Jon Abernathy <jon@chuggnutt.com>
 * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
 * @license http://www.opensource.org/licenses/mit-license.html MIT License
 */

/**
 * PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements
 * were borrowed from the (broken) implementation by Jon Abernathy.
 *
 * See http://tartarus.org/~martin/PorterStemmer/ for a description of the
 * algorithm.
 *
 * Usage:
 *
 *  $stem = Porter::Stem($word);
 *
 * The stemmer compares against lowercase ASCII suffixes only, so callers
 * should lowercase the word before stemming it.
 *
 * @package PorterStemmer
 * @author Richard Heyes
 * @author Jon Abernathy <jon@chuggnutt.com>
 * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
 * @license http://www.opensource.org/licenses/mit-license.html MIT License
 */
class Porter
{
    /**
     * Regex for matching a consonant. A "y" counts as a consonant when it
     * follows a vowel or starts the string.
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex for matching a vowel. A "y" counts as a vowel when it does not
     * follow another vowel.
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Stems a word by running it through steps 1 to 5 of the algorithm.
     * Words of two bytes or fewer are returned unchanged.
     *
     * @param string $word Word to stem
     *
     * @return string Stemmed word
     */
    public static function Stem($word)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        $word = self::step1ab($word);
        $word = self::step1c($word);
        $word = self::step2($word);
        $word = self::step3($word);
        $word = self::step4($word);
        $word = self::step5($word);

        return $word;
    }

    /**
     * Step 1, parts a and b: plurals, and the "eed" / "ed" / "ing" endings.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 1a/1b
     */
    private static function step1ab($word)
    {
        // Part a
        if (substr($word, -1) == 's') {

            // Only the first matching ending is applied; the OR chain stops
            // as soon as one replace() reports a match.
               self::replace($word, 'sses', 'ss')
            OR self::replace($word, 'ies', 'i')
            OR self::replace($word, 'ss', 'ss')
            OR self::replace($word, 's', '');
        }

        // Part b
        if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
            $v = self::$regex_vowel;

            // ing and ed
            if (   preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
                OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons

                // If one of above two test successful
                if (    !self::replace($word, 'at', 'ate')
                    AND !self::replace($word, 'bl', 'ble')
                    AND !self::replace($word, 'iz', 'ize')) {

                    // Double consonant ending
                    if (    self::doubleConsonant($word)
                        AND substr($word, -2) != 'll'
                        AND substr($word, -2) != 'ss'
                        AND substr($word, -2) != 'zz') {

                        $word = substr($word, 0, -1);

                    } elseif (self::m($word) == 1 AND self::cvc($word)) {
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: turns a trailing "y" into "i" when the rest of the word
     * contains a vowel.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 1c
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: maps double suffixes to single ones. The switch dispatches on
     * the penultimate letter, and every replacement requires m() > 0 for the
     * remaining stem.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 2
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                   self::replace($word, 'ational', 'ate', 0)
                OR self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                   self::replace($word, 'enci', 'ence', 0)
                OR self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                   self::replace($word, 'entli', 'ent', 0)
                OR self::replace($word, 'ousli', 'ous', 0)
                OR self::replace($word, 'alli', 'al', 0)
                OR self::replace($word, 'bli', 'ble', 0)
                OR self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                   self::replace($word, 'ization', 'ize', 0)
                OR self::replace($word, 'ation', 'ate', 0)
                OR self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                   self::replace($word, 'iveness', 'ive', 0)
                OR self::replace($word, 'fulness', 'ful', 0)
                OR self::replace($word, 'ousness', 'ous', 0)
                OR self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                   self::replace($word, 'biliti', 'ble', 0)
                OR self::replace($word, 'aliti', 'al', 0)
                OR self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: handles "-ic-", "-full", "-ness" and similar endings. Every
     * replacement requires m() > 0 for the remaining stem.
     *
     * @param string $word String to stem
     *
     * @return string Word after step 3
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'ness', '', 0);
                break;

            case 't':
                   self::replace($word, 'icate', 'ic', 0)
                OR self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: strips remaining suffixes. Every replacement requires m() > 1
     * for the remaining stem.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 4
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                   self::replace($word, 'ance', '', 1)
                OR self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                   self::replace($word, 'able', '', 1)
                OR self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                   self::replace($word, 'ant', '', 1)
                OR self::replace($word, 'ement', '', 1)
                OR self::replace($word, 'ment', '', 1)
                OR self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // "ion" is only removed when preceded by "s" or "t".
                if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                   self::replace($word, 'ate', '', 1)
                OR self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: drops a trailing "e" and collapses a trailing "ll" when the
     * measure of the stem allows it.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 5
     */
    private static function step5($word)
    {
        // Part a
        if (substr($word, -1) == 'e') {
            if (self::m(substr($word, 0, -1)) > 1) {
                self::replace($word, 'e', '');

            } elseif (self::m(substr($word, 0, -1)) == 1) {

                if (!self::cvc(substr($word, 0, -1))) {
                    self::replace($word, 'e', '');
                }
            }
        }

        // Part b
        if (self::m($word) > 1 AND self::doubleConsonant($word) AND substr($word, -1) == 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the first string with the second, at the end of the string
     *
     * If third arg is given, then the preceding string must match that m
     * count at least.
     *
     * @param  string $str   String to check (modified in place on success)
     * @param  string $check Ending to check for
     * @param  string $repl  Replacement string
     * @param  int    $m     Optional; the stem before $check must have an
     *                       m() count strictly greater than this value
     *
     * @return bool          Whether the $check string was at the end of the $str
     *                       string. True does not necessarily mean that it was
     *                       replaced.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) == $check) {
            $substr = substr($str, 0, $len);
            if (is_null($m) OR self::m($substr) > $m) {
                $str = $substr . $repl;
            }

            return true;
        }

        return false;
    }

    /**
     * What, you mean it's not obvious from the name?
     *
     * m() measures the number of consonant sequences in $str. if c is
     * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     * presence,
     *
     * <c><v>       gives 0
     * <c>vc<v>     gives 1
     * <c>vcvc<v>   gives 2
     * <c>vcvcvc<v> gives 3
     *
     * @param  string $str The string to return the m count for
     *
     * @return int         The m count
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Strip the optional leading consonant run and trailing vowel run so
        // that only whole "vowels then consonants" groups remain to count.
        $str = preg_replace("#^$c+#", '', $str);
        $str = preg_replace("#$v+$#", '', $str);

        preg_match_all("#($v+$c+)#", $str, $matches);

        return count($matches[1]);
    }

    /**
     * Returns true/false as to whether the given string contains two
     * of the same consonant next to each other at the end of the string.
     *
     * @param  string $str String to check
     *
     * @return bool        Result
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the "{0}" form is a parse error on PHP 8.
        return preg_match("#$c{2}$#", $str, $matches) AND $matches[0][0] == $matches[0][1];
    }

    /**
     * Checks for ending CVC sequence where second C is not W, X or Y
     *
     * @param  string $str String to check
     *
     * @return bool        Result
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Square-bracket offsets: the "{2}" form is a parse error on PHP 8.
        return     preg_match("#($c$v$c)$#", $str, $matches)
               AND strlen($matches[1]) == 3
               AND $matches[1][2] != 'w'
               AND $matches[1][2] != 'x'
               AND $matches[1][2] != 'y';
    }
}
+12
src/__phutil_library_map__.php
··· 5658 5658 'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php', 5659 5659 'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php', 5660 5660 'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php', 5661 + 'PhutilSearchQueryCompiler' => 'applications/search/compiler/PhutilSearchQueryCompiler.php', 5662 + 'PhutilSearchQueryCompilerSyntaxException' => 'applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php', 5663 + 'PhutilSearchQueryCompilerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php', 5664 + 'PhutilSearchQueryToken' => 'applications/search/compiler/PhutilSearchQueryToken.php', 5665 + 'PhutilSearchStemmer' => 'applications/search/compiler/PhutilSearchStemmer.php', 5666 + 'PhutilSearchStemmerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php', 5661 5667 'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php', 5662 5668 'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php', 5663 5669 'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php', ··· 12483 12489 'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule', 12484 12490 'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter', 12485 12491 'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule', 12492 + 'PhutilSearchQueryCompiler' => 'Phobject', 12493 + 'PhutilSearchQueryCompilerSyntaxException' => 'Exception', 12494 + 'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase', 12495 + 'PhutilSearchQueryToken' => 'Phobject', 12496 + 'PhutilSearchStemmer' => 'Phobject', 12497 + 'PhutilSearchStemmerTestCase' => 'PhutilTestCase', 12486 12498 'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter', 12487 12499 'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter', 12488 12500 
'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter',
+374
src/applications/search/compiler/PhutilSearchQueryCompiler.php
<?php

/**
 * Tokenizes a user-entered search query and compiles the tokens into a
 * boolean fulltext query string.
 *
 * The characters used for the compiled output are taken by position from the
 * operator string (see @{method:setOperators}):
 *
 *   - index 0: prefix for "AND" terms (a space means "no prefix");
 *   - index 2: prefix for "NOT" terms;
 *   - index 10 and 11: opening and closing quote characters.
 *
 * NOTE(review): the default operator string has the same layout as MySQL's
 * `ft_boolean_syntax` setting; confirm against the callers which supply it.
 */
final class PhutilSearchQueryCompiler
  extends Phobject {

  private $operators = '+ -><()~*:""&|';
  private $query;
  private $stemmer;
  private $enableFunctions = false;

  const OPERATOR_NOT = 'not';
  const OPERATOR_AND = 'and';
  const OPERATOR_SUBSTRING = 'sub';
  const OPERATOR_EXACT = 'exact';

  /**
   * Set the positional operator string used when rendering tokens.
   *
   * @param string Operator characters, read by byte offset.
   * @return this
   */
  public function setOperators($operators) {
    $this->operators = $operators;
    return $this;
  }

  public function getOperators() {
    return $this->operators;
  }

  /**
   * Set the stemmer applied to unquoted tokens by
   * @{method:compileStemmedQuery}.
   *
   * @param PhutilSearchStemmer Stemmer to use.
   * @return this
   */
  public function setStemmer(PhutilSearchStemmer $stemmer) {
    $this->stemmer = $stemmer;
    return $this;
  }

  public function getStemmer() {
    return $this->stemmer;
  }

  /**
   * Enable or disable "function:" prefixes and the "~" / "=" operators in
   * the tokenizer.
   *
   * @param bool True to enable function syntax.
   * @return this
   */
  public function setEnableFunctions($enable_functions) {
    $this->enableFunctions = $enable_functions;
    return $this;
  }

  public function getEnableFunctions() {
    return $this->enableFunctions;
  }

  /**
   * Compile every token, without stemming.
   *
   * @param list<PhutilSearchQueryToken> Tokens from @{method:newTokens}.
   * @return string|null Compiled query, or null if there are no tokens.
   */
  public function compileQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the quoted tokens, without stemming.
   *
   * @param list<PhutilSearchQueryToken> Tokens from @{method:newTokens}.
   * @return string|null Compiled query, or null if no token is quoted.
   */
  public function compileLiteralQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      if (!$token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the unquoted tokens, passing each through the configured
   * stemmer.
   *
   * @param list<PhutilSearchQueryToken> Tokens from @{method:newTokens}.
   * @return string|null Compiled query, or null if every token is quoted.
   */
  public function compileStemmedQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      if ($token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token, $this->getStemmer());
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Join rendered tokens with spaces, dropping exact duplicates.
   *
   * @param list<string> Rendered tokens.
   * @return string|null Joined query, or null for an empty list.
   */
  private function compileRenderedTokens(array $list) {
    if (!$list) {
      return null;
    }

    $list = array_unique($list);
    return implode(' ', $list);
  }

  /**
   * Parse a raw query string into token objects.
   *
   * @param string Raw query text.
   * @return list<PhutilSearchQueryToken> Parsed tokens.
   * @throws PhutilSearchQueryCompilerSyntaxException If the query is too
   *   long, has unmatched quotes, or has stray or repeated operators.
   */
  public function newTokens($query) {
    $results = $this->tokenizeQuery($query);

    $tokens = array();
    foreach ($results as $result) {
      $tokens[] = PhutilSearchQueryToken::newFromDictionary($result);
    }

    return $tokens;
  }

  /**
   * Split a raw query into token dictionaries.
   *
   * The query is scanned one character at a time by a small state machine.
   * For each term the states run in the order "scan" (skip whitespace),
   * "function" (optional "name:" prefix), "operator" (operator characters),
   * "quote" (optional opening quote) and "token" (the term itself). A state
   * that does not consume the current character falls through to the next
   * one within the same loop iteration.
   *
   * @param string Raw query text.
   * @return list<map<string, wild>> Dictionaries with "operator", "quoted"
   *   and "value" keys, plus "function" when functions are enabled.
   * @throws PhutilSearchQueryCompilerSyntaxException On malformed queries.
   */
  private function tokenizeQuery($query) {
    $maximum_bytes = 1024;

    $query_bytes = strlen($query);
    if ($query_bytes > $maximum_bytes) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query is too long (%s bytes, maximum is %s bytes).',
          new PhutilNumber($query_bytes),
          new PhutilNumber($maximum_bytes)));
    }

    // From here on "$query" is an array which is indexed and sliced below;
    // presumably it has one element per UTF-8 character.
    $query = phutil_utf8v($query);
    $length = count($query);

    $enable_functions = $this->getEnableFunctions();

    $mode = 'scan';
    $current_operator = array();
    $current_token = array();
    $current_function = null;
    $is_quoted = false;
    $tokens = array();

    if ($enable_functions) {
      $operator_characters = '[~=+-]';
    } else {
      $operator_characters = '[+-]';
    }

    for ($ii = 0; $ii < $length; $ii++) {
      $character = $query[$ii];

      if ($mode == 'scan') {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        $mode = 'function';
      }

      if ($mode == 'function') {
        $mode = 'operator';

        if ($enable_functions) {
          // Look ahead over a run of ASCII letters. It is only a function
          // name if the first non-letter after it is a colon.
          $found = false;
          for ($jj = $ii; $jj < $length; $jj++) {
            if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) {
              continue;
            }
            if ($query[$jj] == ':') {
              $found = $jj;
            }
            break;
          }

          if ($found !== false) {
            $function = array_slice($query, $ii, ($jj - $ii));
            $current_function = implode('', $function);

            // A bare leading colon (":cat") names no function.
            if (!strlen($current_function)) {
              $current_function = null;
            }

            // Skip past the colon and resume in "operator" mode.
            $ii = $jj;
            continue;
          }
        }
      }

      if ($mode == 'operator') {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        if (preg_match('/^'.$operator_characters.'\z/', $character)) {
          $current_operator[] = $character;
          continue;
        }

        $mode = 'quote';
      }

      if ($mode == 'quote') {
        if (preg_match('/^"\z/', $character)) {
          $is_quoted = true;
          $mode = 'token';
          continue;
        }

        $mode = 'token';
      }

      if ($mode == 'token') {
        $capture = false;
        $was_quoted = $is_quoted;
        if ($is_quoted) {
          if (preg_match('/^"\z/', $character)) {
            $capture = true;
            $mode = 'scan';
            $is_quoted = false;
          }
        } else {
          if (preg_match('/^\s\z/u', $character)) {
            $capture = true;
            $mode = 'scan';
          }

          // A quote directly after an unquoted term ends that term and
          // opens a new quoted one.
          if (preg_match('/^"\z/', $character)) {
            $capture = true;
            $mode = 'token';
            $is_quoted = true;
          }
        }

        if ($capture) {
          $token = array(
            'operator' => $current_operator,
            'quoted' => $was_quoted,
            'value' => $current_token,
          );

          if ($enable_functions) {
            $token['function'] = $current_function;
          }

          $tokens[] = $token;

          $current_operator = array();
          $current_token = array();
          $current_function = null;
          continue;
        } else {
          $current_token[] = $character;
        }
      }
    }

    if ($is_quoted) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains unmatched double quotes.'));
    }

    if ($mode == 'operator') {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains operator ("%s") with no search term.',
          implode('', $current_operator)));
    }

    // Flush whatever was being collected when the input ended. Empty values
    // are discarded in the loop below.
    $token = array(
      'operator' => $current_operator,
      'quoted' => false,
      'value' => $current_token,
    );

    if ($enable_functions) {
      $token['function'] = $current_function;
    }

    $tokens[] = $token;

    $results = array();
    foreach ($tokens as $token) {
      $value = implode('', $token['value']);
      $operator_string = implode('', $token['operator']);

      if (!strlen($value)) {
        continue;
      }

      $is_quoted = $token['quoted'];

      switch ($operator_string) {
        case '-':
          $operator = self::OPERATOR_NOT;
          break;
        case '~':
          $operator = self::OPERATOR_SUBSTRING;
          break;
        case '=':
          $operator = self::OPERATOR_EXACT;
          break;
        case '+':
          $operator = self::OPERATOR_AND;
          break;
        case '':
          // See T12995. If this query term contains Chinese, Japanese or
          // Korean characters, treat the term as a substring term by default.
          // These languages do not separate words with spaces, so the term
          // search mode is normally useless.
          if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
            $operator = self::OPERATOR_SUBSTRING;
          } else {
            $operator = self::OPERATOR_AND;
          }
          break;
        default:
          throw new PhutilSearchQueryCompilerSyntaxException(
            pht(
              'Query has an invalid sequence of operators ("%s").',
              $operator_string));
      }

      $result = array(
        'operator' => $operator,
        'quoted' => $is_quoted,
        'value' => $value,
      );

      if ($enable_functions) {
        $result['function'] = $token['function'];
      }

      $results[] = $result;
    }

    return $results;
  }

  /**
   * Render one token as `<prefix><open quote><value><close quote>`.
   *
   * @param PhutilSearchQueryToken Token to render.
   * @param PhutilSearchStemmer|null Optional stemmer applied to the value.
   * @return string Rendered token.
   */
  private function renderToken(
    PhutilSearchQueryToken $token,
    PhutilSearchStemmer $stemmer = null) {
    $value = $token->getValue();

    if ($stemmer) {
      $value = $stemmer->stemToken($value);
    }

    $value = $this->quoteToken($value);
    $operator = $token->getOperator();
    $prefix = $this->getOperatorPrefix($operator);

    $value = $prefix.$value;

    return $value;
  }

  /**
   * Look up the output prefix character for an operator constant.
   *
   * @param string One of the OPERATOR_* constants.
   * @return string|null Prefix character, or null when the configured
   *   character is a space.
   * @throws PhutilSearchQueryCompilerSyntaxException For operators other
   *   than OPERATOR_AND and OPERATOR_NOT.
   */
  private function getOperatorPrefix($operator) {
    $operators = $this->operators;

    switch ($operator) {
      case self::OPERATOR_AND:
        $prefix = $operators[0];
        break;
      case self::OPERATOR_NOT:
        $prefix = $operators[2];
        break;
      default:
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Unsupported operator prefix "%s".',
            $operator));
    }

    if ($prefix == ' ') {
      $prefix = null;
    }

    return $prefix;
  }

  /**
   * Wrap a value in the configured quote characters (operator string
   * offsets 10 and 11). The value itself is not escaped.
   *
   * @param string Token value.
   * @return string Quoted value.
   */
  private function quoteToken($value) {
    $operators = $this->operators;

    $open_quote = $operators[10];
    $close_quote = $operators[11];

    return $open_quote.$value.$close_quote;
  }

}
+4
src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php
<?php

/**
 * Thrown when a search query can not be tokenized or compiled, for example
 * because it is too long, has unmatched quotes, or misuses operators.
 */
final class PhutilSearchQueryCompilerSyntaxException extends Exception {
}
+37
src/applications/search/compiler/PhutilSearchQueryToken.php
<?php

/**
 * A single parsed term of a search query: its value, the operator applied
 * to it, whether it was quoted, and an optional function name.
 */
final class PhutilSearchQueryToken
  extends Phobject {

  private $isQuoted;
  private $value;
  private $operator;
  private $function;

  /**
   * Build a token from a dictionary with "quoted", "operator" and "value"
   * keys. The "function" key is optional and defaults to null.
   *
   * @param map<string, wild> Token dictionary.
   * @return PhutilSearchQueryToken New token.
   */
  public static function newFromDictionary(array $dictionary) {
    $result = new PhutilSearchQueryToken();

    $result->isQuoted = $dictionary['quoted'];
    $result->operator = $dictionary['operator'];
    $result->value = $dictionary['value'];
    $result->function = idx($dictionary, 'function');

    return $result;
  }

  /**
   * @return bool Value of the "quoted" key this token was built from.
   */
  public function isQuoted() {
    return $this->isQuoted;
  }

  /**
   * @return string Value of the "value" key this token was built from.
   */
  public function getValue() {
    return $this->value;
  }

  /**
   * @return string Value of the "operator" key this token was built from.
   */
  public function getOperator() {
    return $this->operator;
  }

  /**
   * @return string|null Value of the "function" key, or null if absent.
   */
  public function getFunction() {
    return $this->function;
  }

}
+74
src/applications/search/compiler/PhutilSearchStemmer.php
<?php

/**
 * Lowercases search tokens and reduces them to word stems using the bundled
 * Porter stemmer.
 */
final class PhutilSearchStemmer
  extends Phobject {

  /**
   * Stem a single token.
   *
   * @param string Raw token.
   * @return string Lowercased, stemmed token.
   */
  public function stemToken($token) {
    return $this->applyStemmer($this->normalizeToken($token));
  }

  /**
   * Stem every distinct word of a block of text.
   *
   * The text is lowercased and split on runs of characters other than ASCII
   * letters, digits, ".", "_" and bytes 0x7F-0xFF. Leading and trailing "."
   * and "_" are trimmed from each piece, and pieces shorter than 3 bytes are
   * dropped. Each distinct piece is stemmed once, in order of first
   * appearance.
   *
   * @param string Text to stem.
   * @return string Space-separated stems.
   */
  public function stemCorpus($corpus) {
    $normalized = $this->normalizeCorpus($corpus);
    $pieces = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $normalized);

    $seen = array();
    $stems = array();
    foreach ($pieces as $piece) {
      $word = trim($piece, '._');

      if (strlen($word) < 3) {
        continue;
      }

      if (isset($seen[$word])) {
        continue;
      }
      $seen[$word] = true;

      $stems[] = $this->applyStemmer($word);
    }

    return implode(' ', $stems);
  }

  private function normalizeToken($token) {
    return phutil_utf8_strtolower($token);
  }

  private function normalizeCorpus($corpus) {
    return phutil_utf8_strtolower($corpus);
  }

  /**
   * Stem an already-normalized token.
   *
   * @param string Lowercased token.
   * @return string The stem, or the unchanged token if it contains "." or
   *   "_" or if its stem is shorter than 3 bytes.
   *
   * @phutil-external-symbol class Porter
   */
  private function applyStemmer($normalized_token) {
    // If the token has internal punctuation, handle it literally. This
    // deals with things like domain names, Conduit API methods, and other
    // sorts of informal tokens.
    if (preg_match('/[._]/', $normalized_token)) {
      return $normalized_token;
    }

    // Load the external stemmer library the first time it is needed.
    static $loaded;
    if ($loaded === null) {
      $root = dirname(phutil_get_library_root('phabricator'));
      require_once $root.'/externals/porter-stemmer/src/Porter.php';
      $loaded = true;
    }

    $stem = Porter::stem($normalized_token);

    // If the stem is too short, it won't be a candidate for indexing. These
    // tokens are also likely to be acronyms (like "DNS") rather than real
    // English words.
    return (strlen($stem) < 3)
      ? $normalized_token
      : $stem;
  }

}
+220
src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php
<?php

/**
 * Tests for tokenizing and compiling search queries with
 * @{class:PhutilSearchQueryCompiler}.
 */
final class PhutilSearchQueryCompilerTestCase
  extends PhutilTestCase {

  public function testCompileQueries() {
    // Maps each input query to its compiled form. An expectation of `false`
    // means the query must be rejected with a syntax exception.
    $tests = array(
      '' => null,
      'cat dog' => '+"cat" +"dog"',
      'cat -dog' => '+"cat" -"dog"',
      'cat-dog' => '+"cat-dog"',

      // If there are spaces after an operator, the operator applies to the
      // next search term.
      'cat - dog' => '+"cat" -"dog"',

      // Double quotes serve as delimiters even if there is no whitespace
      // between terms.
      '"cat"dog' => '+"cat" +"dog"',

      // This query is too long.
      str_repeat('x', 2048) => false,

      // Multiple operators are not permitted.
      '++cat' => false,
      '+-cat' => false,
      '--cat' => false,

      // Stray operators are not permitted.
      '+' => false,
      'cat +' => false,

      // Double quotes must be paired.
      '"' => false,
      'cat "' => false,
      '"cat' => false,
      'A"' => false,
      'A"B"' => '+"A" +"B"',
    );

    $this->assertCompileQueries($tests);

    // Test that we compile queries correctly if the operators have been
    // swapped to use "AND" by default.
    $operator_tests = array(
      'cat dog' => '"cat" "dog"',
      'cat -dog' => '"cat" -"dog"',
    );
    $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\'');


    // Test that we compile queries correctly if the quote operators have
    // been swapped to differ.
    $quote_tests = array(
      'cat dog' => '+[cat] +[dog]',
      'cat -dog' => '+[cat] -[dog]',
    );
    $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|');

  }

  public function testCompileQueriesWithStemming() {
    // Each expectation is a pair: (literal query, stemmed query).
    $stemming_tests = array(
      'cat dog' => array(
        null,
        '+"cat" +"dog"',
      ),
      'cats dogs' => array(
        null,
        '+"cat" +"dog"',
      ),
      'cats "dogs"' => array(
        '+"dogs"',
        '+"cat"',
      ),
      '"blessed blade" of the windseeker' => array(
        '+"blessed blade"',
        '+"of" +"the" +"windseek"',
      ),
      'mailing users for mentions on tasks' => array(
        null,
        '+"mail" +"user" +"for" +"mention" +"on" +"task"',
      ),
    );

    $stemmer = new PhutilSearchStemmer();
    $this->assertCompileQueries($stemming_tests, null, $stemmer);
  }

  public function testCompileQueriesWithFunctions() {
    $op_and = PhutilSearchQueryCompiler::OPERATOR_AND;
    $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
    $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;

    // A single CJK character, encoded as three UTF-8 bytes.
    $mao = "\xE7\x8C\xAB";

    // Each expectation is a list of (function, operator, value) tuples.
    $function_tests = array(
      'cat' => array(
        array(null, $op_and, 'cat'),
      ),
      ':cat' => array(
        array(null, $op_and, 'cat'),
      ),
      'title:cat' => array(
        array('title', $op_and, 'cat'),
      ),
      'title:cat:dog' => array(
        array('title', $op_and, 'cat:dog'),
      ),
      'title:~cat' => array(
        array('title', $op_sub, 'cat'),
      ),
      'cat title:="Meow Meow"' => array(
        array(null, $op_and, 'cat'),
        array('title', $op_exact, 'Meow Meow'),
      ),
      'title:cat title:dog' => array(
        array('title', $op_and, 'cat'),
        array('title', $op_and, 'dog'),
      ),
      '~"core and seven years ag"' => array(
        array(null, $op_sub, 'core and seven years ag'),
      ),
      $mao => array(
        array(null, $op_sub, $mao),
      ),
      '+'.$mao => array(
        array(null, $op_and, $mao),
      ),
      '~'.$mao => array(
        array(null, $op_sub, $mao),
      ),
      '"'.$mao.'"' => array(
        array(null, $op_and, $mao),
      ),
    );

    $this->assertCompileFunctionQueries($function_tests);
  }

  /**
   * Compile each input and compare the result with the expectation.
   *
   * Without a stemmer, the full compiled query is compared. With a stemmer,
   * the pair (literal query, stemmed query) is compared. If compilation
   * throws a syntax exception, the actual value is `false` in both cases.
   *
   * @param map<string, wild> Map from input query to expected result.
   * @param string|null Optional operator string for the compiler.
   * @param PhutilSearchStemmer|null Optional stemmer for the compiler.
   * @return void
   */
  private function assertCompileQueries(
    array $tests,
    $operators = null,
    PhutilSearchStemmer $stemmer = null) {
    foreach ($tests as $input => $expect) {
      try {
        $compiler = new PhutilSearchQueryCompiler();

        if ($operators !== null) {
          $compiler->setOperators($operators);
        }

        if ($stemmer !== null) {
          $compiler->setStemmer($stemmer);
        }

        $tokens = $compiler->newTokens($input);

        if ($stemmer) {
          $actual = array(
            $compiler->compileLiteralQuery($tokens),
            $compiler->compileStemmedQuery($tokens),
          );
        } else {
          $actual = $compiler->compileQuery($tokens);
        }
      } catch (PhutilSearchQueryCompilerSyntaxException $ex) {
        $actual = false;
      }

      if ($stemmer) {
        $this->assertEqual(
          $expect,
          $actual,
          pht('Stemmed compilation of query: %s', $input));
      } else {
        $this->assertEqual(
          $expect,
          $actual,
          pht('Compilation of query: %s', $input));
      }
    }
  }

  /**
   * Tokenize each input with functions enabled and compare the resulting
   * (function, operator, value) tuples with the expectation.
   *
   * @param map<string, list<list<wild>>> Map from input to expected tuples.
   * @return void
   */
  private function assertCompileFunctionQueries(array $tests) {
    foreach ($tests as $input => $expect) {
      $compiler = new PhutilSearchQueryCompiler();
      $compiler->setEnableFunctions(true);

      $actual = array();
      foreach ($compiler->newTokens($input) as $token) {
        $actual[] = array(
          $token->getFunction(),
          $token->getOperator(),
          $token->getValue(),
        );
      }

      $this->assertEqual(
        $expect,
        $actual,
        pht('Function compilation of query: %s', $input));
    }
  }

}
+85
src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php
<?php

/**
 * Table-driven tests for PhutilSearchStemmer, covering both single-token
 * stemming and whole-document (corpus) stemming.
 */
final class PhutilSearchStemmerTestCase
  extends PhutilTestCase {

  /**
   * Check `stemToken()` against a map of input token to expected stem.
   */
  public function testStemTokens() {
    $stemmer = new PhutilSearchStemmer();

    $expected_stems = array(
      // Real-world examples gathered from users before stemming was
      // implemented.
      'tokens' => 'token',
      'panels' => 'panel',

      'renames' => 'renam',
      'rename' => 'renam',

      'components' => 'compon',
      'component' => 'compon',

      'implementation' => 'implement',
      'implements' => 'implement',
      'implementing' => 'implement',
      'implementer' => 'implement',

      'deleting' => 'delet',
      'deletion' => 'delet',
      'delete' => 'delet',

      'erratically' => 'errat',
      'erratic' => 'errat',

      // Stems are normalized (here: lowercased).
      'DOG' => 'dog',

      // A token is left alone when stemming would shorten it to fewer than
      // 3 characters.
      'dns' => 'dns',
      'nis' => 'nis',

      // Tokens containing internal punctuation are passed through unchanged;
      // they tend to be domain names, API calls, informal tags, and the
      // like. The first two entries show how the parts stem on their own.
      'apples' => 'appl',
      'bananas' => 'banana',
      'apples_bananas' => 'apples_bananas',
      'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas',
    );

    foreach ($expected_stems as $input => $expect) {
      $actual = $stemmer->stemToken($input);
      $message = pht('Token stem of "%s".', $input);

      $this->assertEqual($expect, $actual, $message);
    }
  }

  /**
   * Check `stemCorpus()` against a map of input text to expected stemmed
   * output.
   */
  public function testStemDocuments() {
    $stemmer = new PhutilSearchStemmer();

    $expected_corpora = array(
      'The wild boar meandered erratically.' =>
        'the wild boar meander errat',
      'Fool me onc, shame on you. Fool me twice, shame on me.' =>
        'fool onc shame you twice',
      'Fireball is a seventh-level spell which deals 2d16 points of damage '.
      'in a 1-meter radius around a target.' =>
        'firebal seventh level spell which deal 2d16 point damag meter '.
        'radiu around target',
      'apples-bananas' => 'appl banana',
      'apples_bananas' => 'apples_bananas',
      'apples.bananas' => 'apples.bananas',
      'oddly-proportioned' => 'oddli proport',
    );

    foreach ($expected_corpora as $input => $expect) {
      $actual = $stemmer->stemCorpus($input);
      $message = pht('Corpus stem of: %s', $input);

      $this->assertEqual($expect, $actual, $message);
    }
  }

}