@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Consolidate more Ferret engine code into FerretEngine

Summary: Ref T12819. Earlier I separated some ngram code into an "ngram engine" hoping to share it across the simple Ngrams stuff and the full Ferret stuff, but they actually use slightly different rules. Just pull more of this stuff into FerretEngine to reduce the number of moving pieces and the amount of code duplication.

Test Plan: Searched for terms, rebuilt indexes.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18533

+110 -115
+2 -4
src/__phutil_library_map__.php
··· 2834 2834 'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php', 2835 2835 'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php', 2836 2836 'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php', 2837 + 'PhabricatorFerretEngineTestCase' => 'applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php', 2837 2838 'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php', 2838 2839 'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php', 2839 2840 'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php', ··· 3205 3206 'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php', 3206 3207 'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php', 3207 3208 'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php', 3208 - 'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php', 3209 - 'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php', 3210 3209 'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php', 3211 3210 'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php', 3212 3211 'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php', ··· 8166 8165 'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO', 8167 8166 'PhabricatorFerretDocument' => 'PhabricatorSearchDAO', 8168 8167 'PhabricatorFerretEngine' => 'Phobject', 8168 + 'PhabricatorFerretEngineTestCase' => 'PhabricatorTestCase', 8169 8169 'PhabricatorFerretField' => 'PhabricatorSearchDAO', 8170 
8170 'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension', 8171 8171 'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO', ··· 8587 8587 'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery', 8588 8588 'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule', 8589 8589 'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock', 8590 - 'PhabricatorNgramEngine' => 'Phobject', 8591 - 'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase', 8592 8590 'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension', 8593 8591 'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface', 8594 8592 'PhabricatorNotificationBuilder' => 'Phobject',
+4 -5
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 29 29 ->setEpochCreated(0) 30 30 ->setEpochModified(0); 31 31 32 - $stemmer = new PhutilSearchStemmer(); 33 - $ngram_engine = id(new PhabricatorNgramEngine()); 32 + $stemmer = $engine->newStemmer(); 34 33 35 34 // Copy all of the "title" and "body" fields to create new "core" fields. 36 35 // This allows users to search "in title or body" with the "core:" prefix. ··· 69 68 continue; 70 69 } 71 70 72 - $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); 71 + $term_corpus = $engine->newTermsCorpus($raw_corpus); 73 72 74 73 $normal_corpus = $stemmer->stemCorpus($raw_corpus); 75 - $normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus); 74 + $normal_coprus = $engine->newTermsCorpus($normal_corpus); 76 75 77 76 if (!isset($ferret_corpus_map[$key])) { 78 77 $ferret_corpus_map[$key] = $empty_template; ··· 116 115 } 117 116 $ngrams_source = implode(' ', $ngrams_source); 118 117 119 - $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); 118 + $ngrams = $engine->getNgramsFromString($ngrams_source, 'index'); 120 119 121 120 $ferret_document->openTransaction(); 122 121
+93
src/applications/search/ferret/PhabricatorFerretEngine.php
··· 6 6 abstract public function newDocumentObject(); 7 7 abstract public function newFieldObject(); 8 8 9 + public function newStemmer() { 10 + return new PhutilSearchStemmer(); 11 + } 12 + 13 + public function tokenizeString($value) { 14 + $value = trim($value, ' '); 15 + $value = preg_split('/ +/', $value); 16 + return $value; 17 + } 18 + 19 + public function getNgramsFromString($value, $mode) { 20 + $tokens = $this->tokenizeString($value); 21 + 22 + $ngrams = array(); 23 + foreach ($tokens as $token) { 24 + $token = phutil_utf8_strtolower($token); 25 + 26 + switch ($mode) { 27 + case 'query': 28 + break; 29 + case 'index': 30 + $token = ' '.$token.' '; 31 + break; 32 + case 'prefix': 33 + $token = ' '.$token; 34 + break; 35 + } 36 + 37 + $token_v = phutil_utf8v($token); 38 + $len = (count($token_v) - 2); 39 + for ($ii = 0; $ii < $len; $ii++) { 40 + $ngram = array_slice($token_v, $ii, 3); 41 + $ngram = implode('', $ngram); 42 + $ngrams[$ngram] = $ngram; 43 + } 44 + } 45 + 46 + ksort($ngrams); 47 + 48 + return array_keys($ngrams); 49 + } 50 + 51 + public function newTermsCorpus($raw_corpus) { 52 + $term_corpus = strtr( 53 + $raw_corpus, 54 + array( 55 + '!' => ' ', 56 + '"' => ' ', 57 + '#' => ' ', 58 + '$' => ' ', 59 + '%' => ' ', 60 + '&' => ' ', 61 + '(' => ' ', 62 + ')' => ' ', 63 + '*' => ' ', 64 + '+' => ' ', 65 + ',' => ' ', 66 + '-' => ' ', 67 + '/' => ' ', 68 + ':' => ' ', 69 + ';' => ' ', 70 + '<' => ' ', 71 + '=' => ' ', 72 + '>' => ' ', 73 + '?' => ' ', 74 + '@' => ' ', 75 + '[' => ' ', 76 + '\\' => ' ', 77 + ']' => ' ', 78 + '^' => ' ', 79 + '`' => ' ', 80 + '{' => ' ', 81 + '|' => ' ', 82 + '}' => ' ', 83 + '~' => ' ', 84 + '.' => ' ', 85 + '_' => ' ', 86 + "\n" => ' ', 87 + "\r" => ' ', 88 + "\t" => ' ', 89 + )); 90 + 91 + // NOTE: Single quotes divide terms only if they're at a word boundary. 92 + // In contractions, like "whom'st've", the entire word is a single term. 
93 + $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); 94 + $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); 95 + 96 + $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); 97 + $term_corpus = trim($term_corpus, ' '); 98 + 99 + return $term_corpus; 100 + } 101 + 9 102 }
-95
src/applications/search/ngrams/PhabricatorNgramEngine.php
··· 1 - <?php 2 - 3 - final class PhabricatorNgramEngine extends Phobject { 4 - 5 - public function tokenizeString($value) { 6 - $value = trim($value, ' '); 7 - $value = preg_split('/ +/', $value); 8 - return $value; 9 - } 10 - 11 - public function getNgramsFromString($value, $mode) { 12 - $tokens = $this->tokenizeString($value); 13 - 14 - $ngrams = array(); 15 - foreach ($tokens as $token) { 16 - $token = phutil_utf8_strtolower($token); 17 - 18 - switch ($mode) { 19 - case 'query': 20 - break; 21 - case 'index': 22 - $token = ' '.$token.' '; 23 - break; 24 - case 'prefix': 25 - $token = ' '.$token; 26 - break; 27 - } 28 - 29 - $token_v = phutil_utf8v($token); 30 - $len = (count($token_v) - 2); 31 - for ($ii = 0; $ii < $len; $ii++) { 32 - $ngram = array_slice($token_v, $ii, 3); 33 - $ngram = implode('', $ngram); 34 - $ngrams[$ngram] = $ngram; 35 - } 36 - } 37 - 38 - ksort($ngrams); 39 - 40 - return array_keys($ngrams); 41 - } 42 - 43 - public function newTermsCorpus($raw_corpus) { 44 - $term_corpus = strtr( 45 - $raw_corpus, 46 - array( 47 - '!' => ' ', 48 - '"' => ' ', 49 - '#' => ' ', 50 - '$' => ' ', 51 - '%' => ' ', 52 - '&' => ' ', 53 - '(' => ' ', 54 - ')' => ' ', 55 - '*' => ' ', 56 - '+' => ' ', 57 - ',' => ' ', 58 - '-' => ' ', 59 - '/' => ' ', 60 - ':' => ' ', 61 - ';' => ' ', 62 - '<' => ' ', 63 - '=' => ' ', 64 - '>' => ' ', 65 - '?' => ' ', 66 - '@' => ' ', 67 - '[' => ' ', 68 - '\\' => ' ', 69 - ']' => ' ', 70 - '^' => ' ', 71 - '`' => ' ', 72 - '{' => ' ', 73 - '|' => ' ', 74 - '}' => ' ', 75 - '~' => ' ', 76 - '.' => ' ', 77 - '_' => ' ', 78 - "\n" => ' ', 79 - "\r" => ' ', 80 - "\t" => ' ', 81 - )); 82 - 83 - // NOTE: Single quotes divide terms only if they're at a word boundary. 84 - // In contractions, like "whom'st've", the entire word is a single term. 
85 - $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); 86 - $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); 87 - 88 - $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); 89 - $term_corpus = trim($term_corpus, ' '); 90 - 91 - return $term_corpus; 92 - } 93 - 94 - 95 - }
+3 -2
src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php → src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
··· 1 1 <?php 2 2 3 - final class PhabricatorNgramEngineTestCase 3 + final class PhabricatorFerretEngineTestCase 4 4 extends PhabricatorTestCase { 5 5 6 6 public function testTermsCorpus() { ··· 12 12 'http example org path to file jpg', 13 13 ); 14 14 15 - $engine = new PhabricatorNgramEngine(); 15 + $engine = new ManiphestTaskFerretEngine(); 16 + 16 17 foreach ($map as $input => $expect) { 17 18 $actual = $engine->newTermsCorpus($input); 18 19
+8 -9
src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
··· 1453 1453 $op_not = PhutilSearchQueryCompiler::OPERATOR_NOT; 1454 1454 1455 1455 $engine = $this->ferretEngine; 1456 - $ngram_engine = new PhabricatorNgramEngine(); 1457 - $stemmer = new PhutilSearchStemmer(); 1456 + $stemmer = $engine->newStemmer(); 1458 1457 1459 1458 $ngram_table = $engine->newNgramsObject(); 1460 1459 $ngram_table_name = $ngram_table->getTableName(); ··· 1498 1497 } 1499 1498 1500 1499 if ($is_substring) { 1501 - $ngrams = $ngram_engine->getNgramsFromString($value, 'query'); 1500 + $ngrams = $engine->getNgramsFromString($value, 'query'); 1502 1501 } else { 1503 - $ngrams = $ngram_engine->getNgramsFromString($value, 'index'); 1502 + $ngrams = $engine->getNgramsFromString($value, 'index'); 1504 1503 1505 1504 // If this is a stemmed term, only look for ngrams present in both the 1506 1505 // unstemmed and stemmed variations. 1507 1506 if ($is_stemmed) { 1508 1507 $stem_value = $stemmer->stemToken($value); 1509 - $stem_ngrams = $ngram_engine->getNgramsFromString( 1508 + $stem_ngrams = $engine->getNgramsFromString( 1510 1509 $stem_value, 1511 1510 'index'); 1512 1511 ··· 1587 1586 return array(); 1588 1587 } 1589 1588 1590 - $ngram_engine = new PhabricatorNgramEngine(); 1591 - $stemmer = new PhutilSearchStemmer(); 1589 + $engine = $this->ferretEngine; 1590 + $stemmer = $engine->newStemmer(); 1592 1591 $table_map = $this->ferretTables; 1593 1592 1594 1593 $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; ··· 1653 1652 1654 1653 $term_constraints = array(); 1655 1654 1656 - $term_value = ' '.$ngram_engine->newTermsCorpus($value).' '; 1655 + $term_value = ' '.$engine->newTermsCorpus($value).' '; 1657 1656 if ($is_not) { 1658 1657 $term_constraints[] = qsprintf( 1659 1658 $conn, ··· 1670 1669 1671 1670 if ($is_stemmed) { 1672 1671 $stem_value = $stemmer->stemToken($value); 1673 - $stem_value = $ngram_engine->newTermsCorpus($stem_value); 1672 + $stem_value = $engine->newTermsCorpus($stem_value); 1674 1673 $stem_value = ' '.$stem_value.' 
'; 1675 1674 1676 1675 $term_constraints[] = qsprintf(