@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Reduce the number of magic strings in the Ferret implementation

Summary:
Ref T12819. Push more of the magic `' '` stuff into the engine and simplify calls to ngram construction.

Also fixes a bug where a task with title "apple banana" and description "cherry doughnut" could match the query "banana cherry". The fix is to separate distinct term segments with newlines instead of spaces, so that a phrase can no longer match across a segment boundary.

Test Plan:
- Indexed some objects.
- Searched (term, substring, quoted terms).
- Viewed index in database.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18534

+31 -30
+8 -8
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 71 71 $term_corpus = $engine->newTermsCorpus($raw_corpus); 72 72 73 73 $normal_corpus = $stemmer->stemCorpus($raw_corpus); 74 - $normal_coprus = $engine->newTermsCorpus($normal_corpus); 74 + $normal_corpus = $engine->newTermsCorpus($normal_corpus); 75 75 76 76 if (!isset($ferret_corpus_map[$key])) { 77 77 $ferret_corpus_map[$key] = $empty_template; ··· 91 91 foreach ($ferret_corpus_map as $key => $fields) { 92 92 $raw_corpus = $fields['raw']; 93 93 $raw_corpus = implode("\n", $raw_corpus); 94 - $ngrams_source[] = $raw_corpus; 94 + if (strlen($raw_corpus)) { 95 + $ngrams_source[] = $raw_corpus; 96 + } 95 97 96 98 $normal_corpus = $fields['normal']; 97 - $normal_corpus = implode(' ', $normal_corpus); 99 + $normal_corpus = implode("\n", $normal_corpus); 98 100 if (strlen($normal_corpus)) { 99 101 $ngrams_source[] = $normal_corpus; 100 - $normal_corpus = ' '.$normal_corpus.' '; 101 102 } 102 103 103 104 $term_corpus = $fields['term']; 104 - $term_corpus = implode(' ', $term_corpus); 105 + $term_corpus = implode("\n", $term_corpus); 105 106 if (strlen($term_corpus)) { 106 107 $ngrams_source[] = $term_corpus; 107 - $term_corpus = ' '.$term_corpus.' '; 108 108 } 109 109 110 110 $ferret_fields[] = $engine->newFieldObject() ··· 113 113 ->setTermCorpus($term_corpus) 114 114 ->setNormalCorpus($normal_corpus); 115 115 } 116 - $ngrams_source = implode(' ', $ngrams_source); 116 + $ngrams_source = implode("\n", $ngrams_source); 117 117 118 - $ngrams = $engine->getNgramsFromString($ngrams_source, 'index'); 118 + $ngrams = $engine->getTermNgramsFromString($ngrams_source); 119 119 120 120 $ferret_document->openTransaction(); 121 121
+15 -10
src/applications/search/ferret/PhabricatorFerretEngine.php
··· 16 16 return $value; 17 17 } 18 18 19 - public function getNgramsFromString($value, $mode) { 19 + public function getTermNgramsFromString($string) { 20 + return $this->getNgramsFromString($string, true); 21 + } 22 + 23 + public function getSubstringNgramsFromString($string) { 24 + return $this->getNgramsFromString($string, false); 25 + } 26 + 27 + private function getNgramsFromString($value, $as_term) { 20 28 $tokens = $this->tokenizeString($value); 21 29 22 30 $ngrams = array(); 23 31 foreach ($tokens as $token) { 24 32 $token = phutil_utf8_strtolower($token); 25 33 26 - switch ($mode) { 27 - case 'query': 28 - break; 29 - case 'index': 30 - $token = ' '.$token.' '; 31 - break; 32 - case 'prefix': 33 - $token = ' '.$token; 34 - break; 34 + if ($as_term) { 35 + $token = ' '.$token.' '; 35 36 } 36 37 37 38 $token_v = phutil_utf8v($token); ··· 95 96 96 97 $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); 97 98 $term_corpus = trim($term_corpus, ' '); 99 + 100 + if (strlen($term_corpus)) { 101 + $term_corpus = ' '.$term_corpus.' '; 102 + } 98 103 99 104 return $term_corpus; 100 105 }
+4 -4
src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
··· 5 5 6 6 public function testTermsCorpus() { 7 7 $map = array( 8 - 'Hear ye, hear ye!' => 'Hear ye hear ye', 9 - "Thou whom'st've art worthy." => "Thou whom'st've art worthy", 10 - 'Guaranteed to contain "food".' => 'Guaranteed to contain food', 8 + 'Hear ye, hear ye!' => ' Hear ye hear ye ', 9 + "Thou whom'st've art worthy." => " Thou whom'st've art worthy ", 10 + 'Guaranteed to contain "food".' => ' Guaranteed to contain food ', 11 11 'http://example.org/path/to/file.jpg' => 12 - 'http example org path to file jpg', 12 + ' http example org path to file jpg ', 13 13 ); 14 14 15 15 $engine = new ManiphestTaskFerretEngine();
+4 -8
src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
··· 1497 1497 } 1498 1498 1499 1499 if ($is_substring) { 1500 - $ngrams = $engine->getNgramsFromString($value, 'query'); 1500 + $ngrams = $engine->getSubstringNgramsFromString($value); 1501 1501 } else { 1502 - $ngrams = $engine->getNgramsFromString($value, 'index'); 1502 + $ngrams = $engine->getTermNgramsFromString($value); 1503 1503 1504 1504 // If this is a stemmed term, only look for ngrams present in both the 1505 1505 // unstemmed and stemmed variations. 1506 1506 if ($is_stemmed) { 1507 1507 $stem_value = $stemmer->stemToken($value); 1508 - $stem_ngrams = $engine->getNgramsFromString( 1509 - $stem_value, 1510 - 'index'); 1511 - 1508 + $stem_ngrams = $engine->getTermNgramsFromString($stem_value); 1512 1509 $ngrams = array_intersect($ngrams, $stem_ngrams); 1513 1510 } 1514 1511 } ··· 1652 1649 1653 1650 $term_constraints = array(); 1654 1651 1655 - $term_value = ' '.$engine->newTermsCorpus($value).' '; 1652 + $term_value = $engine->newTermsCorpus($value); 1656 1653 if ($is_not) { 1657 1654 $term_constraints[] = qsprintf( 1658 1655 $conn, ··· 1670 1667 if ($is_stemmed) { 1671 1668 $stem_value = $stemmer->stemToken($value); 1672 1669 $stem_value = $engine->newTermsCorpus($stem_value); 1673 - $stem_value = ' '.$stem_value.' '; 1674 1670 1675 1671 $term_constraints[] = qsprintf( 1676 1672 $conn,