@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add a "terms" corpus to Ferret fields

Summary:
Ref T12819. Ferret currently does substring search, but this is not the default mode users expect: when you search for the "RICO" act, you do not expect to find documents containing "apRICOt" even though "RICO" is a substring.

To support term search, index the corpus as a list of terms with punctuation removed and whitespace normalized so the engine can match against it.

Test Plan:
Ran `storage upgrade`, ran `search index`, saw sensible database results:

```
rawCorpus: This is the task description.

Hark! Whom'st'dve eaten this "food" shall surely ~perish~?? #blessed
normalCorpus: thi the task descript hark whom dve eaten food shall sure perish bless
termCorpus: This is the task description Hark Whom'st'dve eaten this food shall surely perish blessed
```

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18498

+97 -2
+2
resources/sql/autopatches/20170830.ferret.02.term.sql
··· 1 + ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield 2 + ADD termCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT};
+2
src/__phutil_library_map__.php
··· 3206 3206 'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php', 3207 3207 'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php', 3208 3208 'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php', 3209 + 'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php', 3209 3210 'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php', 3210 3211 'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php', 3211 3212 'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php', ··· 8587 8588 'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule', 8588 8589 'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock', 8589 8590 'PhabricatorNgramEngine' => 'Phobject', 8591 + 'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase', 8590 8592 'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension', 8591 8593 'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface', 8592 8594 'PhabricatorNotificationBuilder' => 'Phobject',
+13 -2
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 30 30 ->setEpochModified(0); 31 31 32 32 $stemmer = new PhutilSearchStemmer(); 33 + $ngram_engine = id(new PhabricatorNgramEngine()); 33 34 34 35 $key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL; 35 36 36 37 $empty_template = array( 37 38 'raw' => array(), 39 + 'term' => array(), 38 40 'normal' => array(), 39 41 ); 40 42 ··· 49 51 } 50 52 51 53 $normal_corpus = $stemmer->stemCorpus($raw_corpus); 54 + $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); 52 55 53 56 if (!isset($ferret_corpus_map[$key])) { 54 57 $ferret_corpus_map[$key] = $empty_template; 55 58 } 56 59 57 60 $ferret_corpus_map[$key]['raw'][] = $raw_corpus; 61 + $ferret_corpus_map[$key]['term'][] = $term_corpus; 58 62 $ferret_corpus_map[$key]['normal'][] = $normal_corpus; 59 63 60 64 $ferret_corpus_map[$key_all]['raw'][] = $raw_corpus; 65 + $ferret_corpus_map[$key_all]['term'][] = $term_corpus; 61 66 $ferret_corpus_map[$key_all]['normal'][] = $normal_corpus; 62 67 } 63 68 ··· 69 74 $normal_corpus = $fields['normal']; 70 75 $normal_corpus = implode("\n", $normal_corpus); 71 76 77 + $term_corpus = $fields['term']; 78 + $term_corpus = implode(' ', $term_corpus); 79 + if (strlen($term_corpus)) { 80 + $term_corpus = ' '.$term_corpus.' '; 81 + } 82 + 72 83 $ferret_fields[] = $engine->newFieldObject() 73 84 ->setFieldKey($key) 74 85 ->setRawCorpus($raw_corpus) 86 + ->setTermCorpus($term_corpus) 75 87 ->setNormalCorpus($normal_corpus); 76 88 } 77 89 78 90 $ngrams_source = $ferret_corpus_map[$key_all]['raw']; 79 91 $ngrams_source = implode("\n", $ngrams_source); 80 92 81 - $ngrams = id(new PhabricatorNgramEngine()) 82 - ->getNgramsFromString($ngrams_source, 'index'); 93 + $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); 83 94 84 95 $ferret_document->openTransaction(); 85 96
+2
src/applications/search/ferret/PhabricatorFerretField.php
··· 6 6 protected $documentID; 7 7 protected $fieldKey; 8 8 protected $rawCorpus; 9 + protected $termCorpus; 9 10 protected $normalCorpus; 10 11 11 12 abstract public function getIndexKey(); ··· 17 18 'documentID' => 'uint32', 18 19 'fieldKey' => 'text4', 19 20 'rawCorpus' => 'sort', 21 + 'termCorpus' => 'sort', 20 22 'normalCorpus' => 'sort', 21 23 ), 22 24 self::CONFIG_KEY_SCHEMA => array(
+52
src/applications/search/ngrams/PhabricatorNgramEngine.php
··· 40 40 return array_keys($ngrams); 41 41 } 42 42 43 + public function newTermsCorpus($raw_corpus) { 44 + $term_corpus = strtr( 45 + $raw_corpus, 46 + array( 47 + '!' => ' ', 48 + '"' => ' ', 49 + '#' => ' ', 50 + '$' => ' ', 51 + '%' => ' ', 52 + '&' => ' ', 53 + '(' => ' ', 54 + ')' => ' ', 55 + '*' => ' ', 56 + '+' => ' ', 57 + ',' => ' ', 58 + '-' => ' ', 59 + '/' => ' ', 60 + ':' => ' ', 61 + ';' => ' ', 62 + '<' => ' ', 63 + '=' => ' ', 64 + '>' => ' ', 65 + '?' => ' ', 66 + '@' => ' ', 67 + '[' => ' ', 68 + '\\' => ' ', 69 + ']' => ' ', 70 + '^' => ' ', 71 + '`' => ' ', 72 + '{' => ' ', 73 + '|' => ' ', 74 + '}' => ' ', 75 + '~' => ' ', 76 + '.' => ' ', 77 + '_' => ' ', 78 + "\n" => ' ', 79 + "\r" => ' ', 80 + "\t" => ' ', 81 + )); 82 + 83 + // NOTE: Single quotes divide terms only if they're at a word boundary. 84 + // In contractions, like "whom'st've", the entire word is a single term. 85 + $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); 86 + $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); 87 + 88 + $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); 89 + $term_corpus = trim($term_corpus, ' '); 90 + 91 + return $term_corpus; 92 + } 93 + 94 + 43 95 }
+26
src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php
··· 1 + <?php 2 + 3 + final class PhabricatorNgramEngineTestCase 4 + extends PhabricatorTestCase { 5 + 6 + public function testTermsCorpus() { 7 + $map = array( 8 + 'Hear ye, hear ye!' => 'Hear ye hear ye', 9 + "Thou whom'st've art worthy." => "Thou whom'st've art worthy", 10 + 'Guaranteed to contain "food".' => 'Guaranteed to contain food', 11 + 'http://example.org/path/to/file.jpg' => 12 + 'http example org path to file jpg', 13 + ); 14 + 15 + $engine = new PhabricatorNgramEngine(); 16 + foreach ($map as $input => $expect) { 17 + $actual = $engine->newTermsCorpus($input); 18 + 19 + $this->assertEqual( 20 + $expect, 21 + $actual, 22 + pht('Terms corpus for: %s', $input)); 23 + } 24 + } 25 + 26 + }