@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add a workflow for populating (or depopulating) the common ngrams table

Summary:
Depends on D18672. Ref T13000. This does an on-demand build of the common ngrams table.

Plan here is:

- Push to `secure`.
- Build the common ngrams table here.
- See if stuff breaks?

If it looks okay on this dataset, we can build out the GC support and try it in production.

Test Plan:
- Locally, my dataset has a bunch of `bin/lipsum` tasks with similar, common words.
- Verified that ipsum terms now skip ngrams. For "lorem ipsum", search performance actually IMPROVED by skipping the ngrams table (12s to 9s).
- Queried for normal terms, got very fast results using the ngram table, as normal.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13000

Differential Revision: https://secure.phabricator.com/D18673

+108
+2
src/__phutil_library_map__.php
··· 3948 3948 'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchIndexVersionDestructionEngineExtension.php', 3949 3949 'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php', 3950 3950 'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php', 3951 + 'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php', 3951 3952 'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php', 3952 3953 'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php', 3953 3954 'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php', ··· 9528 9529 'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension', 9529 9530 'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow', 9530 9531 'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow', 9532 + 'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow', 9531 9533 'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow', 9532 9534 'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO', 9533 9535 'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
+106
src/applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php
<?php

/**
 * Search management workflow named "ngrams".
 *
 * For every object class discovered under `PhabricatorFerretInterface`, this
 * either empties that object's common ngrams table (with the "reset" flag)
 * or inserts every ngram which occurs frequently enough in the object's
 * ngrams table into its common ngrams table.
 */
final class PhabricatorSearchManagementNgramsWorkflow
  extends PhabricatorSearchManagementWorkflow {

  /**
   * Declare the workflow name, synopsis and the single "reset" argument.
   */
  protected function didConstruct() {
    $this
      ->setName('ngrams')
      ->setSynopsis(pht('Recompute common ngrams.'))
      ->setArguments(
        array(
          array(
            'name' => 'reset',
            'help' => pht('Reset all common ngram records.'),
          ),
        ));
  }

  /**
   * Reset or update the common ngrams table of each discovered object type.
   *
   * @param PhutilArgumentParser $args Parsed command line arguments.
   * @return int Always 0. Returned explicitly instead of falling off the end
   *   of the method with null. NOTE(review): the calling script presumably
   *   uses this value as the process exit code -- confirm against the
   *   workflow runner.
   */
  public function execute(PhutilArgumentParser $args) {
    $is_reset = $args->getArg('reset');

    $all_objects = id(new PhutilClassMapQuery())
      ->setAncestorClass('PhabricatorFerretInterface')
      ->execute();

    foreach ($all_objects as $object) {
      $engine = $object->newFerretEngine();
      $conn = $object->establishConnection('w');
      $display_name = get_class($object);

      if ($is_reset) {
        $this->resetCommonNgrams($conn, $engine, $display_name);
      } else {
        $this->updateCommonNgrams($conn, $engine, $display_name);
      }
    }

    return 0;
  }

  /**
   * Delete every row from the engine's common ngrams table.
   */
  private function resetCommonNgrams($conn, $engine, $display_name) {
    echo tsprintf(
      "%s\n",
      pht(
        'Resetting common ngrams for "%s".',
        $display_name));

    queryfx(
      $conn,
      'DELETE FROM %T',
      $engine->getCommonNgramsTableName());
  }

  /**
   * Insert ngrams which meet the frequency threshold into the engine's
   * common ngrams table. Existing rows are left alone (INSERT IGNORE) and
   * nothing is removed here.
   */
  private function updateCommonNgrams($conn, $engine, $display_name) {
    // Object types with fewer documents than this are skipped entirely.
    $min_documents = 4096;

    // An ngram is "common" when its row count in the ngrams table is at
    // least this fraction of the document count.
    $threshold = 0.15;

    $row = queryfx_one(
      $conn,
      'SELECT COUNT(*) N FROM %T',
      $engine->getDocumentTableName());
    $document_count = (int)$row['N'];

    if ($document_count < $min_documents) {
      echo tsprintf(
        "%s\n",
        pht(
          'Too few documents of type "%s" for any ngrams to be common.',
          $display_name));
      return;
    }

    $min_frequency = (int)ceil($document_count * $threshold);
    $common_ngrams = queryfx_all(
      $conn,
      'SELECT ngram, COUNT(*) N FROM %T
        GROUP BY ngram
        HAVING N >= %d',
      $engine->getNgramsTableName(),
      $min_frequency);

    if (!$common_ngrams) {
      echo tsprintf(
        "%s\n",
        pht(
          'No new common ngrams exist for "%s".',
          $display_name));
      return;
    }

    // Each new row is written with needsCollection set to 1.
    $sql = array();
    foreach ($common_ngrams as $ngram) {
      $sql[] = qsprintf(
        $conn,
        '(%s, 1)',
        $ngram['ngram']);
    }

    // Insert in chunks rather than as one statement.
    foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
      queryfx(
        $conn,
        'INSERT IGNORE INTO %T (ngram, needsCollection)
          VALUES %Q',
        $engine->getCommonNgramsTableName(),
        $chunk);
    }

    echo tsprintf(
      "%s\n",
      pht(
        'Updated common ngrams for "%s".',
        $display_name));
  }

}