@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Allow the Ferret engine to remove "common" ngrams from the index

Summary:
Ref T13000. This adds support for tracking "common" ngrams, which occur in too many documents to be useful as part of the ngram index.

If an ngram is listed in the "common" table, it won't be written when indexing documents, or queried for when searching for them.

In this change, nothing actually writes to the "common" table. I'll start writing to the table in a followup change.

Specifically, I plan to do this:

- A new GC process periodically updates the "common" table by writing to it any ngram which appears in more than X% of documents (for some value of X), provided there are at least some minimum number of documents (maybe around 4,000).
- A new GC process deletes ngrams that have been added to the common table from the existing indexes.

Hopefully, this will pare down the ngrams index to something reasonable over time without requiring any manual tuning.

Test Plan:
- Ran some queries and indexes.
- Manually inserted ngrams `xxx` and `yyy` into the "common" ngrams table, searched and indexed, saw them ignored as viable ngrams for search/index.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13000

Differential Revision: https://secure.phabricator.com/D18672

+200 -12
+7
resources/sql/autopatches/20171002.cngram.01.maniphest.sql
··· 1 + CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.02.event.sql
··· 1 + CREATE TABLE {$NAMESPACE}_calendar.calendar_event_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.03.revision.sql
··· 1 + CREATE TABLE {$NAMESPACE}_differential.differential_revision_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.04.fund.sql
··· 1 + CREATE TABLE {$NAMESPACE}_fund.fund_initiative_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.05.owners.sql
··· 1 + CREATE TABLE {$NAMESPACE}_owners.owners_package_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.06.passphrase.sql
··· 1 + CREATE TABLE {$NAMESPACE}_passphrase.passphrase_credential_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.07.blog.sql
··· 1 + CREATE TABLE {$NAMESPACE}_phame.phame_blog_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.08.post.sql
··· 1 + CREATE TABLE {$NAMESPACE}_phame.phame_post_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.09.pholio.sql
··· 1 + CREATE TABLE {$NAMESPACE}_pholio.pholio_mock_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.10.phriction.sql
··· 1 + CREATE TABLE {$NAMESPACE}_phriction.phriction_document_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.11.project.sql
··· 1 + CREATE TABLE {$NAMESPACE}_project.project_project_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.12.user.sql
··· 1 + CREATE TABLE {$NAMESPACE}_user.user_user_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.13.repository.sql
··· 1 + CREATE TABLE {$NAMESPACE}_repository.repository_repository_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+7
resources/sql/autopatches/20171002.cngram.14.commit.sql
··· 1 + CREATE TABLE {$NAMESPACE}_repository.repository_commit_fngrams_common ( 2 + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}, 4 + needsCollection BOOL NOT NULL, 5 + UNIQUE KEY `key_ngram` (ngram), 6 + KEY `key_collect` (needsCollection) 7 + ) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
+6
src/applications/config/schema/PhabricatorConfigSchemaSpec.php
··· 73 73 $engine->getNgramsTableName(), 74 74 $engine->getNgramsSchemaColumns(), 75 75 $engine->getNgramsSchemaKeys()); 76 + 77 + $this->buildRawSchema( 78 + $engine->getApplicationName(), 79 + $engine->getCommonNgramsTableName(), 80 + $engine->getCommonNgramsSchemaColumns(), 81 + $engine->getCommonNgramsSchemaKeys()); 76 82 } 77 83 78 84 protected function buildRawSchema(
+37 -12
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 165 165 $ferret_field['normalCorpus']); 166 166 } 167 167 168 - $sql = array(); 169 - foreach ($ngrams as $ngram) { 170 - $sql[] = qsprintf( 168 + if ($ngrams) { 169 + $common = queryfx_all( 171 170 $conn, 172 - '(%d, %s)', 173 - $document_id, 174 - $ngram); 171 + 'SELECT ngram FROM %T WHERE ngram IN (%Ls)', 172 + $engine->getCommonNgramsTableName(), 173 + $ngrams); 174 + $common = ipull($common, 'ngram', 'ngram'); 175 + 176 + foreach ($ngrams as $key => $ngram) { 177 + if (isset($common[$ngram])) { 178 + unset($ngrams[$key]); 179 + continue; 180 + } 181 + 182 + // NOTE: MySQL discards trailing whitespace in CHAR(X) columns. 183 + $trim_ngram = rtrim($ngram, ' '); 184 + if (isset($common[$trim_ngram])) { 185 + unset($ngrams[$key]); 186 + continue; 187 + } 188 + } 175 189 } 176 190 177 - foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) { 178 - queryfx( 179 - $conn, 180 - 'INSERT INTO %T (documentID, ngram) VALUES %Q', 181 - $engine->getNgramsTableName(), 182 - $chunk); 191 + if ($ngrams) { 192 + $sql = array(); 193 + foreach ($ngrams as $ngram) { 194 + $sql[] = qsprintf( 195 + $conn, 196 + '(%d, %s)', 197 + $document_id, 198 + $ngram); 199 + } 200 + 201 + foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) { 202 + queryfx( 203 + $conn, 204 + 'INSERT INTO %T (documentID, ngram) VALUES %Q', 205 + $engine->getNgramsTableName(), 206 + $chunk); 207 + } 183 208 } 184 209 } catch (Exception $ex) { 185 210 $object->killTransaction();
+31
src/applications/search/ferret/PhabricatorFerretEngine.php
··· 295 295 ); 296 296 } 297 297 298 + public function getCommonNgramsTableName() { 299 + $application = $this->getApplicationName(); 300 + $scope = $this->getScopeName(); 301 + 302 + return "{$application}_{$scope}_fngrams_common"; 303 + } 304 + 305 + public function getCommonNgramsSchemaColumns() { 306 + return array( 307 + 'id' => 'auto', 308 + 'ngram' => 'char3', 309 + 'needsCollection' => 'bool', 310 + ); 311 + } 312 + 313 + public function getCommonNgramsSchemaKeys() { 314 + return array( 315 + 'PRIMARY' => array( 316 + 'columns' => array('id'), 317 + 'unique' => true, 318 + ), 319 + 'key_ngram' => array( 320 + 'columns' => array('ngram'), 321 + 'unique' => true, 322 + ), 323 + 'key_collect' => array( 324 + 'columns' => array('needsCollection'), 325 + ), 326 + ); 327 + } 328 + 298 329 }
+28
src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
··· 1700 1700 } 1701 1701 } 1702 1702 1703 + // Remove common ngrams, like "the", which occur too frequently in 1704 + // documents to be useful in constraining the query. The best ngrams 1705 + // are obscure sequences which occur in very few documents. 1706 + 1707 + if ($flat) { 1708 + $common_ngrams = queryfx_all( 1709 + $conn, 1710 + 'SELECT ngram FROM %T WHERE ngram IN (%Ls)', 1711 + $engine->getCommonNgramsTableName(), 1712 + ipull($flat, 'ngram')); 1713 + $common_ngrams = ipull($common_ngrams, 'ngram', 'ngram'); 1714 + 1715 + foreach ($flat as $key => $spec) { 1716 + $ngram = $spec['ngram']; 1717 + if (isset($common_ngrams[$ngram])) { 1718 + unset($flat[$key]); 1719 + continue; 1720 + } 1721 + 1722 + // NOTE: MySQL discards trailing whitespace in CHAR(X) columns. 1723 + $trim_ngram = rtrim($ngram, ' '); 1724 + if (isset($common_ngrams[$trim_ngram])) { 1725 + unset($flat[$key]); 1726 + continue; 1727 + } 1728 + } 1729 + } 1730 + 1703 1731 // MySQL only allows us to join a maximum of 61 tables per query. Each 1704 1732 // ngram is going to cost us a join toward that limit, so if the user 1705 1733 // specified a very long query string, just pick 16 of the ngrams