@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Provide some "term vs substring" support for the Ferret engine

Summary:
Ref T12819. Distinguishes between "term" queries and "substring" queries, and tries to match them correctly most of the time. For example:

- `example` matches "example", obviously.
- `~amp` matches "example" (the `~` operator requests a substring search), but plain `amp` does not, because "amp" is not a whole term.
- `examples` matches "example" through stemming.
- `"examples"` does not match "example" (quoted text does not stem).
- `"an examp"` does not match "an example" (quoted text is still term text, so it must begin and end on whole-term boundaries).
- `~"an examp"` matches "an example" (quoted, substring-operator text uses substring search).

Test Plan: Ran searches similar to the examples above; they seemed to do what they should.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18500

+123 -42
+5 -5
src/applications/maniphest/query/ManiphestTaskSearchEngine.php
··· 92 92 ->setLabel(pht('Contains Words')) 93 93 ->setKey('fulltext'), 94 94 id(new PhabricatorSearchTextField()) 95 - ->setLabel(pht('Matches (Prototype)')) 96 - ->setKey('ferret') 95 + ->setLabel(pht('Query (Prototype)')) 96 + ->setKey('query') 97 97 ->setIsHidden($hide_ferret), 98 98 id(new PhabricatorSearchThreeStateField()) 99 99 ->setLabel(pht('Open Parents')) ··· 150 150 'statuses', 151 151 'priorities', 152 152 'subtypes', 153 + 'query', 153 154 'fulltext', 154 - 'ferret', 155 155 'hasParents', 156 156 'hasSubtasks', 157 157 'parentIDs', ··· 231 231 $query->withFullTextSearch($map['fulltext']); 232 232 } 233 233 234 - if (strlen($map['ferret'])) { 235 - $raw_query = $map['ferret']; 234 + if (strlen($map['query'])) { 235 + $raw_query = $map['query']; 236 236 237 237 $compiler = id(new PhutilSearchQueryCompiler()) 238 238 ->setEnableFunctions(true);
+12 -5
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 50 50 continue; 51 51 } 52 52 53 - $normal_corpus = $stemmer->stemCorpus($raw_corpus); 54 53 $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); 54 + 55 + $normal_corpus = $stemmer->stemCorpus($raw_corpus); 56 + $normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus); 55 57 56 58 if (!isset($ferret_corpus_map[$key])) { 57 59 $ferret_corpus_map[$key] = $empty_template; ··· 67 69 } 68 70 69 71 $ferret_fields = array(); 72 + $ngrams_source = array(); 70 73 foreach ($ferret_corpus_map as $key => $fields) { 71 74 $raw_corpus = $fields['raw']; 72 75 $raw_corpus = implode("\n", $raw_corpus); 76 + $ngrams_source[] = $raw_corpus; 73 77 74 78 $normal_corpus = $fields['normal']; 75 - $normal_corpus = implode("\n", $normal_corpus); 79 + $normal_corpus = implode(' ', $normal_corpus); 80 + if (strlen($normal_corpus)) { 81 + $ngrams_source[] = $normal_corpus; 82 + $normal_corpus = ' '.$normal_corpus.' '; 83 + } 76 84 77 85 $term_corpus = $fields['term']; 78 86 $term_corpus = implode(' ', $term_corpus); 79 87 if (strlen($term_corpus)) { 88 + $ngrams_source[] = $term_corpus; 80 89 $term_corpus = ' '.$term_corpus.' '; 81 90 } 82 91 ··· 86 95 ->setTermCorpus($term_corpus) 87 96 ->setNormalCorpus($normal_corpus); 88 97 } 89 - 90 - $ngrams_source = $ferret_corpus_map[$key_all]['raw']; 91 - $ngrams_source = implode("\n", $ngrams_source); 98 + $ngrams_source = implode(' ', $ngrams_source); 92 99 93 100 $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); 94 101
+106 -32
src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
··· 1409 1409 return array(); 1410 1410 } 1411 1411 1412 + $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; 1413 + 1412 1414 $engine = $this->ferretEngine; 1413 1415 $ngram_engine = new PhabricatorNgramEngine(); 1416 + $stemmer = new PhutilSearchStemmer(); 1414 1417 1415 1418 $ngram_table = $engine->newNgramsObject(); 1416 1419 $ngram_table_name = $ngram_table->getTableName(); ··· 1422 1425 1423 1426 $length = count(phutil_utf8v($value)); 1424 1427 1425 - if ($length >= 3) { 1428 + if ($raw_token->getOperator() == $op_sub) { 1429 + $is_substring = true; 1430 + } else { 1431 + $is_substring = false; 1432 + } 1433 + 1434 + // If the user specified a substring query for a substring which is 1435 + // shorter than the ngram length, we can't use the ngram index, so 1436 + // don't do a join. We'll fall back to just doing LIKE on the full 1437 + // corpus. 1438 + if ($is_substring) { 1439 + if ($length < 3) { 1440 + continue; 1441 + } 1442 + } 1443 + 1444 + if ($raw_token->isQuoted()) { 1445 + $is_stemmed = false; 1446 + } else { 1447 + $is_stemmed = true; 1448 + } 1449 + 1450 + if ($is_substring) { 1426 1451 $ngrams = $ngram_engine->getNgramsFromString($value, 'query'); 1427 - $prefix = false; 1428 - } else if ($length == 2) { 1429 - $ngrams = $ngram_engine->getNgramsFromString($value, 'prefix'); 1430 - $prefix = false; 1431 1452 } else { 1432 - $ngrams = array(' '.$value); 1433 - $prefix = true; 1453 + $ngrams = $ngram_engine->getNgramsFromString($value, 'index'); 1454 + 1455 + // If this is a stemmed term, only look for ngrams present in both the 1456 + // unstemmed and stemmed variations. 
1457 + if ($is_stemmed) { 1458 + $stem_value = $stemmer->stemToken($value); 1459 + $stem_ngrams = $ngram_engine->getNgramsFromString( 1460 + $stem_value, 1461 + 'index'); 1462 + 1463 + $ngrams = array_intersect($ngrams, $stem_ngrams); 1464 + } 1434 1465 } 1435 1466 1436 1467 foreach ($ngrams as $ngram) { 1437 1468 $flat[] = array( 1438 1469 'table' => $ngram_table_name, 1439 1470 'ngram' => $ngram, 1440 - 'prefix' => $prefix, 1441 1471 ); 1442 1472 } 1443 1473 } ··· 1472 1502 foreach ($flat as $spec) { 1473 1503 $table = $spec['table']; 1474 1504 $ngram = $spec['ngram']; 1475 - $prefix = $spec['prefix']; 1476 1505 1477 1506 $alias = 'ft'.$idx++; 1478 1507 1479 - if ($prefix) { 1480 - $joins[] = qsprintf( 1481 - $conn, 1482 - 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>', 1483 - $table, 1484 - $alias, 1485 - $alias, 1486 - $alias, 1487 - $ngram); 1488 - } else { 1489 - $joins[] = qsprintf( 1490 - $conn, 1491 - 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s', 1492 - $table, 1493 - $alias, 1494 - $alias, 1495 - $alias, 1496 - $ngram); 1497 - } 1508 + $joins[] = qsprintf( 1509 + $conn, 1510 + 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s', 1511 + $table, 1512 + $alias, 1513 + $alias, 1514 + $alias, 1515 + $ngram); 1498 1516 } 1499 1517 1500 1518 $joins[] = qsprintf( ··· 1509 1527 if (!$this->ferretEngine) { 1510 1528 return array(); 1511 1529 } 1530 + 1531 + $ngram_engine = new PhabricatorNgramEngine(); 1532 + $stemmer = new PhutilSearchStemmer(); 1533 + $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; 1512 1534 1513 1535 $where = array(); 1514 1536 foreach ($this->ferretTokens as $fulltext_token) { 1515 1537 $raw_token = $fulltext_token->getToken(); 1516 1538 $value = $raw_token->getValue(); 1517 1539 1518 - $where[] = qsprintf( 1540 + if ($raw_token->getOperator() == $op_sub) { 1541 + $is_substring = true; 1542 + } else { 1543 + $is_substring = false; 1544 + } 1545 + 1546 + // If we're doing substring search, we just 
match against the raw corpus 1547 + // and we're done. 1548 + if ($is_substring) { 1549 + $where[] = qsprintf( 1550 + $conn, 1551 + '(ftfield.rawCorpus LIKE %~)', 1552 + $value); 1553 + continue; 1554 + } 1555 + 1556 + // Otherwise, we need to match against the term corpus and the normal 1557 + // corpus, so that searching for "raw" does not find "strawberry". 1558 + if ($raw_token->isQuoted()) { 1559 + $is_quoted = true; 1560 + $is_stemmed = false; 1561 + } else { 1562 + $is_quoted = false; 1563 + $is_stemmed = true; 1564 + } 1565 + 1566 + $term_constraints = array(); 1567 + 1568 + $term_value = ' '.$ngram_engine->newTermsCorpus($value).' '; 1569 + $term_constraints[] = qsprintf( 1519 1570 $conn, 1520 - '(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)', 1521 - $value, 1522 - $value); 1571 + '(ftfield.termCorpus LIKE %~)', 1572 + $term_value); 1573 + 1574 + if ($is_stemmed) { 1575 + $stem_value = $stemmer->stemToken($value); 1576 + $stem_value = $ngram_engine->newTermsCorpus($stem_value); 1577 + $stem_value = ' '.$stem_value.' '; 1578 + 1579 + $term_constraints[] = qsprintf( 1580 + $conn, 1581 + '(ftfield.normalCorpus LIKE %~)', 1582 + $stem_value); 1583 + } 1584 + 1585 + if ($is_quoted) { 1586 + $where[] = qsprintf( 1587 + $conn, 1588 + '(ftfield.rawCorpus LIKE %~ AND (%Q))', 1589 + $value, 1590 + implode(' OR ', $term_constraints)); 1591 + } else { 1592 + $where[] = qsprintf( 1593 + $conn, 1594 + '(%Q)', 1595 + implode(' OR ', $term_constraints)); 1596 + } 1523 1597 } 1524 1598 1525 1599 return $where;