@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Improve search highlighting for CJK and substring queries

Summary:
Fixes T12995. Currently, the result highlighter (which shows //where// terms matched) only works in "term" mode, not in "substring" mode.

Provide better feedback and behavior:

- When a term is a substring term, color it a little differently and add a tooltip. (This is partly to make it easier to debug/diagnose things, probably not enormously valuable to users.)
- When a term is a substring term, highlight it anywhere in the results.

Test Plan:
Queried for Latin and CJK terms.

Here is CJK being highlighted:

{F5192195}

Here is substring vs non-substring implicit behavior:

{F5192196}

Here's ONLY terms being highlighted:

{F5192198}

Here's terms and substrings, since the query now has a substring:

{F5192201}

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T12995

Differential Revision: https://secure.phabricator.com/D18635

+100 -54
+4
src/applications/search/query/PhabricatorFulltextToken.php
··· 56 56 $shade = PHUITagView::COLOR_RED; 57 57 $icon = 'fa-minus'; 58 58 break; 59 + case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING: 60 + $tip = pht('Substring Search'); 61 + $shade = PHUITagView::COLOR_VIOLET; 62 + break; 59 63 default: 60 64 $shade = PHUITagView::COLOR_BLUE; 61 65 break;
+1 -1
src/applications/search/query/PhabricatorSearchApplicationSearchEngine.php
··· 261 261 foreach ($results as $phid => $handle) { 262 262 $view = id(new PhabricatorSearchResultView()) 263 263 ->setHandle($handle) 264 - ->setQuery($query) 264 + ->setTokens($fulltext_tokens) 265 265 ->setObject(idx($objects, $phid)) 266 266 ->render(); 267 267 $list->addItem($view);
+95 -53
src/applications/search/view/PhabricatorSearchResultView.php
··· 3 3 final class PhabricatorSearchResultView extends AphrontView { 4 4 5 5 private $handle; 6 - private $query; 7 6 private $object; 7 + private $tokens; 8 8 9 9 public function setHandle(PhabricatorObjectHandle $handle) { 10 10 $this->handle = $handle; 11 11 return $this; 12 12 } 13 13 14 - public function setQuery(PhabricatorSavedQuery $query) { 15 - $this->query = $query; 14 + public function setTokens(array $tokens) { 15 + assert_instances_of($tokens, 'PhabricatorFulltextToken'); 16 + $this->tokens = $tokens; 16 17 return $this; 17 18 } 18 19 ··· 56 57 * matched their query. 57 58 */ 58 59 private function emboldenQuery($str) { 59 - $query = $this->query->getParameter('query'); 60 + $tokens = $this->tokens; 60 61 61 - if (!strlen($query) || !strlen($str)) { 62 + if (!$tokens) { 62 63 return $str; 63 64 } 64 65 65 - // This algorithm is safe but not especially fast, so don't bother if 66 - // we're dealing with a lot of data. This mostly prevents silly/malicious 67 - // queries from doing anything bad. 68 - if (strlen($query) + strlen($str) > 2048) { 66 + if (count($tokens) > 16) { 69 67 return $str; 70 68 } 71 69 72 - // Keep track of which characters we're going to make bold. This is 73 - // byte oriented, but we'll make sure we don't put a bold in the middle 74 - // of a character later. 75 - $bold = array_fill(0, strlen($str), false); 70 + if (!strlen($str)) { 71 + return $str; 72 + } 76 73 77 - // Split the query into words. 78 - $parts = preg_split('/ +/', $query); 74 + if (strlen($str) > 2048) { 75 + return $str; 76 + } 79 77 80 - // Find all occurrences of each word, and mark them to be emboldened. 
81 - foreach ($parts as $part) { 82 - $part = trim($part); 83 - $part = trim($part, '"+'); 84 - if (!strlen($part)) { 85 - continue; 78 + $patterns = array(); 79 + foreach ($tokens as $token) { 80 + $raw_token = $token->getToken(); 81 + $operator = $raw_token->getOperator(); 82 + 83 + $value = $raw_token->getValue(); 84 + 85 + switch ($operator) { 86 + case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING: 87 + $patterns[] = '(('.preg_quote($value).'))ui'; 88 + break; 89 + case PhutilSearchQueryCompiler::OPERATOR_AND: 90 + $patterns[] = '((?<=\W|^)('.preg_quote($value).')(?=\W|\z))ui'; 91 + break; 92 + default: 93 + // Don't highlight anything else, particularly "NOT". 94 + break; 86 95 } 96 + } 87 97 98 + // Find all matches for all query terms in the document title, then reduce 99 + // them to a map from offsets to highlighted sequence lengths. If two terms 100 + // match at the same position, we choose the longer one. 101 + $all_matches = array(); 102 + foreach ($patterns as $pattern) { 88 103 $matches = null; 89 - $has_matches = preg_match_all( 90 - '/(?:^|\b)('.preg_quote($part, '/').')/i', 104 + $ok = preg_match_all( 105 + $pattern, 91 106 $str, 92 107 $matches, 93 108 PREG_OFFSET_CAPTURE); 94 - 95 - if (!$has_matches) { 109 + if (!$ok) { 96 110 continue; 97 111 } 98 112 99 - // Flag the matching part of the range for boldening. 100 113 foreach ($matches[1] as $match) { 101 - $offset = $match[1]; 102 - for ($ii = 0; $ii < strlen($match[0]); $ii++) { 103 - $bold[$offset + $ii] = true; 114 + $match_text = $match[0]; 115 + $match_offset = $match[1]; 116 + 117 + if (!isset($all_matches[$match_offset])) { 118 + $all_matches[$match_offset] = 0; 104 119 } 120 + 121 + $all_matches[$match_offset] = max( 122 + $all_matches[$match_offset], 123 + strlen($match_text)); 105 124 } 106 125 } 107 126 108 - // Split the string into ranges, applying bold styling as required. 
109 - $out = array(); 110 - $buf = ''; 111 - $pos = 0; 112 - $is_bold = false; 127 + // Go through the string one display glyph at a time. If a glyph starts 128 + // on a highlighted byte position, turn on highlighting for the nubmer 129 + // of matching bytes. If a query searches for "e" and the document contains 130 + // an "e" followed by a bunch of combining marks, this will correctly 131 + // highlight the entire glyph. 132 + $parts = array(); 133 + $highlight = 0; 134 + $offset = 0; 135 + foreach (phutil_utf8v_combined($str) as $character) { 136 + $length = strlen($character); 137 + 138 + if (isset($all_matches[$offset])) { 139 + $highlight = $all_matches[$offset]; 140 + } 113 141 114 - // Make sure this is UTF8 because phutil_utf8v() will explode if it isn't. 115 - $str = phutil_utf8ize($str); 116 - foreach (phutil_utf8v($str) as $chr) { 117 - if ($bold[$pos] != $is_bold) { 118 - if (strlen($buf)) { 119 - if ($is_bold) { 120 - $out[] = phutil_tag('strong', array(), $buf); 121 - } else { 122 - $out[] = $buf; 123 - } 124 - $buf = ''; 142 + if ($highlight > 0) { 143 + $is_highlighted = true; 144 + $highlight -= $length; 145 + } else { 146 + $is_highlighted = false; 147 + } 148 + 149 + $parts[] = array( 150 + 'text' => $character, 151 + 'highlighted' => $is_highlighted, 152 + ); 153 + 154 + $offset += $length; 155 + } 156 + 157 + // Combine all the sequences together so we aren't emitting a tag around 158 + // every individual character. 159 + $last = null; 160 + foreach ($parts as $key => $part) { 161 + if ($last !== null) { 162 + if ($part['highlighted'] == $parts[$last]['highlighted']) { 163 + $parts[$last]['text'] .= $part['text']; 164 + unset($parts[$key]); 165 + continue; 125 166 } 126 - $is_bold = !$is_bold; 127 167 } 128 - $buf .= $chr; 129 - $pos += strlen($chr); 168 + 169 + $last = $key; 130 170 } 131 171 132 - if (strlen($buf)) { 133 - if ($is_bold) { 134 - $out[] = phutil_tag('strong', array(), $buf); 172 + // Finally, add tags. 
173 + $result = array(); 174 + foreach ($parts as $part) { 175 + if ($part['highlighted']) { 176 + $result[] = phutil_tag('strong', array(), $part['text']); 135 177 } else { 136 - $out[] = $buf; 178 + $result[] = $part['text']; 137 179 } 138 180 } 139 181 140 - return $out; 182 + return $result; 141 183 } 142 184 143 185 }