Loading up the Forgejo repo on Tangled to test page performance.
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat(code search): replace fuzzy search with union search for indexer (#6947)

Fuzzy searching for code has been known to be problematic (see #5264) and, in my personal opinion, isn't very useful.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6947
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>

authored by

Shiny Nematoda
Shiny Nematoda
and committed by
Gusted
3816db68 cb46a036

+105 -86
+3 -3
modules/git/grep.go
··· 28 28 HighlightedRanges [][3]int 29 29 } 30 30 31 - type grepMode int 31 + type GrepMode int 32 32 33 33 const ( 34 - FixedGrepMode grepMode = iota 34 + FixedGrepMode GrepMode = iota 35 35 FixedAnyGrepMode 36 36 RegExpGrepMode 37 37 ) ··· 43 43 MaxResultLimit int 44 44 MatchesPerFile int // >= git 2.38 45 45 ContextLineNumber int 46 - Mode grepMode 46 + Mode GrepMode 47 47 Filename string 48 48 } 49 49
+12 -11
modules/indexer/code/bleve/bleve.go
··· 40 40 const ( 41 41 unicodeNormalizeName = "unicodeNormalize" 42 42 maxBatchSize = 16 43 - // fuzzyDenominator determines the levenshtein distance per each character of a keyword 44 - fuzzyDenominator = 4 45 - // see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311 46 - maxFuzziness = 2 47 43 ) 48 44 49 45 func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { ··· 260 256 keywordQuery query.Query 261 257 ) 262 258 263 - phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) 264 - phraseQuery.FieldVal = "Content" 265 - phraseQuery.Analyzer = repoIndexerAnalyzer 266 - keywordQuery = phraseQuery 267 - if opts.IsKeywordFuzzy { 268 - phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator) 259 + if opts.Mode == internal.CodeSearchModeUnion { 260 + query := bleve.NewDisjunctionQuery() 261 + for _, field := range strings.Fields(opts.Keyword) { 262 + query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, 0)) 263 + } 264 + keywordQuery = query 265 + } else { 266 + keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, 0) 269 267 } 270 268 271 269 if len(opts.RepoIDs) > 0 { ··· 325 323 for i, hit := range result.Hits { 326 324 startIndex, endIndex := -1, -1 327 325 for _, locations := range hit.Locations["Content"] { 326 + if startIndex != -1 && endIndex != -1 { 327 + break 328 + } 328 329 location := locations[0] 329 330 locationStart := int(location.Start) 330 331 locationEnd := int(location.End) 331 332 if startIndex < 0 || locationStart < startIndex { 332 333 startIndex = locationStart 333 334 } 334 - if endIndex < 0 || locationEnd > endIndex { 335 + if endIndex < 0 && locationEnd > endIndex { 335 336 endIndex = locationEnd 336 337 } 337 338 }
+4 -4
modules/indexer/code/elasticsearch/elasticsearch.go
··· 33 33 esRepoIndexerLatestVersion = 2 34 34 // multi-match-types, currently only 2 types are used 35 35 // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types 36 - esMultiMatchTypeBestFields = "best_fields" 37 - esMultiMatchTypePhrasePrefix = "phrase_prefix" 36 + esMultiMatchTypeBestFields = "best_fields" 37 + esMultiMatchTypePhrase = "phrase" 38 38 ) 39 39 40 40 var _ internal.Indexer = &Indexer{} ··· 334 334 335 335 // Search searches for codes and language stats by given conditions. 336 336 func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { 337 - searchType := esMultiMatchTypePhrasePrefix 338 - if opts.IsKeywordFuzzy { 337 + searchType := esMultiMatchTypePhrase 338 + if opts.Mode == internal.CodeSearchModeUnion { 339 339 searchType = esMultiMatchTypeBestFields 340 340 } 341 341
+2 -2
modules/indexer/code/indexer_test.go
··· 100 100 Page: 1, 101 101 PageSize: 10, 102 102 }, 103 - Filename: kw.Filename, 104 - IsKeywordFuzzy: true, 103 + Filename: kw.Filename, 104 + Mode: SearchModeUnion, 105 105 }) 106 106 require.NoError(t, err) 107 107 assert.Len(t, kw.IDs, int(total))
+15 -1
modules/indexer/code/internal/indexer.go
··· 20 20 Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) 21 21 } 22 22 23 + type CodeSearchMode int 24 + 25 + const ( 26 + CodeSearchModeExact CodeSearchMode = iota 27 + CodeSearchModeUnion 28 + ) 29 + 30 + func (mode CodeSearchMode) String() string { 31 + if mode == CodeSearchModeUnion { 32 + return "union" 33 + } 34 + return "exact" 35 + } 36 + 23 37 type SearchOptions struct { 24 38 RepoIDs []int64 25 39 Keyword string 26 40 Language string 27 41 Filename string 28 42 29 - IsKeywordFuzzy bool 43 + Mode CodeSearchMode 30 44 31 45 db.Paginator 32 46 }
+8 -2
modules/indexer/code/search.go
··· 35 35 36 36 type SearchOptions = internal.SearchOptions 37 37 38 - var CodeSearchOptions = [2]string{"exact", "fuzzy"} 38 + var CodeSearchOptions = [2]string{"exact", "union"} 39 + 40 + type SearchMode = internal.CodeSearchMode 41 + 42 + const ( 43 + SearchModeExact = internal.CodeSearchModeExact 44 + SearchModeUnion = internal.CodeSearchModeUnion 45 + ) 39 46 40 47 func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) { 41 48 startIndex := selectionStartIndex ··· 206 213 } 207 214 208 215 // PerformSearch perform a search on a repository 209 - // if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2 210 216 func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) { 211 217 if opts == nil || len(opts.Keyword) == 0 { 212 218 return 0, nil, nil, nil
+12 -14
routers/web/explore/code.go
··· 37 37 keyword := ctx.FormTrim("q") 38 38 path := ctx.FormTrim("path") 39 39 40 - isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) 41 - if mode := ctx.FormTrim("mode"); len(mode) > 0 { 42 - isFuzzy = mode == "fuzzy" 40 + mode := code_indexer.SearchModeExact 41 + if m := ctx.FormTrim("mode"); m == "union" || 42 + m == "fuzzy" || 43 + ctx.FormBool("fuzzy") { 44 + mode = code_indexer.SearchModeUnion 43 45 } 44 46 45 47 ctx.Data["Keyword"] = keyword 46 48 ctx.Data["Language"] = language 47 - ctx.Data["CodeSearchOptions"] = []string{"exact", "fuzzy"} 48 - if isFuzzy { 49 - ctx.Data["CodeSearchMode"] = "fuzzy" 50 - } else { 51 - ctx.Data["CodeSearchMode"] = "exact" 52 - } 49 + ctx.Data["CodeSearchOptions"] = code_indexer.CodeSearchOptions 50 + ctx.Data["CodeSearchMode"] = mode.String() 53 51 ctx.Data["PageIsViewCode"] = true 54 52 55 53 if keyword == "" { ··· 88 86 89 87 if (len(repoIDs) > 0) || isAdmin { 90 88 total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ 91 - RepoIDs: repoIDs, 92 - Keyword: keyword, 93 - IsKeywordFuzzy: isFuzzy, 94 - Language: language, 95 - Filename: path, 89 + RepoIDs: repoIDs, 90 + Keyword: keyword, 91 + Mode: mode, 92 + Language: language, 93 + Filename: path, 96 94 Paginator: &db.ListOptions{ 97 95 Page: page, 98 96 PageSize: setting.UI.RepoSearchPagingNum,
+31 -20
routers/web/repo/search.go
··· 21 21 22 22 const ( 23 23 ExactSearchMode searchMode = iota 24 - FuzzySearchMode 24 + UnionSearchMode 25 25 RegExpSearchMode 26 26 ) 27 27 28 28 func searchModeFromString(s string) searchMode { 29 29 switch s { 30 30 case "fuzzy", "union": 31 - return FuzzySearchMode 31 + return UnionSearchMode 32 32 case "regexp": 33 33 return RegExpSearchMode 34 34 default: ··· 40 40 switch m { 41 41 case ExactSearchMode: 42 42 return "exact" 43 - case FuzzySearchMode: 44 - return "fuzzy" 43 + case UnionSearchMode: 44 + return "union" 45 45 case RegExpSearchMode: 46 46 return "regexp" 47 47 default: 48 48 panic("cannot happen") 49 + } 50 + } 51 + 52 + func (m searchMode) ToIndexer() code_indexer.SearchMode { 53 + if m == ExactSearchMode { 54 + return code_indexer.SearchModeExact 55 + } 56 + return code_indexer.SearchModeUnion 57 + } 58 + 59 + func (m searchMode) ToGitGrep() git.GrepMode { 60 + switch m { 61 + case RegExpSearchMode: 62 + return git.RegExpGrepMode 63 + case UnionSearchMode: 64 + return git.FixedAnyGrepMode 65 + default: 66 + return git.FixedGrepMode 49 67 } 50 68 } 51 69 ··· 59 77 if modeStr := ctx.FormString("mode"); len(modeStr) > 0 { 60 78 mode = searchModeFromString(modeStr) 61 79 } else if ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) { // for backward compatibility in links 62 - mode = FuzzySearchMode 80 + mode = UnionSearchMode 63 81 } 64 82 65 83 ctx.Data["Keyword"] = keyword ··· 90 108 if setting.Indexer.RepoIndexerEnabled { 91 109 var err error 92 110 total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ 93 - RepoIDs: []int64{ctx.Repo.Repository.ID}, 94 - Keyword: keyword, 95 - IsKeywordFuzzy: mode == FuzzySearchMode, 96 - Language: language, 97 - Filename: path, 111 + RepoIDs: []int64{ctx.Repo.Repository.ID}, 112 + Keyword: keyword, 113 + Mode: mode.ToIndexer(), 114 + Language: language, 115 + Filename: path, 98 116 Paginator: &db.ListOptions{ 99 117 Page: page, 100 118 PageSize: 
setting.UI.RepoSearchPagingNum, ··· 110 128 ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx) 111 129 } 112 130 } else { 113 - grepOpt := git.GrepOptions{ 131 + res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{ 114 132 ContextLineNumber: 1, 115 133 RefName: ctx.Repo.RefName, 116 134 Filename: path, 117 - } 118 - switch mode { 119 - case FuzzySearchMode: 120 - grepOpt.Mode = git.FixedAnyGrepMode 121 - ctx.Data["CodeSearchMode"] = "union" 122 - case RegExpSearchMode: 123 - grepOpt.Mode = git.RegExpGrepMode 124 - } 125 - res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, grepOpt) 135 + Mode: mode.ToGitGrep(), 136 + }) 126 137 if err != nil { 127 138 ctx.ServerError("GrepSearch", err) 128 139 return
+12 -14
routers/web/user/code.go
··· 41 41 keyword := ctx.FormTrim("q") 42 42 path := ctx.FormTrim("path") 43 43 44 - isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) 45 - if mode := ctx.FormTrim("mode"); len(mode) > 0 { 46 - isFuzzy = mode == "fuzzy" 44 + mode := code_indexer.SearchModeExact 45 + if m := ctx.FormTrim("mode"); m == "union" || 46 + m == "fuzzy" || 47 + ctx.FormBool("fuzzy") { 48 + mode = code_indexer.SearchModeUnion 47 49 } 48 50 49 51 ctx.Data["Keyword"] = keyword 50 52 ctx.Data["Language"] = language 51 - ctx.Data["CodeSearchOptions"] = []string{"exact", "fuzzy"} 52 - if isFuzzy { 53 - ctx.Data["CodeSearchMode"] = "fuzzy" 54 - } else { 55 - ctx.Data["CodeSearchMode"] = "exact" 56 - } 53 + ctx.Data["CodeSearchOptions"] = code_indexer.CodeSearchOptions 54 + ctx.Data["CodeSearchMode"] = mode.String() 57 55 ctx.Data["IsCodePage"] = true 58 56 59 57 if keyword == "" { ··· 85 83 86 84 if len(repoIDs) > 0 { 87 85 total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ 88 - RepoIDs: repoIDs, 89 - Keyword: keyword, 90 - IsKeywordFuzzy: isFuzzy, 91 - Language: language, 92 - Filename: path, 86 + RepoIDs: repoIDs, 87 + Keyword: keyword, 88 + Mode: mode, 89 + Language: language, 90 + Filename: path, 93 91 Paginator: &db.ListOptions{ 94 92 Page: page, 95 93 PageSize: setting.UI.RepoSearchPagingNum,
+6 -15
tests/integration/repo_search_test.go
··· 82 82 testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}, indexer) 83 83 testSearch(t, "/user2/glob/search?q=loren&page=1&mode=exact", []string{"a.txt"}, indexer) 84 84 85 - if indexer { 86 - // fuzzy search: matches both file3 (x/b.txt) and file1 (a.txt) 87 - // when indexer is enabled 88 - testSearch(t, "/user2/glob/search?q=file3&mode=fuzzy&page=1", []string{"x/b.txt", "a.txt"}, indexer) 89 - testSearch(t, "/user2/glob/search?q=file4&mode=fuzzy&page=1", []string{"x/b.txt", "a.txt"}, indexer) 90 - testSearch(t, "/user2/glob/search?q=file5&mode=fuzzy&page=1", []string{"x/b.txt", "a.txt"}, indexer) 91 - } else { 92 - // fuzzy search: Union/OR of all the keywords 93 - // when indexer is disabled 94 - testSearch(t, "/user2/glob/search?q=file3+file1&mode=union&page=1", []string{"a.txt", "x/b.txt"}, indexer) 95 - testSearch(t, "/user2/glob/search?q=file4&mode=union&page=1", []string{}, indexer) 96 - testSearch(t, "/user2/glob/search?q=file5&mode=union&page=1", []string{}, indexer) 97 - } 85 + // union search: Union/OR of all the keywords 86 + testSearch(t, "/user2/glob/search?q=file3+file1&mode=union&page=1", []string{"a.txt", "x/b.txt"}, indexer) 87 + testSearch(t, "/user2/glob/search?q=file4&mode=union&page=1", []string{}, indexer) 88 + testSearch(t, "/user2/glob/search?q=file5&mode=union&page=1", []string{}, indexer) 98 89 99 90 testSearch(t, "/user2/glob/search?q=file3&page=1&mode=exact", []string{"x/b.txt"}, indexer) 100 91 testSearch(t, "/user2/glob/search?q=file4&page=1&mode=exact", []string{}, indexer) ··· 121 112 }) 122 113 123 114 if indexer { 124 - assert.EqualValues(t, []string{"exact", "fuzzy"}, dropdownOptions) 115 + assert.EqualValues(t, []string{"exact", "union"}, dropdownOptions) 125 116 } else { 126 117 assert.EqualValues(t, []string{"exact", "union", "regexp"}, dropdownOptions) 127 118 } 128 119 129 120 filenames := resultFilenames(t, doc) 130 - assert.EqualValues(t, expected, filenames) 121 + assert.ElementsMatch(t, expected, 
filenames) 131 122 }