Fix issues indexer document mapping (#25619)

Fix regression of #5363 (so long ago).

The old code defined a document mapping for `issueIndexerDocType` and
assigned it to `BleveIndexerData` as its type. (`BleveIndexerData` was
renamed to `IndexerData` in #25174, but nothing else changed.) However,
the old code never used `BleveIndexerData`; it wrote the index with an
anonymous struct type. Nonetheless, bleve uses the default auto-mapping
for structs it doesn't know, so the indexer still worked. This means the
custom document mapping was always dead code.

The custom document mapping is not useless: it can reduce index storage.
This PR brings it back and disables the default mapping to prevent the
same mistake from happening again. Since `IndexerData`
(`BleveIndexerData`) has JSON tags, and bleve uses them first, we should
use `repo_id` as the field name instead of `RepoID`.

I did a test to compare the storage size before and after this, with
about 3k real comments that were migrated from some public repos.

Before:

```text
[ 160] .
├── [ 42] index_meta.json
├── [ 13] rupture_meta.json
└── [ 128] store
    ├── [6.9M] 00000000005d.zap
    └── [256K] root.bolt
```

After:

```text
[ 160] .
├── [ 42] index_meta.json
├── [ 13] rupture_meta.json
└── [ 128] store
    ├── [3.5M] 000000000065.zap
    └── [256K] root.bolt
```

It saves about half the storage space.

---------

Co-authored-by: Giteabot <teabot@gitea.io>

authored by

Jason Song

Giteabot and committed by

GitHub 3 years ago 99586425 dae022ab

+12 -20

1 changed file

expand all

modules

indexer

issues

bleve

bleve.go

+12 -20

modules/indexer/issues/bleve/bleve.go

··· 23 23 const ( 24 24 issueIndexerAnalyzer = "issueIndexer" 25 25 issueIndexerDocType = "issueIndexerDocType" 26 - issueIndexerLatestVersion = 2 26 + issueIndexerLatestVersion = 3 27 27 ) 28 28 29 29 // numericEqualityQuery a numeric equality query for the given value and field ··· 67 67 docMapping := bleve.NewDocumentMapping() 68 68 69 69 numericFieldMapping := bleve.NewNumericFieldMapping() 70 + numericFieldMapping.Store = false 70 71 numericFieldMapping.IncludeInAll = false 71 - docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping) 72 + docMapping.AddFieldMappingsAt("repo_id", numericFieldMapping) 72 73 73 74 textFieldMapping := bleve.NewTextFieldMapping() 74 75 textFieldMapping.Store = false 75 76 textFieldMapping.IncludeInAll = false 76 - docMapping.AddFieldMappingsAt("Title", textFieldMapping) 77 - docMapping.AddFieldMappingsAt("Content", textFieldMapping) 78 - docMapping.AddFieldMappingsAt("Comments", textFieldMapping) 77 + docMapping.AddFieldMappingsAt("title", textFieldMapping) 78 + docMapping.AddFieldMappingsAt("content", textFieldMapping) 79 + docMapping.AddFieldMappingsAt("comments", textFieldMapping) 79 80 80 81 if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { 81 82 return nil, err ··· 91 92 mapping.DefaultAnalyzer = issueIndexerAnalyzer 92 93 mapping.AddDocumentMapping(issueIndexerDocType, docMapping) 93 94 mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) 95 + mapping.DefaultMapping = bleve.NewDocumentDisabledMapping() // disable default mapping, avoid indexing unexpected structs 94 96 95 97 return mapping, nil 96 98 } ··· 116 118 func (b *Indexer) Index(_ context.Context, issues []*internal.IndexerData) error { 117 119 batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) 118 120 for _, issue := range issues { 119 - if err := batch.Index(indexer_internal.Base36(issue.ID), struct { 120 - RepoID int64 121 - Title string 122 - Content string 123 - Comments []string 124 - }{ 125 - RepoID: 
issue.RepoID, 126 - Title: issue.Title, 127 - Content: issue.Content, 128 - Comments: issue.Comments, 129 - }); err != nil { 121 + if err := batch.Index(indexer_internal.Base36(issue.ID), (*IndexerData)(issue)); err != nil { 130 122 return err 131 123 } 132 124 } ··· 149 141 func (b *Indexer) Search(ctx context.Context, keyword string, repoIDs []int64, limit, start int) (*internal.SearchResult, error) { 150 142 var repoQueriesP []*query.NumericRangeQuery 151 143 for _, repoID := range repoIDs { 152 - repoQueriesP = append(repoQueriesP, numericEqualityQuery(repoID, "RepoID")) 144 + repoQueriesP = append(repoQueriesP, numericEqualityQuery(repoID, "repo_id")) 153 145 } 154 146 repoQueries := make([]query.Query, len(repoQueriesP)) 155 147 for i, v := range repoQueriesP { ··· 159 151 indexerQuery := bleve.NewConjunctionQuery( 160 152 bleve.NewDisjunctionQuery(repoQueries...), 161 153 bleve.NewDisjunctionQuery( 162 - newMatchPhraseQuery(keyword, "Title", issueIndexerAnalyzer), 163 - newMatchPhraseQuery(keyword, "Content", issueIndexerAnalyzer), 164 - newMatchPhraseQuery(keyword, "Comments", issueIndexerAnalyzer), 154 + newMatchPhraseQuery(keyword, "title", issueIndexerAnalyzer), 155 + newMatchPhraseQuery(keyword, "content", issueIndexerAnalyzer), 156 + newMatchPhraseQuery(keyword, "comments", issueIndexerAnalyzer), 165 157 )) 166 158 search := bleve.NewSearchRequestOptions(indexerQuery, limit, start, false) 167 159 search.SortBy([]string{"-_score"})

Configure Feed

Configure Feed