cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍈
charm leaflet readability golang
// Term Frequency-Inverse Document Frequency search model for notes
package documents

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"time"
)

// DocKind identifies the type of content a document represents.
type DocKind int64

const (
	NoteDoc DocKind = iota
	ArticleDoc
	MovieDoc
	BookDoc
	TVDoc
)

// Document is a single searchable item (note, article, movie, book, or TV entry).
type Document struct {
	ID        int64
	Title     string
	Body      string
	CreatedAt time.Time
	DocKind   int64
}

// Posting records how often a term occurs in one document.
type Posting struct {
	DocID int64
	TF    int
}

// Index is an inverted index mapping each term to its postings.
type Index struct {
	Postings   map[string][]Posting
	DocLengths map[int64]int
	NumDocs    int
}

// Result is a single search hit with its relevance score.
type Result struct {
	DocID int64
	Score float64
}

// Searchable is implemented by anything that can answer ranked queries.
type Searchable interface {
	Search(query string, limit int) ([]Result, error)
}

// Tokenizer handles text tokenization and normalization
type Tokenizer struct {
	pattern *regexp.Regexp
}

// NewTokenizer creates a new tokenizer with Unicode-aware word/number matching
func NewTokenizer() *Tokenizer {
	return &Tokenizer{
		pattern: regexp.MustCompile(`\p{L}+\p{M}*|\p{N}+`),
	}
}

// Tokenize splits text into normalized tokens (lowercase words and numbers)
func (t *Tokenizer) Tokenize(text string) []string {
	lowered := strings.ToLower(text)
	return t.pattern.FindAllString(lowered, -1)
}

// TokenFrequency computes term frequency map for tokens
func TokenFrequency(tokens []string) map[string]int {
	freq := make(map[string]int)
	for _, token := range tokens {
		freq[token]++
	}
	return freq
}

// BuildIndex constructs a TF-IDF index from a collection of documents
func BuildIndex(docs []Document) *Index {
	idx := &Index{
		Postings:   make(map[string][]Posting),
		DocLengths: make(map[int64]int),
		NumDocs:    0,
	}

	tokenizer := NewTokenizer()

	for _, doc := range docs {
		text := doc.Title + " " + doc.Body
		tokens := tokenizer.Tokenize(text)

		idx.NumDocs++
		idx.DocLengths[doc.ID] = len(tokens)

		freq := TokenFrequency(tokens)

		for term, tf := range freq {
			idx.Postings[term] = append(idx.Postings[term], Posting{
				DocID: doc.ID,
				TF:    tf,
			})
		}
	}

	return idx
}

// Search performs TF-IDF ranked search on the index
func (idx *Index) Search(query string, limit int) ([]Result, error) {
	tokenizer := NewTokenizer()
	queryTokens := tokenizer.Tokenize(query)

	if len(queryTokens) == 0 {
		return []Result{}, nil
	}

	scores := make(map[int64]float64)

	for _, term := range queryTokens {
		postings, exists := idx.Postings[term]
		if !exists {
			continue
		}

		df := len(postings)
		idf := math.Log(float64(idx.NumDocs) / float64(df))

		for _, posting := range postings {
			tf := float64(posting.TF)
			scores[posting.DocID] += tf * idf
		}
	}

	results := make([]Result, 0, len(scores))
	for docID, score := range scores {
		results = append(results, Result{
			DocID: docID,
			Score: score,
		})
	}

	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].DocID > results[j].DocID
	})

	if limit > 0 && limit < len(results) {
		results = results[:limit]
	}

	return results, nil
}
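For reference, a minimal usage sketch (not part of the file above; the import path and sample documents are illustrative). BuildIndex tokenizes each document's title and body into an inverted index, and Search scores a document as the sum over query terms of tf * ln(NumDocs/df), breaking score ties by higher DocID:

package main

import (
	"fmt"

	"example.com/app/documents" // hypothetical import path for the package above
)

func main() {
	docs := []documents.Document{
		{ID: 1, Title: "reading list", Body: "articles to read this week"},
		{ID: 2, Title: "notes on tf-idf", Body: "rare terms score higher than common ones"},
	}

	idx := documents.BuildIndex(docs)

	// Top 5 matches, best score first. Search currently never returns a
	// non-nil error, but the Searchable interface permits implementations that do.
	results, err := idx.Search("tf-idf", 5)
	if err != nil {
		panic(err)
	}
	for _, r := range results {
		fmt.Printf("doc %d: %.3f\n", r.DocID, r.Score)
	}
}

Note that the query goes through the same Tokenizer as the documents, so "tf-idf" splits into the tokens "tf" and "idf"; there is no stemming, and a term appearing in every document gets an idf of ln(1) = 0, contributing nothing to the score.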