support tokenizing while keeping common censor chars (#767)

+61 -5

2 changed files

expand all

automod

keyword

tokenize.go

tokenize_test.go

+14 -5

automod/keyword/tokenize.go

··· 12 12 ) 13 13 14 14 var ( 15 - puncChars = regexp.MustCompile(`[[:punct:]]+`) 16 - nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`) 15 + puncChars = regexp.MustCompile(`[[:punct:]]+`) 16 + nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`) 17 + nonTokenCharsSkipCensorChars = regexp.MustCompile(`[^\pL\pN\s#*_-]`) 17 18 ) 18 19 19 20 // Splits free-form text in to tokens, including lower-case, unicode normalization, and some unicode folding. 20 21 // 21 22 // The intent is for this to work similarly to an NLP tokenizer, as might be used in a fulltext search engine, and enable fast matching to a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc. 22 - func TokenizeText(text string) []string { 23 + func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string { 23 24 // this function needs to be re-defined in every function call to prevent a race condition 24 25 normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) 25 - split := strings.ToLower(nonTokenChars.ReplaceAllString(text, " ")) 26 - bare := strings.ToLower(nonTokenChars.ReplaceAllString(split, "")) 26 + split := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(text, " ")) 27 + bare := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(split, "")) 27 28 norm, _, err := transform.String(normFunc, bare) 28 29 if err != nil { 29 30 slog.Warn("unicode normalization error", "err", err) 30 31 norm = bare 31 32 } 32 33 return strings.Fields(norm) 34 + } 35 + 36 + func TokenizeText(text string) []string { 37 + return TokenizeTextWithRegex(text, nonTokenChars) 38 + } 39 + 40 + func TokenizeTextSkippingCensorChars(text string) []string { 41 + return TokenizeTextWithRegex(text, nonTokenCharsSkipCensorChars) 33 42 } 34 43 35 44 func splitIdentRune(c rune) bool {

+47

automod/keyword/tokenize_test.go

··· 1 1 package keyword 2 2 3 3 import ( 4 + "regexp" 4 5 "testing" 5 6 6 7 "github.com/stretchr/testify/assert" ··· 17 18 {text: "Hello, โลก!", out: []string{"hello", "โลก"}}, 18 19 {text: "Gdańsk", out: []string{"gdansk"}}, 19 20 {text: " foo1;bar2,baz3...", out: []string{"foo1", "bar2", "baz3"}}, 21 + {text: "foo*bar", out: []string{"foo", "bar"}}, 22 + {text: "foo-bar", out: []string{"foo", "bar"}}, 23 + {text: "foo_bar", out: []string{"foo", "bar"}}, 20 24 } 21 25 22 26 for _, fix := range fixtures { 23 27 assert.Equal(fix.out, TokenizeText(fix.text)) 28 + } 29 + } 30 + 31 + func TestTokenizeTextWithCensorChars(t *testing.T) { 32 + assert := assert.New(t) 33 + 34 + fixtures := []struct { 35 + text string 36 + out []string 37 + }{ 38 + {text: "", out: []string{}}, 39 + {text: "Hello, โลก!", out: []string{"hello", "โลก"}}, 40 + {text: "Gdańsk", out: []string{"gdansk"}}, 41 + {text: " foo1;bar2,baz3...", out: []string{"foo1", "bar2", "baz3"}}, 42 + {text: "foo*bar,foo&bar", out: []string{"foo*bar", "foo", "bar"}}, 43 + {text: "foo-bar,foo&bar", out: []string{"foo-bar", "foo", "bar"}}, 44 + {text: "foo_bar,foo&bar", out: []string{"foo_bar", "foo", "bar"}}, 45 + {text: "foo#bar,foo&bar", out: []string{"foo#bar", "foo", "bar"}}, 46 + } 47 + 48 + for _, fix := range fixtures { 49 + assert.Equal(fix.out, TokenizeTextSkippingCensorChars(fix.text)) 50 + } 51 + } 52 + 53 + func TestTokenizeTextWithCustomRegex(t *testing.T) { 54 + assert := assert.New(t) 55 + 56 + fixtures := []struct { 57 + text string 58 + out []string 59 + }{ 60 + {text: "", out: []string{}}, 61 + {text: "Hello, โลก!", out: []string{"hello", "โลก"}}, 62 + {text: "Gdańsk", out: []string{"gdansk"}}, 63 + {text: " foo1;bar2,baz3...", out: []string{"foo1", "bar2", "baz3"}}, 64 + {text: "foo*bar", out: []string{"foo", "bar"}}, 65 + {text: "foo&bar,foo*bar", out: []string{"foo&bar", "foo", "bar"}}, 66 + } 67 + 68 + regex := regexp.MustCompile(`[^\pL\pN\s&]`) 69 + for _, fix := range fixtures { 70 + 
assert.Equal(fix.out, TokenizeTextWithRegex(fix.text, regex)) 24 71 } 25 72 } 26 73

Configure Feed

Configure Feed