this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

keyword: hotfix for datarace in tokenization function (#539)

authored by

bnewbold and committed by
GitHub
a245c6aa 7f2ed82f

+2 -1
+2 -1
automod/keyword/tokenize.go
···
14 14   var (
15 15   	puncChars = regexp.MustCompile(`[[:punct:]]+`)
16 16   	nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`)
17    - 	normFunc = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
18 17   )
19 18
20 19   // Splits free-form text in to tokens, including lower-case, unicode normalization, and some unicode folding.
21 20   //
22 21   // The intent is for this to work similarly to an NLP tokenizer, as might be used in a fulltext search engine, and enable fast matching to a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc.
23 22   func TokenizeText(text string) []string {
   23 + 	// this function needs to be re-defined in every function call to prevent a race condition
   24 + 	normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
24 25   	split := strings.ToLower(nonTokenChars.ReplaceAllString(text, " "))
25 26   	bare := strings.ToLower(nonTokenChars.ReplaceAllString(split, ""))
26 27   	norm, _, err := transform.String(normFunc, bare)