// Package-level regular expressions, compiled once at init time. A compiled
// *regexp.Regexp is safe for concurrent use, so sharing these is fine.
//
// The Unicode normalization transformer is deliberately NOT kept here: a
// single shared transformer caused a race condition when used from several
// goroutines, so TokenizeText builds its own instance on every call.
var (
	// puncChars matches one or more consecutive POSIX punctuation characters.
	puncChars = regexp.MustCompile(`[[:punct:]]+`)
	// nonTokenChars matches one or more consecutive characters that are
	// neither a Unicode letter, a Unicode number, nor whitespace.
	nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`)
)
// TokenizeText splits free-form text into tokens, applying lower-casing, Unicode normalization, and some Unicode folding.
//
// The intent is for this to work similarly to an NLP tokenizer, as might be used in a full-text search engine, and to enable fast matching against a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc.
2322func TokenizeText(text string) []string {
2323+ // this function needs to be re-defined in every function call to prevent a race condition
2424+ normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
2425 split := strings.ToLower(nonTokenChars.ReplaceAllString(text, " "))
2526 bare := strings.ToLower(nonTokenChars.ReplaceAllString(split, ""))
2627 norm, _, err := transform.String(normFunc, bare)