package logstorage

import (
	"sync"
	"unicode"
	"unicode/utf8"
)

// tokenizeStrings splits every string from a into word tokens and appends
// to dst each token that has not been produced earlier during this call.
// The extended dst is returned.
//
// A string equal to the one right before it in a is skipped, since it cannot
// contribute new tokens.
//
// Callers must not rely on the order of the returned tokens.
func tokenizeStrings(dst, a []string) []string {
	tok := getTokenizer()
	for idx := range a {
		if idx > 0 && a[idx] == a[idx-1] {
			// Identical to the previous string, which has been tokenized already.
			continue
		}
		dst = tok.tokenizeString(dst, a[idx])
	}
	putTokenizer(tok)
	return dst
}

// tokenizer extracts unique word tokens from strings.
type tokenizer struct {
	// m holds the tokens registered since the last reset.
	m map[string]struct{}
}

// reset forgets all the tokens registered so far.
func (t *tokenizer) reset() {
	clear(t.m)
}

// addToken appends token to dst unless it is already registered in t.m.
// A newly seen token is registered before being appended.
func (t *tokenizer) addToken(dst []string, token string) []string {
	if _, seen := t.m[token]; seen {
		return dst
	}
	t.m[token] = struct{}{}
	return append(dst, token)
}

// tokenizeString appends to dst the tokens from s that are not registered
// in t yet and returns the result.
//
// ASCII-only strings are scanned byte by byte; any other string is handed
// over to tokenizeStringUnicode.
func (t *tokenizer) tokenizeString(dst []string, s string) []string {
	if !isASCII(s) {
		// Slow path: s contains non-ASCII bytes.
		return t.tokenizeStringUnicode(dst, s)
	}

	// Fast path: every byte of s is an ASCII char.
	n := len(s)
	pos := 0
	for pos < n {
		// Skip the separators in front of the next token.
		for pos < n && !isTokenChar(s[pos]) {
			pos++
		}
		if pos >= n {
			// Only separators were left.
			break
		}

		// Consume the token chars.
		begin := pos
		pos++
		for pos < n && isTokenChar(s[pos]) {
			pos++
		}

		dst = t.addToken(dst, s[begin:pos])
	}
	return dst
}

// tokenizeStringUnicode is the rune-aware variant of tokenizeString.
// It decodes s rune by rune and classifies runes with isTokenRune.
func (t *tokenizer) tokenizeStringUnicode(dst []string, s string) []string {
	n := len(s)
	pos := 0
	for pos < n {
		// Skip the separator runes in front of the next token.
		begin := -1
		for pos < n {
			r, size := utf8.DecodeRuneInString(s[pos:])
			if isTokenRune(r) {
				begin = pos
				pos += size
				break
			}
			pos += size
		}
		if begin < 0 {
			// Only separators were left.
			break
		}

		// Consume the token runes.
		for pos < n {
			r, size := utf8.DecodeRuneInString(s[pos:])
			if !isTokenRune(r) {
				break
			}
			pos += size
		}

		dst = t.addToken(dst, s[begin:pos])
	}
	return dst
}

// isASCII reports whether s consists solely of bytes below utf8.RuneSelf.
func isASCII(s string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] >= utf8.RuneSelf {
			return false
		}
	}
	return true
}

// isTokenChar reports whether the ASCII char c may be a part of a token:
// a latin letter, a decimal digit or an underscore.
func isTokenChar(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9',
		c == '_':
		return true
	}
	return false
}

// isTokenRune reports whether the rune c may be a part of a token:
// a unicode letter, a unicode digit or an underscore.
func isTokenRune(c rune) bool {
	if c == '_' {
		return true
	}
	return unicode.IsLetter(c) || unicode.IsDigit(c)
}

// getTokenizer returns a tokenizer from tokenizerPool, or a newly created one
// when the pool has nothing to offer.
//
// Hand the tokenizer back via putTokenizer when it is no longer needed.
func getTokenizer() *tokenizer {
	if pooled := tokenizerPool.Get(); pooled != nil {
		return pooled.(*tokenizer)
	}
	return &tokenizer{
		m: make(map[string]struct{}),
	}
}

// putTokenizer resets t and stores it in tokenizerPool for later reuse.
func putTokenizer(t *tokenizer) {
	t.reset()
	tokenizerPool.Put(t)
}

// tokenizerPool keeps tokenizer objects for reuse by getTokenizer.
var tokenizerPool sync.Pool

// tokensBuf is a poolable buffer of tokens.
type tokensBuf struct {
	A []string
}

// reset zeroes the stored strings and truncates A to zero length,
// keeping its capacity.
func (tb *tokensBuf) reset() {
	items := tb.A
	clear(items)
	tb.A = items[:0]
}

// getTokensBuf returns a tokensBuf from tokensBufPool, or a newly created one
// when the pool has nothing to offer.
//
// Hand the buffer back via putTokensBuf when it is no longer needed.
func getTokensBuf() *tokensBuf {
	if pooled := tokensBufPool.Get(); pooled != nil {
		return pooled.(*tokensBuf)
	}
	return &tokensBuf{}
}

// putTokensBuf resets tb and stores it in tokensBufPool for later reuse.
func putTokensBuf(tb *tokensBuf) {
	tb.reset()
	tokensBufPool.Put(tb)
}

// tokensBufPool keeps tokensBuf objects for reuse by getTokensBuf.
var tokensBufPool sync.Pool