2023-06-20 05:55:12 +00:00
|
|
|
package logstorage
|
|
|
|
|
|
|
|
import (
|
|
|
|
"sync"
|
|
|
|
"unicode"
|
2024-05-12 14:33:29 +00:00
|
|
|
"unicode/utf8"
|
2023-06-20 05:55:12 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// tokenizeStrings extracts word tokens from a, appends them to dst and returns the result.
|
2024-05-12 14:33:29 +00:00
|
|
|
//
|
|
|
|
// the order of returned tokens is unspecified.
|
2023-06-20 05:55:12 +00:00
|
|
|
func tokenizeStrings(dst, a []string) []string {
|
|
|
|
t := getTokenizer()
|
|
|
|
for i, s := range a {
|
|
|
|
if i > 0 && s == a[i-1] {
|
|
|
|
// This string has been already tokenized
|
|
|
|
continue
|
|
|
|
}
|
2024-05-12 14:33:29 +00:00
|
|
|
dst = t.tokenizeString(dst, s)
|
2023-06-20 05:55:12 +00:00
|
|
|
}
|
|
|
|
putTokenizer(t)
|
|
|
|
|
|
|
|
return dst
|
|
|
|
}
|
|
|
|
|
|
|
|
// tokenizer splits strings into word tokens and remembers which tokens
// it has already emitted, so that every token is reported only once
// between two reset calls.
type tokenizer struct {
	// m is the set of tokens emitted since the last reset.
	m map[string]struct{}
}

// reset forgets all the tokens seen so far.
//
// The map itself is kept, so it can be refilled by subsequent tokenize calls.
func (t *tokenizer) reset() {
	seen := t.m
	clear(seen)
}
|
|
|
|
|
|
|
|
func (t *tokenizer) tokenizeString(dst []string, s string) []string {
|
|
|
|
if !isASCII(s) {
|
|
|
|
// Slow path - s contains unicode chars
|
|
|
|
return t.tokenizeStringUnicode(dst, s)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fast path for ASCII s
|
2023-06-20 05:55:12 +00:00
|
|
|
m := t.m
|
2024-05-12 14:33:29 +00:00
|
|
|
i := 0
|
|
|
|
for i < len(s) {
|
|
|
|
// Search for the next token.
|
|
|
|
start := len(s)
|
|
|
|
for i < len(s) {
|
|
|
|
if !isTokenChar(s[i]) {
|
|
|
|
i++
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
start = i
|
|
|
|
i++
|
|
|
|
break
|
|
|
|
}
|
|
|
|
// Search for the end of the token.
|
|
|
|
end := len(s)
|
|
|
|
for i < len(s) {
|
|
|
|
if isTokenChar(s[i]) {
|
|
|
|
i++
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
end = i
|
|
|
|
i++
|
|
|
|
break
|
|
|
|
}
|
|
|
|
if end <= start {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
// Register the token.
|
|
|
|
token := s[start:end]
|
|
|
|
if _, ok := m[token]; !ok {
|
|
|
|
m[token] = struct{}{}
|
|
|
|
dst = append(dst, token)
|
|
|
|
}
|
2023-06-20 05:55:12 +00:00
|
|
|
}
|
2024-05-12 14:33:29 +00:00
|
|
|
return dst
|
2023-06-20 05:55:12 +00:00
|
|
|
}
|
|
|
|
|
2024-05-12 14:33:29 +00:00
|
|
|
func (t *tokenizer) tokenizeStringUnicode(dst []string, s string) []string {
|
|
|
|
m := t.m
|
2023-06-20 05:55:12 +00:00
|
|
|
for len(s) > 0 {
|
|
|
|
// Search for the next token.
|
2024-05-12 14:33:29 +00:00
|
|
|
n := len(s)
|
|
|
|
for offset, r := range s {
|
|
|
|
if isTokenRune(r) {
|
|
|
|
n = offset
|
2023-06-20 05:55:12 +00:00
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
2024-05-12 14:33:29 +00:00
|
|
|
s = s[n:]
|
|
|
|
// Search for the end of the token.
|
|
|
|
n = len(s)
|
|
|
|
for offset, r := range s {
|
|
|
|
if !isTokenRune(r) {
|
|
|
|
n = offset
|
2023-06-20 05:55:12 +00:00
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
2024-05-12 14:33:29 +00:00
|
|
|
if n == 0 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
// Register the token
|
|
|
|
token := s[:n]
|
|
|
|
s = s[n:]
|
|
|
|
if _, ok := m[token]; !ok {
|
|
|
|
m[token] = struct{}{}
|
|
|
|
dst = append(dst, token)
|
2023-06-20 05:55:12 +00:00
|
|
|
}
|
|
|
|
}
|
2024-05-12 14:33:29 +00:00
|
|
|
return dst
|
|
|
|
}
|
|
|
|
|
|
|
|
// isASCII reports whether every byte of s is below utf8.RuneSelf,
// i.e. whether s consists solely of single-byte ASCII chars.
//
// It returns true for an empty string.
func isASCII(s string) bool {
	for pos := 0; pos < len(s); pos++ {
		if b := s[pos]; b >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
|
|
|
|
|
|
|
|
// isTokenChar reports whether the ASCII byte c may be a part of a token:
// a latin letter, a decimal digit or an underscore.
func isTokenChar(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	case c == '_':
		return true
	}
	return false
}
|
|
|
|
|
|
|
|
// isTokenRune reports whether the rune c may be a part of a token:
// an underscore, a unicode letter or a unicode decimal digit.
func isTokenRune(c rune) bool {
	if c == '_' {
		return true
	}
	return unicode.IsLetter(c) || unicode.IsDigit(c)
}
|
|
|
|
|
|
|
|
func getTokenizer() *tokenizer {
|
|
|
|
v := tokenizerPool.Get()
|
|
|
|
if v == nil {
|
|
|
|
return &tokenizer{
|
|
|
|
m: make(map[string]struct{}),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return v.(*tokenizer)
|
|
|
|
}
|
|
|
|
|
|
|
|
func putTokenizer(t *tokenizer) {
|
|
|
|
t.reset()
|
|
|
|
tokenizerPool.Put(t)
|
|
|
|
}
|
|
|
|
|
|
|
|
// tokenizerPool holds *tokenizer values for reuse; it is filled by putTokenizer
// and drained by getTokenizer.
var tokenizerPool sync.Pool
|
|
|
|
|
|
|
|
// tokensBuf is a reusable buffer of string tokens.
type tokensBuf struct {
	// A holds the buffered tokens.
	A []string
}

// reset empties tb while keeping the capacity of tb.A for reuse.
//
// The stored strings are zeroed first, so tb.A no longer references them.
func (tb *tokensBuf) reset() {
	items := tb.A
	clear(items)
	tb.A = items[:0]
}
|
|
|
|
|
|
|
|
func getTokensBuf() *tokensBuf {
|
|
|
|
v := tokensBufPool.Get()
|
|
|
|
if v == nil {
|
|
|
|
return &tokensBuf{}
|
|
|
|
}
|
|
|
|
return v.(*tokensBuf)
|
|
|
|
}
|
|
|
|
|
|
|
|
func putTokensBuf(tb *tokensBuf) {
|
|
|
|
tb.reset()
|
|
|
|
tokensBufPool.Put(tb)
|
|
|
|
}
|
|
|
|
|
|
|
|
// tokensBufPool holds *tokensBuf values for reuse; it is filled by putTokensBuf
// and drained by getTokensBuf.
var tokensBufPool sync.Pool
|