This commit is contained in:
Aliaksandr Valialkin 2024-05-09 21:10:40 +02:00
parent 6da2f28d36
commit 713172308e
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
2 changed files with 95 additions and 25 deletions

View file

@ -3,6 +3,7 @@ package logstorage
import (
"sync"
"unicode"
"unicode/utf8"
)
// tokenizeStrings extracts word tokens from a, appends them to dst and returns the result.
@ -31,37 +32,106 @@ func (t *tokenizer) reset() {
}
func (t *tokenizer) tokenizeString(dst []string, s string) []string {
if !isASCII(s) {
// Slow path - s contains unicode chars
return t.tokenizeStringUnicode(dst, s)
}
// Fast path for ASCII s
m := t.m
for len(s) > 0 {
i := 0
for i < len(s) {
// Search for the next token.
nextIdx := len(s)
for i, c := range s {
if isTokenRune(c) {
nextIdx = i
start := len(s)
for i < len(s) {
if !isTokenChar(s[i]) {
i++
continue
}
start = i
i++
break
}
// Search for the end of the token.
end := len(s)
for i < len(s) {
if isTokenChar(s[i]) {
i++
continue
}
s = s[nextIdx:]
// Search for the end of the token
nextIdx = len(s)
for i, c := range s {
if !isTokenRune(c) {
nextIdx = i
end = i
i++
break
}
if end <= start {
break
}
token := s[:nextIdx]
if len(token) > 0 {
if _, ok := m[token]; ok {
// Register the token.
token := s[start:end]
if _, ok := m[token]; !ok {
m[token] = struct{}{}
dst = append(dst, token)
}
}
s = s[nextIdx:]
return dst
}
// tokenizeStringUnicode is the rune-aware variant of the tokenizer.
//
// It decodes s rune by rune, treats every maximal run of runes accepted by
// isTokenRune as a token, appends each token not yet present in t.m to dst
// (recording it in t.m) and returns the extended dst. Tokens keep the order
// of their first occurrence and are sub-strings of s.
func (t *tokenizer) tokenizeStringUnicode(dst []string, s string) []string {
	seen := t.m
	pos := 0
	for pos < len(s) {
		r, width := utf8.DecodeRuneInString(s[pos:])
		if !isTokenRune(r) {
			// Separator rune - skip it and keep looking for a token start.
			pos += width
			continue
		}

		// The token starts at the current rune; scan forward to its end.
		begin := pos
		pos += width
		limit := len(s)
		for pos < len(s) {
			r, width = utf8.DecodeRuneInString(s[pos:])
			pos += width
			if !isTokenRune(r) {
				// The separator terminating the token has been consumed already.
				limit = pos - width
				break
			}
		}

		// Register the token unless it has been seen before.
		word := s[begin:limit]
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}
		dst = append(dst, word)
	}
	return dst
}
// isASCII reports whether s consists solely of bytes below utf8.RuneSelf,
// i.e. whether every char in s is a single-byte ASCII char.
// It returns true for an empty string.
func isASCII(s string) bool {
	for idx := 0; idx < len(s); idx++ {
		if b := s[idx]; b >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
// isTokenChar reports whether the byte c may be a part of a token:
// an ASCII letter, an ASCII decimal digit or an underscore.
func isTokenChar(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	default:
		return c == '_'
	}
}
// isTokenRune reports whether the rune c may be a part of a token:
// an underscore, a unicode letter or a unicode decimal digit.
func isTokenRune(c rune) bool {
	if c == '_' {
		return true
	}
	if unicode.IsLetter(c) {
		return true
	}
	return unicode.IsDigit(c)
}

View file

@ -17,13 +17,13 @@ func TestTokenizeStrings(t *testing.T) {
f(nil, nil)
f([]string{""}, nil)
f([]string{"foo"}, []string{"foo"})
f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"TaSte", "bar", "baz", "foo"})
f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"1234", "34", "AS", "f12", "теСТ"})
f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"foo", "bar", "baz", "TaSte"})
f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"теСТ", "1234", "f12", "34", "AS"})
f(strings.Split(`
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
`, "\n"), []string{"01", "1", "12181", "13", "22034", "28", "2812", "36020", "38", "43", "45", "48", "497806", "Apr", "CMD", "CPU0", "CRON",
"Core", "above", "clock", "command", "cpu", "debian", "dev", "events", "kernel", "localhost", "null", "online", "root",
"sa1", "temperature", "threshold", "throttled", "total", "v", "whoopsie"})
`, "\n"), []string{"Apr", "28", "13", "43", "38", "localhost", "whoopsie", "2812", "online", "45", "01", "CRON", "12181",
"root", "CMD", "command", "v", "debian", "sa1", "dev", "null", "1", "48", "kernel", "36020", "497806", "CPU0", "Core",
"temperature", "above", "threshold", "cpu", "clock", "throttled", "total", "events", "22034"})
}