From 713172308e866c99c533d5888740ff0193694852 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 9 May 2024 21:10:40 +0200 Subject: [PATCH] wip --- lib/logstorage/tokenizer.go | 110 +++++++++++++++++++++++++------ lib/logstorage/tokenizer_test.go | 10 +-- 2 files changed, 95 insertions(+), 25 deletions(-) diff --git a/lib/logstorage/tokenizer.go b/lib/logstorage/tokenizer.go index f957b2ed2..2e3ffff2a 100644 --- a/lib/logstorage/tokenizer.go +++ b/lib/logstorage/tokenizer.go @@ -3,6 +3,7 @@ package logstorage import ( "sync" "unicode" + "unicode/utf8" ) // tokenizeStrings extracts word tokens from a, appends them to dst and returns the result. @@ -31,37 +32,106 @@ func (t *tokenizer) reset() { } func (t *tokenizer) tokenizeString(dst []string, s string) []string { + if !isASCII(s) { + // Slow path - s contains unicode chars + return t.tokenizeStringUnicode(dst, s) + } + + // Fast path for ASCII s m := t.m - for len(s) > 0 { + i := 0 + for i < len(s) { // Search for the next token. - nextIdx := len(s) - for i, c := range s { - if isTokenRune(c) { - nextIdx = i - break + start := len(s) + for i < len(s) { + if !isTokenChar(s[i]) { + i++ + continue } + start = i + i++ + break } - s = s[nextIdx:] - // Search for the end of the token - nextIdx = len(s) - for i, c := range s { - if !isTokenRune(c) { - nextIdx = i - break + // Search for the end of the token. + end := len(s) + for i < len(s) { + if isTokenChar(s[i]) { + i++ + continue } + end = i + i++ + break } - token := s[:nextIdx] - if len(token) > 0 { - if _, ok := m[token]; ok { - m[token] = struct{}{} - dst = append(dst, token) - } + if end <= start { + break + } + + // Register the token. + token := s[start:end] + if _, ok := m[token]; !ok { + m[token] = struct{}{} + dst = append(dst, token) } - s = s[nextIdx:] } return dst } +func (t *tokenizer) tokenizeStringUnicode(dst []string, s string) []string { + m := t.m + i := 0 + for i < len(s) { + // Search for the next token. + start := len(s) + for i < len(s) { + r, size := utf8.DecodeRuneInString(s[i:]) + if !isTokenRune(r) { + i += size + continue + } + start = i + i += size + break + } + // Search for the end of the token. + end := len(s) + for i < len(s) { + r, size := utf8.DecodeRuneInString(s[i:]) + if isTokenRune(r) { + i += size + continue + } + end = i + i += size + break + } + if end <= start { + break + } + + // Register the token + token := s[start:end] + if _, ok := m[token]; !ok { + m[token] = struct{}{} + dst = append(dst, token) + } + } + return dst +} + +func isASCII(s string) bool { + for i := range s { + if s[i] >= utf8.RuneSelf { + return false + } + } + return true +} + +func isTokenChar(c byte) bool { + return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' || c == '_' +} + func isTokenRune(c rune) bool { return unicode.IsLetter(c) || unicode.IsDigit(c) || c == '_' } diff --git a/lib/logstorage/tokenizer_test.go b/lib/logstorage/tokenizer_test.go index 67ca9d0f6..37573752f 100644 --- a/lib/logstorage/tokenizer_test.go +++ b/lib/logstorage/tokenizer_test.go @@ -17,13 +17,13 @@ func TestTokenizeStrings(t *testing.T) { f(nil, nil) f([]string{""}, nil) f([]string{"foo"}, []string{"foo"}) - f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"TaSte", "bar", "baz", "foo"}) - f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"1234", "34", "AS", "f12", "теСТ"}) + f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"foo", "bar", "baz", "TaSte"}) + f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"теСТ", "1234", "f12", "34", "AS"}) f(strings.Split(` Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1) Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034) -`, "\n"), []string{"01", "1", "12181", "13", "22034", "28", "2812", "36020", "38", "43", "45", "48", "497806", "Apr", "CMD", "CPU0", "CRON", - "Core", "above", "clock", "command", "cpu", "debian", "dev", "events", "kernel", "localhost", "null", "online", "root", - "sa1", "temperature", "threshold", "throttled", "total", "v", "whoopsie"}) +`, "\n"), []string{"Apr", "28", "13", "43", "38", "localhost", "whoopsie", "2812", "online", "45", "01", "CRON", "12181", + "root", "CMD", "command", "v", "debian", "sa1", "dev", "null", "1", "48", "kernel", "36020", "497806", "CPU0", "Core", + "temperature", "above", "threshold", "cpu", "clock", "throttled", "total", "events", "22034"}) }