wip

2024-12-31 15:06:26 +00:00 · 2024-05-09 21:10:40 +02:00 · 2024-05-09 21:10:40 +02:00 · 713172308e
commit 713172308e
parent 6da2f28d36
2 changed files with 95 additions and 25 deletions
--- a/lib/logstorage/tokenizer.go
+++ b/lib/logstorage/tokenizer.go
@ -3,6 +3,7 @@ package logstorage
 import (
 	"sync"
 	"unicode"
 	"unicode/utf8"
 )
 // tokenizeStrings extracts word tokens from a, appends them to dst and returns the result.
@ -31,37 +32,106 @@ func (t *tokenizer) reset() {
 }
 func (t *tokenizer) tokenizeString(dst []string, s string) []string {
 	if !isASCII(s) {
 		// Slow path - s contains unicode chars
 		return t.tokenizeStringUnicode(dst, s)
 	}
 	// Fast path for ASCII s
 	m := t.m
-	for len(s) > 0 {
+	i := 0
 	for i < len(s) {
 		// Search for the next token.
-		nextIdx := len(s)
+		start := len(s)
-		for i, c := range s {
+		for i < len(s) {
-			if isTokenRune(c) {
+			if !isTokenChar(s[i]) {
-				nextIdx = i
+				i++
-				break
+				continue
 			}
 			start = i
 			i++
 			break
 		}
-		s = s[nextIdx:]
+		// Search for the end of the token.
-		// Search for the end of the token
+		end := len(s)
-		nextIdx = len(s)
+		for i < len(s) {
-		for i, c := range s {
+			if isTokenChar(s[i]) {
-			if !isTokenRune(c) {
+				i++
-				nextIdx = i
+				continue
 				break
 			}
 			end = i
 			i++
 			break
 		}
-		token := s[:nextIdx]
+		if end <= start {
-		if len(token) > 0 {
+			break
-			if _, ok := m[token]; ok {
+		}
-				m[token] = struct{}{}
+
-				dst = append(dst, token)
+		// Register the token.
-			}
+		token := s[start:end]
 		if _, ok := m[token]; !ok {
 			m[token] = struct{}{}
 			dst = append(dst, token)
 		}
 		s = s[nextIdx:]
 	}
 	return dst
 }
 // tokenizeStringUnicode is the rune-aware variant of tokenizeString.
 //
 // It decodes s rune by rune, treats every maximal run of runes accepted by
 // isTokenRune as a token, appends to dst the tokens missing from t.m,
 // registers them in t.m and returns the extended dst.
 func (t *tokenizer) tokenizeStringUnicode(dst []string, s string) []string {
 	seen := t.m
 	pos := 0
 	for pos < len(s) {
 		r, width := utf8.DecodeRuneInString(s[pos:])
 		pos += width
 		if !isTokenRune(r) {
 			// Separator rune - keep looking for the token start.
 			continue
 		}
 		// The rune just consumed opens a token; find where the token stops.
 		begin := pos - width
 		limit := len(s)
 		for pos < len(s) {
 			r, width = utf8.DecodeRuneInString(s[pos:])
 			if !isTokenRune(r) {
 				limit = pos
 				// Step over the separator that terminated the token.
 				pos += width
 				break
 			}
 			pos += width
 		}
 		// Emit the token unless it has already been seen.
 		word := s[begin:limit]
 		if _, dup := seen[word]; !dup {
 			seen[word] = struct{}{}
 			dst = append(dst, word)
 		}
 	}
 	return dst
 }
 // isASCII reports whether every byte of s is below utf8.RuneSelf,
 // i.e. whether s consists solely of single-byte ASCII characters.
 // It returns true for the empty string.
 func isASCII(s string) bool {
 	for pos := 0; pos < len(s); pos++ {
 		if s[pos] < utf8.RuneSelf {
 			continue
 		}
 		return false
 	}
 	return true
 }
 // isTokenChar reports whether the ASCII byte c may be part of a token:
 // a Latin letter, a decimal digit or an underscore.
 func isTokenChar(c byte) bool {
 	switch {
 	case 'a' <= c && c <= 'z':
 		return true
 	case 'A' <= c && c <= 'Z':
 		return true
 	case '0' <= c && c <= '9':
 		return true
 	default:
 		return c == '_'
 	}
 }
 // isTokenRune reports whether the rune c may be part of a token:
 // an underscore, a Unicode letter or a Unicode decimal digit.
 func isTokenRune(c rune) bool {
 	if c == '_' {
 		return true
 	}
 	if unicode.IsLetter(c) {
 		return true
 	}
 	return unicode.IsDigit(c)
 }
--- a/lib/logstorage/tokenizer_test.go
+++ b/lib/logstorage/tokenizer_test.go
@ -17,13 +17,13 @@ func TestTokenizeStrings(t *testing.T) {
 	f(nil, nil)
 	f([]string{""}, nil)
 	f([]string{"foo"}, []string{"foo"})
-	f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"TaSte", "bar", "baz", "foo"})
+	f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"foo", "bar", "baz", "TaSte"})
-	f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"1234", "34", "AS", "f12", "теСТ"})
+	f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"теСТ", "1234", "f12", "34", "AS"})
 	f(strings.Split(`
 Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
 Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
 Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
-`, "\n"), []string{"01", "1", "12181", "13", "22034", "28", "2812", "36020", "38", "43", "45", "48", "497806", "Apr", "CMD", "CPU0", "CRON",
+`, "\n"), []string{"Apr", "28", "13", "43", "38", "localhost", "whoopsie", "2812", "online", "45", "01", "CRON", "12181",
-		"Core", "above", "clock", "command", "cpu", "debian", "dev", "events", "kernel", "localhost", "null", "online", "root",
+	"root", "CMD", "command", "v", "debian", "sa1", "dev", "null", "1", "48", "kernel", "36020", "497806", "CPU0", "Core",
-		"sa1", "temperature", "threshold", "throttled", "total", "v", "whoopsie"})
+	"temperature", "above", "threshold", "cpu", "clock", "throttled", "total", "events", "22034"})
 }