mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-01 14:47:38 +00:00
181 lines
3.2 KiB
Go
181 lines
3.2 KiB
Go
|
package logstorage
|
||
|
|
||
|
import (
|
||
|
"sync"
|
||
|
|
||
|
"github.com/cespare/xxhash/v2"
|
||
|
|
||
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||
|
)
|
||
|
|
||
|
// tokenizeHashes extracts word tokens from a, hashes them, appends hashes to dst and returns the result.
|
||
|
//
|
||
|
// The returned hashes can be used for building bloom filters.
|
||
|
func tokenizeHashes(dst []uint64, a []string) []uint64 {
|
||
|
t := getHashTokenizer()
|
||
|
for i, s := range a {
|
||
|
if i > 0 && s == a[i-1] {
|
||
|
// This string has been already tokenized
|
||
|
continue
|
||
|
}
|
||
|
dst = t.tokenizeString(dst, s)
|
||
|
}
|
||
|
putHashTokenizer(t)
|
||
|
|
||
|
return dst
|
||
|
}
|
||
|
|
||
|
const hashTokenizerBucketsCount = 1024
|
||
|
|
||
|
type hashTokenizer struct {
|
||
|
buckets [hashTokenizerBucketsCount]hashTokenizerBucket
|
||
|
bm bitmap
|
||
|
}
|
||
|
|
||
|
type hashTokenizerBucket struct {
|
||
|
v uint64
|
||
|
overflow []uint64
|
||
|
}
|
||
|
|
||
|
func (b *hashTokenizerBucket) reset() {
|
||
|
// do not spend CPU time on clearing v and b.overflow items,
|
||
|
// since they'll be overwritten with new items.
|
||
|
b.overflow = b.overflow[:0]
|
||
|
}
|
||
|
|
||
|
func newHashTokenizer() *hashTokenizer {
|
||
|
var t hashTokenizer
|
||
|
t.bm.init(len(t.buckets))
|
||
|
return &t
|
||
|
}
|
||
|
|
||
|
func (t *hashTokenizer) reset() {
|
||
|
if t.bm.onesCount() <= len(t.buckets)/4 {
|
||
|
t.bm.forEachSetBit(func(idx int) bool {
|
||
|
t.buckets[idx].reset()
|
||
|
return false
|
||
|
})
|
||
|
} else {
|
||
|
buckets := t.buckets[:]
|
||
|
for i := range buckets {
|
||
|
buckets[i].reset()
|
||
|
}
|
||
|
t.bm.init(len(t.buckets))
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
|
||
|
if !isASCII(s) {
|
||
|
// Slow path - s contains unicode chars
|
||
|
return t.tokenizeStringUnicode(dst, s)
|
||
|
}
|
||
|
|
||
|
// Fast path for ASCII s
|
||
|
i := 0
|
||
|
for i < len(s) {
|
||
|
// Search for the next token.
|
||
|
start := len(s)
|
||
|
for i < len(s) {
|
||
|
if !isTokenChar(s[i]) {
|
||
|
i++
|
||
|
continue
|
||
|
}
|
||
|
start = i
|
||
|
i++
|
||
|
break
|
||
|
}
|
||
|
// Search for the end of the token.
|
||
|
end := len(s)
|
||
|
for i < len(s) {
|
||
|
if isTokenChar(s[i]) {
|
||
|
i++
|
||
|
continue
|
||
|
}
|
||
|
end = i
|
||
|
i++
|
||
|
break
|
||
|
}
|
||
|
if end <= start {
|
||
|
break
|
||
|
}
|
||
|
|
||
|
// Register the token.
|
||
|
token := s[start:end]
|
||
|
if h, ok := t.addToken(token); ok {
|
||
|
dst = append(dst, h)
|
||
|
}
|
||
|
}
|
||
|
return dst
|
||
|
}
|
||
|
|
||
|
func (t *hashTokenizer) tokenizeStringUnicode(dst []uint64, s string) []uint64 {
|
||
|
for len(s) > 0 {
|
||
|
// Search for the next token.
|
||
|
n := len(s)
|
||
|
for offset, r := range s {
|
||
|
if isTokenRune(r) {
|
||
|
n = offset
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
s = s[n:]
|
||
|
// Search for the end of the token.
|
||
|
n = len(s)
|
||
|
for offset, r := range s {
|
||
|
if !isTokenRune(r) {
|
||
|
n = offset
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
if n == 0 {
|
||
|
break
|
||
|
}
|
||
|
|
||
|
// Register the token
|
||
|
token := s[:n]
|
||
|
s = s[n:]
|
||
|
if h, ok := t.addToken(token); ok {
|
||
|
dst = append(dst, h)
|
||
|
}
|
||
|
}
|
||
|
return dst
|
||
|
}
|
||
|
|
||
|
func (t *hashTokenizer) addToken(token string) (uint64, bool) {
|
||
|
h := xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
|
||
|
idx := int(h % uint64(len(t.buckets)))
|
||
|
|
||
|
b := &t.buckets[idx]
|
||
|
if !t.bm.isSetBit(idx) {
|
||
|
b.v = h
|
||
|
t.bm.setBit(idx)
|
||
|
return h, true
|
||
|
}
|
||
|
|
||
|
if b.v == h {
|
||
|
return h, false
|
||
|
}
|
||
|
for _, v := range b.overflow {
|
||
|
if v == h {
|
||
|
return h, false
|
||
|
}
|
||
|
}
|
||
|
b.overflow = append(b.overflow, h)
|
||
|
return h, true
|
||
|
}
|
||
|
|
||
|
func getHashTokenizer() *hashTokenizer {
|
||
|
v := hashTokenizerPool.Get()
|
||
|
if v == nil {
|
||
|
return newHashTokenizer()
|
||
|
}
|
||
|
return v.(*hashTokenizer)
|
||
|
}
|
||
|
|
||
|
func putHashTokenizer(t *hashTokenizer) {
|
||
|
t.reset()
|
||
|
hashTokenizerPool.Put(t)
|
||
|
}
|
||
|
|
||
|
var hashTokenizerPool sync.Pool
|