This commit is contained in:
Aliaksandr Valialkin 2024-05-13 15:59:25 +02:00
parent ecd51e48ec
commit 09e81cb5aa
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
6 changed files with 239 additions and 28 deletions

View file

@ -1175,7 +1175,10 @@ See also:
### sort pipe
By default logs are selected in arbitrary order because of performance reasons. If logs must be sorted, then `| sort by (field1, ..., fieldN)` [pipe](#pipes) must be used.
By default logs are selected in arbitrary order for performance reasons. If logs must be sorted, then the `| sort by (field1, ..., fieldN)` [pipe](#pipes) can be used.
The returned logs are sorted by the given [fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model)
using [natural sorting](https://en.wikipedia.org/wiki/Natural_sort_order).
For example, the following query returns logs for the last 5 minutes sorted by [`_stream`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#stream-fields)
and then by [`_time`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field):
@ -1210,7 +1213,7 @@ See also:
### uniq pipe
`| uniq ...` pipe allows returning only unique results over the selected logs. For example, the following LogsQL query
returns uniq values for `ip` [log field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model)
returns unique values for `ip` [log field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model)
over logs for the last 5 minutes:
```logsql
@ -1536,7 +1539,7 @@ See also:
`uniq_values(field1, ..., fieldN)` [stats pipe](#stats-pipe) returns the unique non-empty values across
the mentioned [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model).
The returned values are sorted and encoded in JSON array.
The returned values are encoded as a JSON array. The order of the returned values is arbitrary.
For example, the following query returns unique non-empty values for the `ip` [field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model)
over logs for the last 5 minutes:

View file

@ -13,6 +13,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/stringsutil"
)
// pipeSort processes '| sort ...' queries.
@ -639,9 +640,9 @@ func sortBlockLess(shardA *pipeSortProcessorShard, rowIdxA int, shardB *pipeSort
continue
}
if isDesc {
return sB < sA
return stringsutil.LessNatural(sB, sA)
}
return sA < sB
return stringsutil.LessNatural(sA, sB)
}
return false
}

View file

@ -202,12 +202,10 @@ func (sup *statsUniqValuesProcessor) finalizeStats() string {
return "[]"
}
// Sort unique items
items := make([]string, 0, len(sup.m))
for k := range sup.m {
items = append(items, k)
}
slices.SortFunc(items, compareValues)
if limit := sup.su.limit; limit > 0 && uint64(len(items)) > limit {
items = items[:limit]
@ -242,27 +240,6 @@ func marshalJSONArray(items []string) string {
return bytesutil.ToUnsafeString(b)
}
// compareValues is a three-way comparator for a and b, returning -1, 0 or 1.
//
// Both values are passed through tryParseFloat64 (defined elsewhere in the package):
//   - when both parse, the parsed numbers decide the order;
//   - when only one parses, that one sorts first;
//   - when neither parses, the result of strings.Compare(a, b) is returned.
func compareValues(a, b string) int {
	numA, isNumA := tryParseFloat64(a)
	numB, isNumB := tryParseFloat64(b)

	switch {
	case isNumA && isNumB:
		switch {
		case numA == numB:
			return 0
		case numA < numB:
			return -1
		default:
			return 1
		}
	case isNumA:
		// Only a parsed as a number, so it goes first.
		return -1
	case isNumB:
		// Only b parsed as a number, so it goes first.
		return 1
	default:
		return strings.Compare(a, b)
	}
}
func parseStatsUniqValues(lex *lexer) (*statsUniqValues, error) {
fields, err := parseFieldNamesForStatsFunc(lex, "uniq_values")
if err != nil {

View file

@ -0,0 +1,111 @@
package stringsutil
import (
"math"
)
// LessNatural reports whether a sorts before b in natural sort order.
//
// The strings are compared byte by byte, except that runs of decimal digits
// are compared by their numeric value. A decimal digit sorts before any
// non-digit byte. If two digit runs have the same numeric value, the run
// with fewer digits (i.e. fewer leading zeroes) sorts first.
//
// Digit runs that do not fit into uint64 are not compared numerically;
// the not-yet-compared tails of both strings are compared as plain strings instead.
//
// NOTE(review): because of this fallback the order is not transitive when huge
// and ordinary integers are mixed, e.g. "10" < "5000000000000000000000000" < "9" < "10".
// Confirm this is acceptable for the sort callers.
//
// See https://en.wikipedia.org/wiki/Natural_sort_order
func LessNatural(a, b string) bool {
	// swapped is true while a and b are exchanged relative to the caller's
	// argument order, so every result below is mapped back through it.
	swapped := false

	for {
		// Keep the shorter string in a, so b[pos] is always in bounds below.
		if len(b) < len(a) {
			a, b = b, a
			swapped = !swapped
		}

		// Walk over the common prefix and stop as soon as both strings
		// have a decimal digit at the same position.
		pos := 0
		for ; pos < len(a); pos++ {
			x, y := a[pos], b[pos]
			xIsDigit := isDecimalDigit(x)
			yIsDigit := isDecimalDigit(y)
			if xIsDigit && yIsDigit {
				break
			}
			if xIsDigit {
				// A digit goes before a non-digit.
				return !swapped
			}
			if yIsDigit {
				return swapped
			}
			if x != y {
				// Plain byte comparison is fine for multi-byte utf8 chars too:
				// their bytes are bigger than decimal digit chars, and utf8-encoded
				// strings keep their order under byte-wise comparison.
				return (x < y) != swapped
			}
		}
		a, b = a[pos:], b[pos:]

		if a == "" {
			// The shorter string is exhausted. It is smaller only if the other
			// one still has bytes left and the arguments were not exchanged.
			return !swapped && b != ""
		}

		// Both a and b start with a decimal digit here. Parse both digit runs.
		numA, endA, okA := scanDigitRun(a)
		if !okA {
			return lessStringsOriented(a, b, swapped)
		}
		numB, endB, okB := scanDigitRun(b)
		if !okB {
			return lessStringsOriented(a, b, swapped)
		}

		if numA != numB {
			return (numA < numB) != swapped
		}
		if endA != endB {
			// Same value, different number of leading zeroes.
			return (endA < endB) != swapped
		}

		// Identical digit runs - continue with the rest of the strings.
		a, b = a[endA:], b[endB:]
	}
}

// isDecimalDigit reports whether c is an ASCII decimal digit.
func isDecimalDigit(c byte) bool {
	return c >= '0' && c <= '9'
}

// scanDigitRun parses the run of decimal digits at the start of s.
//
// s must start with a decimal digit. It returns the parsed value and the length
// of the run. ok is false if the run is too long to be safely accumulated in uint64.
func scanDigitRun(s string) (n uint64, end int, ok bool) {
	n = uint64(s[0] - '0')
	end = 1
	for end < len(s) && isDecimalDigit(s[end]) {
		if n > (math.MaxUint64-9)/10 {
			// Appending one more digit could overflow uint64.
			return 0, 0, false
		}
		n = n*10 + uint64(s[end]-'0')
		end++
	}
	return n, end, true
}

// lessStringsOriented compares a and b as plain strings,
// taking into account whether they were exchanged.
func lessStringsOriented(a, b string, swapped bool) bool {
	if swapped {
		return b < a
	}
	return a < b
}

View file

@ -0,0 +1,90 @@
package stringsutil
import (
"testing"
)
// TestLessNatural checks LessNatural against a fixed table of (a, b) pairs.
//
// The cases are verified in order; the test stops at the first mismatch.
func TestLessNatural(t *testing.T) {
	type testCase struct {
		a, b           string
		resultExpected bool
	}

	testCases := []testCase{
		// comparison with empty string
		{"", "", false},
		{"", "foo", true},
		{"foo", "", false},
		{"", "123", true},
		{"123", "", false},

		// identical values
		{"foo", "foo", false},
		{"123", "123", false},
		{"foo123", "foo123", false},
		{"123foo", "123foo", false},
		{"000", "000", false},
		{"00123", "00123", false},
		{"00foo", "00foo", false},
		{"abc00foo0123", "abc00foo0123", false},

		// identical numeric values with different number of leading zeroes
		{"00123", "0123", false},
		{"0123", "00123", true},

		// numeric comparison
		{"123", "99", false},
		{"99", "123", true},

		// floating-point comparison (the fractional part is compared as an integer -
		// this is expected for natural sort order)
		{"1.23", "1.123", true},
		{"1.123", "1.23", false},

		// non-numeric comparison
		{"foo", "bar", false},
		{"fo", "bar", false},
		{"bar", "foo", true},
		{"bar", "fo", true},

		// comparison with common non-numeric prefix
		{"abc_foo", "abc_bar", false},
		{"abc_bar", "abc_foo", true},
		{"abc_foo", "abc_", false},
		{"abc_", "abc_foo", true},
		{"abc_123", "abc_foo", true},
		{"abc_foo", "abc_123", false},

		// comparison with common numeric prefix
		{"123foo", "123bar", false},
		{"123bar", "123foo", true},
		{"123", "123bar", true},
		{"123bar", "123", false},
		{"123_456", "123_78", false},
		{"123_78", "123_456", true},

		// too big integers - fall back to string order
		{"1234567890123456789012345", "1234567890123456789012345", false},
		{"1234567890123456789012345", "123456789012345678901234", false},
		{"123456789012345678901234", "1234567890123456789012345", true},
		{"193456789012345678901234", "1234567890123456789012345", false},
		{"123456789012345678901234", "1934567890123456789012345", true},
		{"1934", "1234567890123456789012345", false},
		{"1234567890123456789012345", "1934", true},

		// integers with many leading zeroes
		{"00000000000000000000000000123", "0000000000000000000000000045", false},
		{"0000000000000000000000000045", "00000000000000000000000000123", true},

		// unicode strings
		{"бвг", "мирг", true},
		{"мирг", "бвг", false},
		{"abcde", "мирг", true},
		{"мирг", "abcde", false},
		{"123", "мирг", true},
		{"мирг", "123", false},
		{"12345", "мирг", true},
		{"мирг", "12345", false},
	}

	for _, tc := range testCases {
		result := LessNatural(tc.a, tc.b)
		if result != tc.resultExpected {
			t.Fatalf("unexpected result for LessNatural(%q, %q); got %v; want %v", tc.a, tc.b, result, tc.resultExpected)
		}
	}
}

View file

@ -0,0 +1,29 @@
package stringsutil
import (
"testing"
)
// BenchmarkLessNatural runs LessNatural benchmarks over predefined sets of strings.
func BenchmarkLessNatural(b *testing.B) {
	// Every string differs from its neighbours in the very first byte.
	distinctPrefixes := []string{
		"aaa", "bbb", "ccc", "ddd", "eee", "fff",
	}

	b.Run("distinct_string_prefixes", func(b *testing.B) {
		benchmarkLessNatural(b, distinctPrefixes)
	})
}
// benchmarkLessNatural calls LessNatural on every pair of adjacent items in a
// from parallel goroutines started by b.RunParallel.
//
// The number of compared pairs per iteration (len(a)-1) is passed to b.SetBytes.
// The number of true results is added to GlobalSink, which is defined outside
// this file - presumably to keep the calls from being optimized away.
func benchmarkLessNatural(b *testing.B, a []string) {
	pairsPerIteration := len(a) - 1

	b.ReportAllocs()
	b.SetBytes(int64(pairsPerIteration))
	b.RunParallel(func(pb *testing.PB) {
		var lessCount uint64
		for pb.Next() {
			for j := 0; j+1 < len(a); j++ {
				if LessNatural(a[j], a[j+1]) {
					lessCount++
				}
			}
		}
		GlobalSink.Add(lessCount)
	})
}