From 09e81cb5aa419f35d9ee680c2474148130f0333e Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 13 May 2024 15:59:25 +0200 Subject: [PATCH] wip --- docs/VictoriaLogs/LogsQL.md | 9 +- lib/logstorage/pipe_sort.go | 5 +- lib/logstorage/stats_uniq_values.go | 23 ---- lib/stringsutil/less_natural.go | 111 ++++++++++++++++++++ lib/stringsutil/less_natural_test.go | 90 ++++++++++++++++ lib/stringsutil/less_natural_timing_test.go | 29 +++++ 6 files changed, 239 insertions(+), 28 deletions(-) create mode 100644 lib/stringsutil/less_natural.go create mode 100644 lib/stringsutil/less_natural_test.go create mode 100644 lib/stringsutil/less_natural_timing_test.go diff --git a/docs/VictoriaLogs/LogsQL.md b/docs/VictoriaLogs/LogsQL.md index 05ee16bb9..40f6ce03d 100644 --- a/docs/VictoriaLogs/LogsQL.md +++ b/docs/VictoriaLogs/LogsQL.md @@ -1175,7 +1175,10 @@ See also: ### sort pipe -By default logs are selected in arbitrary order because of performance reasons. If logs must be sorted, then `| sort by (field1, ..., fieldN)` [pipe](#pipes) must be used. +By default logs are selected in arbitrary order because of performance reasons. If logs must be sorted, then `| sort by (field1, ..., fieldN)` [pipe](#pipes) can be used. +The returned logs are sorted by the given [fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) +using [natural sorting](https://en.wikipedia.org/wiki/Natural_sort_order). + For example, the following query returns logs for the last 5 minutes sorted by [`_stream`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#stream-fields) and then by [`_time`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field): @@ -1210,7 +1213,7 @@ See also: ### uniq pipe `| uniq ...` pipe allows returning only unique results over the selected logs. 
For example, the following LogsQL query -returns uniq values for `ip` [log field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) +returns unique values for `ip` [log field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) over logs for the last 5 minutes: ```logsql @@ -1536,7 +1539,7 @@ See also: `uniq_values(field1, ..., fieldN)` [stats pipe](#stats-pipe) returns the unique non-empty values across the mentioned [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). -The returned values are sorted and encoded in JSON array. +The returned values are encoded in a JSON array. The order of the returned values is arbitrary. For example, the following query returns unique non-empty values for the `ip` [field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) over logs for the last 5 minutes: diff --git a/lib/logstorage/pipe_sort.go b/lib/logstorage/pipe_sort.go index 961029c41..f35c80957 100644 --- a/lib/logstorage/pipe_sort.go +++ b/lib/logstorage/pipe_sort.go @@ -13,6 +13,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/stringsutil" ) // pipeSort processes '| sort ...' queries. 
@@ -639,9 +640,9 @@ func sortBlockLess(shardA *pipeSortProcessorShard, rowIdxA int, shardB *pipeSort continue } if isDesc { - return sB < sA + return stringsutil.LessNatural(sB, sA) } - return sA < sB + return stringsutil.LessNatural(sA, sB) } return false } diff --git a/lib/logstorage/stats_uniq_values.go b/lib/logstorage/stats_uniq_values.go index df0e561ae..d11e61387 100644 --- a/lib/logstorage/stats_uniq_values.go +++ b/lib/logstorage/stats_uniq_values.go @@ -202,12 +202,10 @@ func (sup *statsUniqValuesProcessor) finalizeStats() string { return "[]" } - // Sort unique items items := make([]string, 0, len(sup.m)) for k := range sup.m { items = append(items, k) } - slices.SortFunc(items, compareValues) if limit := sup.su.limit; limit > 0 && uint64(len(items)) > limit { items = items[:limit] @@ -242,27 +240,6 @@ func marshalJSONArray(items []string) string { return bytesutil.ToUnsafeString(b) } -func compareValues(a, b string) int { - fA, okA := tryParseFloat64(a) - fB, okB := tryParseFloat64(b) - if okA && okB { - if fA == fB { - return 0 - } - if fA < fB { - return -1 - } - return 1 - } - if okA { - return -1 - } - if okB { - return 1 - } - return strings.Compare(a, b) -} - func parseStatsUniqValues(lex *lexer) (*statsUniqValues, error) { fields, err := parseFieldNamesForStatsFunc(lex, "uniq_values") if err != nil { diff --git a/lib/stringsutil/less_natural.go b/lib/stringsutil/less_natural.go new file mode 100644 index 000000000..6e272ee8e --- /dev/null +++ b/lib/stringsutil/less_natural.go @@ -0,0 +1,111 @@ +package stringsutil + +import ( + "math" +) + +// LessNatural returns true if a is less than b using natural sort comparison. 
+// +// See https://en.wikipedia.org/wiki/Natural_sort_order +func LessNatural(a, b string) bool { + isReverse := false + for { + if len(a) > len(b) { + a, b = b, a + isReverse = !isReverse + } + + // Skip common prefix except of decimal digits + i := 0 + for i < len(a) { + cA := a[i] + cB := b[i] + + if cA >= '0' && cA <= '9' { + if cB >= '0' && cB <= '9' { + break + } + return !isReverse + } + if cB >= '0' && cB <= '9' { + return isReverse + } + if cA != cB { + // This should work properly for utf8 bytes in the middle of encoded unicode char, since: + // - utf8 bytes for multi-byte chars are bigger than decimal digit chars + // - sorting of utf8-encoded strings works properly thanks to utf8 properties + if isReverse { + return cB < cA + } + return cA < cB + } + + i++ + } + a = a[i:] + b = b[i:] + if len(a) == 0 { + if isReverse { + return false + } + return len(b) > 0 + } + + // Collect digit prefixes for a and b and then compare them. + + iA := 1 + nA := uint64(a[0] - '0') + for iA < len(a) { + c := a[iA] + if c < '0' || c > '9' { + break + } + if nA > (math.MaxUint64-9)/10 { + // Too big integer. Fall back to string comparison + if isReverse { + return b < a + } + return a < b + } + nA *= 10 + nA += uint64(c - '0') + iA++ + } + + iB := 1 + nB := uint64(b[0] - '0') + for iB < len(b) { + c := b[iB] + if c < '0' || c > '9' { + break + } + if nB > (math.MaxUint64-9)/10 { + // Too big integer. 
Fall back to string comparison + if isReverse { + return b < a + } + return a < b + } + nB *= 10 + nB += uint64(c - '0') + iB++ + } + + if nA != nB { + if isReverse { + return nB < nA + } + return nA < nB + } + + if iA != iB { + if isReverse { + return iB < iA + } + return iA < iB + } + + a = a[iA:] + b = b[iB:] + } +} diff --git a/lib/stringsutil/less_natural_test.go b/lib/stringsutil/less_natural_test.go new file mode 100644 index 000000000..1de93e2e9 --- /dev/null +++ b/lib/stringsutil/less_natural_test.go @@ -0,0 +1,90 @@ +package stringsutil + +import ( + "testing" +) + +func TestLessNatural(t *testing.T) { + f := func(a, b string, resultExpected bool) { + t.Helper() + + result := LessNatural(a, b) + if result != resultExpected { + t.Fatalf("unexpected result for LessNatural(%q, %q); got %v; want %v", a, b, result, resultExpected) + } + } + + // comparison with empty string + f("", "", false) + f("", "foo", true) + f("foo", "", false) + f("", "123", true) + f("123", "", false) + + // identical values + f("foo", "foo", false) + f("123", "123", false) + f("foo123", "foo123", false) + f("123foo", "123foo", false) + f("000", "000", false) + f("00123", "00123", false) + f("00foo", "00foo", false) + f("abc00foo0123", "abc00foo0123", false) + + // identical values with different number of zeroes in front of them + f("00123", "0123", false) + f("0123", "00123", true) + + // numeric comparsion + f("123", "99", false) + f("99", "123", true) + + // floating-point comparsion (works unexpectedly - this is OK for natural sort order) + f("1.23", "1.123", true) + f("1.123", "1.23", false) + + // non-numeric comparison + f("foo", "bar", false) + f("fo", "bar", false) + f("bar", "foo", true) + f("bar", "fo", true) + + // comparison with common non-numeric prefix + f("abc_foo", "abc_bar", false) + f("abc_bar", "abc_foo", true) + f("abc_foo", "abc_", false) + f("abc_", "abc_foo", true) + f("abc_123", "abc_foo", true) + f("abc_foo", "abc_123", false) + + // comparison with common 
numeric prefix + f("123foo", "123bar", false) + f("123bar", "123foo", true) + f("123", "123bar", true) + f("123bar", "123", false) + f("123_456", "123_78", false) + f("123_78", "123_456", true) + + // too big integers - fall back to string order + f("1234567890123456789012345", "1234567890123456789012345", false) + f("1234567890123456789012345", "123456789012345678901234", false) + f("123456789012345678901234", "1234567890123456789012345", true) + f("193456789012345678901234", "1234567890123456789012345", false) + f("123456789012345678901234", "1934567890123456789012345", true) + f("1934", "1234567890123456789012345", false) + f("1234567890123456789012345", "1934", true) + + // integers with many zeroes in front + f("00000000000000000000000000123", "0000000000000000000000000045", false) + f("0000000000000000000000000045", "00000000000000000000000000123", true) + + // unicode strings + f("бвг", "мирг", true) + f("мирг", "бвг", false) + f("abcde", "мирг", true) + f("мирг", "abcde", false) + f("123", "мирг", true) + f("мирг", "123", false) + f("12345", "мирг", true) + f("мирг", "12345", false) +} diff --git a/lib/stringsutil/less_natural_timing_test.go b/lib/stringsutil/less_natural_timing_test.go new file mode 100644 index 000000000..bbefe66ca --- /dev/null +++ b/lib/stringsutil/less_natural_timing_test.go @@ -0,0 +1,29 @@ +package stringsutil + +import ( + "testing" +) + +func BenchmarkLessNatural(b *testing.B) { + b.Run("distinct_string_prefixes", func(b *testing.B) { + benchmarkLessNatural(b, []string{ + "aaa", "bbb", "ccc", "ddd", "eee", "fff", + }) + }) +} + +func benchmarkLessNatural(b *testing.B, a []string) { + b.ReportAllocs() + b.SetBytes(int64(len(a) - 1)) + b.RunParallel(func(pb *testing.PB) { + n := uint64(0) + for pb.Next() { + for i := 1; i < len(a); i++ { + if LessNatural(a[i-1], a[i]) { + n++ + } + } + } + GlobalSink.Add(n) + }) +}