// ---- lib/stringsutil/less_natural.go ----

// LessNatural reports whether a sorts before b in natural sort order.
//
// Both strings are scanned from left to right and compared piece by piece:
//
//   - non-digit bytes are compared by byte value;
//   - a decimal digit sorts before any non-digit byte;
//   - runs of decimal digits are compared by numeric value, so "99" sorts
//     before "123";
//   - digit runs with equal numeric value are ordered by their length, so
//     "0123" sorts before "00123";
//   - a string that is a proper prefix of the other one sorts first.
//
// Digit runs are compared as text rather than parsed into a fixed-size
// integer, so integers of any length are compared numerically and the result
// is a transitive ordering that is safe to pass to sorting functions.
//
// See https://en.wikipedia.org/wiki/Natural_sort_order
func LessNatural(a, b string) bool {
	for {
		// Skip the common prefix up to the first position where both strings
		// contain a decimal digit.
		i := 0
		for i < len(a) && i < len(b) {
			cA, cB := a[i], b[i]
			digitA, digitB := isDecimalDigit(cA), isDecimalDigit(cB)
			if digitA && digitB {
				// Both strings continue with a number; it is compared below.
				break
			}
			if digitA != digitB {
				// A digit sorts before any non-digit byte.
				return digitA
			}
			if cA != cB {
				// Byte-wise comparison also works in the middle of a multi-byte
				// utf8 sequence: such bytes are bigger than decimal digits, and
				// byte order of utf8-encoded strings matches code point order.
				return cA < cB
			}
			i++
		}
		a, b = a[i:], b[i:]
		if len(a) == 0 || len(b) == 0 {
			// At least one string is exhausted. a is smaller only when it is
			// the one that ended first.
			return len(a) < len(b)
		}

		// Both a and b start with a decimal digit here.
		nA := decimalPrefixLen(a)
		nB := decimalPrefixLen(b)
		if cmp := compareDecimalRuns(a[:nA], b[:nB]); cmp != 0 {
			return cmp < 0
		}

		// The numbers are identical including leading zeroes.
		// Continue with the remaining tails.
		a, b = a[nA:], b[nB:]
	}
}

// isDecimalDigit reports whether c is an ASCII decimal digit.
func isDecimalDigit(c byte) bool {
	return c >= '0' && c <= '9'
}

// decimalPrefixLen returns the length of the longest prefix of s
// consisting solely of ASCII decimal digits.
func decimalPrefixLen(s string) int {
	n := 0
	for n < len(s) && isDecimalDigit(s[n]) {
		n++
	}
	return n
}

// compareDecimalRuns compares decimal digit strings x and y.
//
// It returns a negative value if x sorts before y, a positive value if x sorts
// after y and zero if x and y are identical. The numeric value is compared
// first; if the values are equal, the string with fewer leading zeroes
// sorts first.
func compareDecimalRuns(x, y string) int {
	// Strip leading zeroes, so the number of remaining digits
	// reflects the magnitude of the number.
	sx := x
	for len(sx) > 0 && sx[0] == '0' {
		sx = sx[1:]
	}
	sy := y
	for len(sy) > 0 && sy[0] == '0' {
		sy = sy[1:]
	}

	if len(sx) != len(sy) {
		// More significant digits means bigger number.
		return len(sx) - len(sy)
	}
	if sx != sy {
		// For digit strings of the same length the lexicographic order
		// equals the numeric order.
		if sx < sy {
			return -1
		}
		return 1
	}
	// Equal numeric values: order by the number of leading zeroes.
	return len(x) - len(y)
}

// ---- lib/stringsutil/less_natural_test.go ----

// TestLessNatural verifies LessNatural on a table of string pairs.
func TestLessNatural(t *testing.T) {
	f := func(a, b string, resultExpected bool) {
		t.Helper()

		result := LessNatural(a, b)
		if result != resultExpected {
			t.Fatalf("unexpected result for LessNatural(%q, %q); got %v; want %v", a, b, result, resultExpected)
		}
	}

	// comparison with empty string
	f("", "", false)
	f("", "foo", true)
	f("foo", "", false)
	f("", "123", true)
	f("123", "", false)

	// identical values
	f("foo", "foo", false)
	f("123", "123", false)
	f("foo123", "foo123", false)
	f("123foo", "123foo", false)
	f("000", "000", false)
	f("00123", "00123", false)
	f("00foo", "00foo", false)
	f("abc00foo0123", "abc00foo0123", false)

	// identical values with different number of zeroes in front of them
	f("00123", "0123", false)
	f("0123", "00123", true)

	// numeric comparison
	f("123", "99", false)
	f("99", "123", true)

	// floating-point comparison (works unexpectedly - this is OK for natural sort order)
	f("1.23", "1.123", true)
	f("1.123", "1.23", false)

	// non-numeric comparison
	f("foo", "bar", false)
	f("fo", "bar", false)
	f("bar", "foo", true)
	f("bar", "fo", true)

	// comparison with common non-numeric prefix
	f("abc_foo", "abc_bar", false)
	f("abc_bar", "abc_foo", true)
	f("abc_foo", "abc_", false)
	f("abc_", "abc_foo", true)
	f("abc_123", "abc_foo", true)
	f("abc_foo", "abc_123", false)

	// comparison with common numeric prefix
	f("123foo", "123bar", false)
	f("123bar", "123foo", true)
	f("123", "123bar", true)
	f("123bar", "123", false)
	f("123_456", "123_78", false)
	f("123_78", "123_456", true)

	// integers that do not fit into uint64 are still compared numerically
	f("1234567890123456789012345", "1234567890123456789012345", false)
	f("1234567890123456789012345", "123456789012345678901234", false)
	f("123456789012345678901234", "1234567890123456789012345", true)
	f("193456789012345678901234", "1234567890123456789012345", true)
	f("123456789012345678901234", "1934567890123456789012345", true)
	f("1934", "1234567890123456789012345", true)
	f("1234567890123456789012345", "1934", false)
	f("99999999999999999999999998", "99999999999999999999999999", true)
	f("99999999999999999999999999", "99999999999999999999999998", false)

	// the order must be transitive for big integers
	f("9", "10", true)
	f("10", "1000000000000000000000000", true)
	f("9", "1000000000000000000000000", true)
	f("1000000000000000000000000", "9", false)

	// integers with many zeroes in front
	f("00000000000000000000000000123", "0000000000000000000000000045", false)
	f("0000000000000000000000000045", "00000000000000000000000000123", true)

	// unicode strings
	f("бвг", "мирг", true)
	f("мирг", "бвг", false)
	f("abcde", "мирг", true)
	f("мирг", "abcde", false)
	f("123", "мирг", true)
	f("мирг", "123", false)
	f("12345", "мирг", true)
	f("мирг", "12345", false)
}

// ---- lib/stringsutil/less_natural_timing_test.go ----

// lessNaturalBenchSink accumulates the per-goroutine counters from
// benchmarkLessNatural, so the results of LessNatural calls are observable
// and the calls cannot be dropped as dead code.
var lessNaturalBenchSink uint64

// BenchmarkLessNatural measures LessNatural on inputs with different shapes:
// distinct and common prefixes made of letters and of digits.
func BenchmarkLessNatural(b *testing.B) {
	b.Run("distinct_string_prefixes", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"aaa", "bbb", "ccc", "ddd", "eee", "fff", "g", "hh", "kkk", "ooo", "ppppp", "wwww", "zzz", "qqq",
		})
	})
	b.Run("distinct_numeric_values", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"111", "222", "333", "44", "5555", "666", "7", "88", "999", "000", "123", "452", "34", "234",
		})
	})
	b.Run("common_string_prefixes_distinct_string_suffixes", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"XXXaaa", "XXXbbb", "XXXccc", "XXXddd", "XXXeee", "XXXfff", "XXXg", "XXXhh", "XXXkkk", "XXXooo", "XXXppppp", "XXXwwww", "XXXzzz", "XXXqqq",
		})
	})
	b.Run("common_string_prefixes_distinct_numeric_suffixes", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"XXX111", "XXX222", "XXX33", "XXX4", "XXX555", "XXX666", "XXX7", "XXX88", "XXX999", "XXX000", "XXX12345", "XXX3211", "XXX873", "XXX98",
		})
	})
	b.Run("common_string_values", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX", "XXXXX",
		})
	})
	b.Run("common_numeric_prefixes", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"111aaa", "111bbb", "111ccc", "111ddd", "111eee", "111fff", "111g", "111hh", "111kkk", "111ooo", "111ppppp", "111wwww", "111zzz", "111qqq",
		})
	})
	b.Run("common_numeric_values", func(b *testing.B) {
		benchmarkLessNatural(b, []string{
			"11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111", "11111",
		})
	})
}

// benchmarkLessNatural calls LessNatural for every adjacent pair of a
// in parallel benchmark goroutines.
func benchmarkLessNatural(b *testing.B, a []string) {
	b.ReportAllocs()
	// A single iteration performs len(a)-1 comparisons; this count is reported
	// as the number of processed bytes per iteration.
	b.SetBytes(int64(len(a) - 1))
	b.RunParallel(func(pb *testing.PB) {
		n := uint64(0)
		for pb.Next() {
			for i := 1; i < len(a); i++ {
				if LessNatural(a[i-1], a[i]) {
					n++
				}
			}
		}
		// Publish the counter once per goroutine.
		atomic.AddUint64(&lessNaturalBenchSink, n)
	})
}