From 6470eac7dc756c8bbcf7fd694e91bfa879f3da3a Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 10 Jun 2024 17:58:02 +0200 Subject: [PATCH] lib/bytesutil: reduce the number of memory allocations per each interned string in bytesutil.InternString() from 5 to 1 This should reduce GC overhead when tens of millions of strings are interned (for example, during stream deduplication of millions of active time series). --- lib/bytesutil/internstring.go | 164 ++++++++++++++++++++++++---------- 1 file changed, 117 insertions(+), 47 deletions(-) diff --git a/lib/bytesutil/internstring.go b/lib/bytesutil/internstring.go index e62445ec3..9c19c7840 100644 --- a/lib/bytesutil/internstring.go +++ b/lib/bytesutil/internstring.go @@ -19,6 +19,121 @@ var ( "See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache") ) +type internStringMap struct { + mu sync.Mutex + mutable map[string]string + mutableReads uint64 + + readonly atomic.Pointer[map[string]internStringMapEntry] + + nextCleanupTime atomic.Uint64 +} + +type internStringMapEntry struct { + deadline uint64 + s string +} + +func newInternStringMap() *internStringMap { + ism := &internStringMap{ + mutable: make(map[string]string), + } + readonly := make(map[string]internStringMapEntry) + ism.readonly.Store(&readonly) + ism.nextCleanupTime.Store(fasttime.UnixTimestamp() + 61) + return ism +} + +func (m *internStringMap) getReadonly() map[string]internStringMapEntry { + return *m.readonly.Load() +} + +func (m *internStringMap) intern(s string) string { + if *disableCache || len(s) > *internStringMaxLen { + return strings.Clone(s) + } + currentTime := fasttime.UnixTimestamp() + if currentTime >= m.nextCleanupTime.Load() { + m.nextCleanupTime.Store(currentTime + 61) + m.cleanup() + } + + readonly := m.getReadonly() + e, ok := readonly[s] + if ok { + // Fast path - the string has been found in readonly map + return e.s + } + + // Slower path - search for the string in mutable map + m.mu.Lock() + sInterned, ok := m.mutable[s] + if !ok { + // Verify whether the s has been already registered by concurrent goroutines in m.readonly + readonly = m.getReadonly() + e, ok = readonly[s] + if !ok { + // Slowest path - register the string in mutable map. + // Make a new copy for s in order to remove references from possible bigger string s refers to. + sInterned = strings.Clone(s) + m.mutable[sInterned] = sInterned + } else { + sInterned = e.s + } + } + m.mutableReads++ + if m.mutableReads > uint64(len(readonly)) { + m.migrateMutableToReadonlyLocked() + m.mutableReads = 0 + } + m.mu.Unlock() + + return sInterned +} + +func (m *internStringMap) migrateMutableToReadonlyLocked() { + readonly := m.getReadonly() + readonlyCopy := make(map[string]internStringMapEntry, len(readonly)+len(m.mutable)) + for k, e := range readonly { + readonlyCopy[k] = e + } + deadline := fasttime.UnixTimestamp() + uint64(cacheExpireDuration.Seconds()+0.5) + for k, s := range m.mutable { + readonlyCopy[k] = internStringMapEntry{ + s: s, + deadline: deadline, + } + } + m.mutable = make(map[string]string) + m.readonly.Store(&readonlyCopy) +} + +func (m *internStringMap) cleanup() { + m.mu.Lock() + defer m.mu.Unlock() + + readonly := m.getReadonly() + currentTime := fasttime.UnixTimestamp() + needCleanup := false + for _, e := range readonly { + if e.deadline <= currentTime { + needCleanup = true + break + } + } + if !needCleanup { + return + } + + readonlyCopy := make(map[string]internStringMapEntry, len(readonly)) + for k, e := range readonly { + if e.deadline > currentTime { + readonlyCopy[k] = e + } + } + m.readonly.Store(&readonlyCopy) +} + func isSkipCache(s string) bool { return *disableCache || len(s) > *internStringMaxLen } @@ -33,52 +148,7 @@ func InternBytes(b []byte) string { // // This may be needed for reducing the amounts of allocated memory. func InternString(s string) string { - if isSkipCache(s) { - // Make a new copy for s in order to remove references from possible bigger string s refers to. - // This also protects from cases when s points to unsafe string - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3227 - return strings.Clone(s) - } - - ct := fasttime.UnixTimestamp() - if v, ok := internStringsMap.Load(s); ok { - e := v.(*ismEntry) - if e.lastAccessTime.Load()+10 < ct { - // Reduce the frequency of e.lastAccessTime update to once per 10 seconds - // in order to improve the fast path speed on systems with many CPU cores. - e.lastAccessTime.Store(ct) - } - return e.s - } - // Make a new copy for s in order to remove references from possible bigger string s refers to. - sCopy := strings.Clone(s) - e := &ismEntry{ - s: sCopy, - } - e.lastAccessTime.Store(ct) - internStringsMap.Store(sCopy, e) - - if needCleanup(&internStringsMapLastCleanupTime, ct) { - // Perform a global cleanup for internStringsMap by removing items, which weren't accessed during the last 5 minutes. - m := &internStringsMap - deadline := ct - uint64(cacheExpireDuration.Seconds()) - m.Range(func(k, v interface{}) bool { - e := v.(*ismEntry) - if e.lastAccessTime.Load() < deadline { - m.Delete(k) - } - return true - }) - } - - return sCopy + return ism.intern(s) } -type ismEntry struct { - lastAccessTime atomic.Uint64 - s string -} - -var ( - internStringsMap sync.Map - internStringsMapLastCleanupTime atomic.Uint64 -) +var ism = newInternStringMap()