mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
8a23d08c21
* lib/logstorage: switch to read-only mode when running out of disk space Added support of `--storage.minFreeDiskSpaceBytes` command-line flag to allow graceful handling of running out of disk space at `--storageDataPath`. See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4737 Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com> * lib/logstorage: fix error handling logic during merge Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com> * lib/logstorage: fix log level Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com> --------- Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com> Co-authored-by: Nikolay <nik@victoriametrics.com>
591 lines
16 KiB
Go
591 lines
16 KiB
Go
package logstorage
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
|
|
)
|
|
|
|
// StorageStats represents stats for the storage. It may be obtained by calling Storage.UpdateStats().
|
|
type StorageStats struct {
|
|
// RowsDroppedTooBigTimestamp is the number of rows dropped during data ingestion because their timestamp is smaller than the minimum allowed
|
|
RowsDroppedTooBigTimestamp uint64
|
|
|
|
// RowsDroppedTooSmallTimestamp is the number of rows dropped during data ingestion because their timestamp is bigger than the maximum allowed
|
|
RowsDroppedTooSmallTimestamp uint64
|
|
|
|
// PartitionsCount is the number of partitions in the storage
|
|
PartitionsCount uint64
|
|
|
|
PartitionStats
|
|
|
|
// IsReadOnly indicates whether the storage is read-only.
|
|
IsReadOnly bool
|
|
}
|
|
|
|
// Reset resets s.
|
|
func (s *StorageStats) Reset() {
|
|
*s = StorageStats{}
|
|
}
|
|
|
|
// StorageConfig is the config for the Storage.
|
|
type StorageConfig struct {
|
|
// Retention is the retention for the ingested data.
|
|
//
|
|
// Older data is automatically deleted.
|
|
Retention time.Duration
|
|
|
|
// FlushInterval is the interval for flushing the in-memory data to disk at the Storage
|
|
FlushInterval time.Duration
|
|
|
|
// FutureRetention is the allowed retention from the current time to future for the ingested data.
|
|
//
|
|
// Log entries with timestamps bigger than now+FutureRetention are ignored.
|
|
FutureRetention time.Duration
|
|
|
|
// LogNewStreams indicates whether to log newly created log streams.
|
|
//
|
|
// This can be useful for debugging of high cardinality issues.
|
|
// https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality
|
|
LogNewStreams bool
|
|
|
|
// LogIngestedRows indicates whether to log the ingested log entries.
|
|
//
|
|
// This can be useful for debugging of data ingestion.
|
|
LogIngestedRows bool
|
|
|
|
// MinFreeDiskSpaceBytes is the minimum free disk space at -storageDataPath after which the storage stops accepting new data
|
|
MinFreeDiskSpaceBytes int64
|
|
}
|
|
|
|
// Storage is the storage for log entries.
|
|
type Storage struct {
|
|
rowsDroppedTooBigTimestamp uint64
|
|
rowsDroppedTooSmallTimestamp uint64
|
|
|
|
// path is the path to the Storage directory
|
|
path string
|
|
|
|
// retention is the retention for the stored data
|
|
//
|
|
// older data is automatically deleted
|
|
retention time.Duration
|
|
|
|
// flushInterval is the interval for flushing in-memory data to disk
|
|
flushInterval time.Duration
|
|
|
|
// futureRetention is the maximum allowed interval to write data into the future
|
|
futureRetention time.Duration
|
|
|
|
// logNewStreams instructs to log new streams if it is set to true
|
|
logNewStreams bool
|
|
|
|
// logIngestedRows instructs to log all the ingested log entries if it is set to true
|
|
logIngestedRows bool
|
|
|
|
// flockF is a file, which makes sure that the Storage is opened by a single process
|
|
flockF *os.File
|
|
|
|
// partitions is a list of partitions for the Storage.
|
|
//
|
|
// It must be accessed under partitionsLock.
|
|
partitions []*partitionWrapper
|
|
|
|
// ptwHot is the "hot" partition, were the last rows were ingested.
|
|
//
|
|
// It must be accessed under partitionsLock.
|
|
ptwHot *partitionWrapper
|
|
|
|
// partitionsLock protects partitions and ptwHot.
|
|
partitionsLock sync.Mutex
|
|
|
|
// stopCh is closed when the Storage must be stopped.
|
|
stopCh chan struct{}
|
|
|
|
// wg is used for waiting for background workers at MustClose().
|
|
wg sync.WaitGroup
|
|
|
|
// streamIDCache caches (partition, streamIDs) seen during data ingestion.
|
|
//
|
|
// It reduces the load on persistent storage during data ingestion by skipping
|
|
// the check whether the given stream is already registered in the persistent storage.
|
|
streamIDCache *workingsetcache.Cache
|
|
|
|
// streamTagsCache caches StreamTags entries keyed by streamID.
|
|
//
|
|
// There is no need to put partition into the key for StreamTags,
|
|
// since StreamTags are uniquely identified by streamID.
|
|
//
|
|
// It reduces the load on persistent storage during querying
|
|
// when StreamTags must be found for the particular streamID
|
|
streamTagsCache *workingsetcache.Cache
|
|
|
|
// streamFilterCache caches streamIDs keyed by (partition, []TenanID, StreamFilter).
|
|
//
|
|
// It reduces the load on persistent storage during querying by _stream:{...} filter.
|
|
streamFilterCache *workingsetcache.Cache
|
|
|
|
isReadOnly uint32
|
|
|
|
freeDiskSpaceWatcherWG sync.WaitGroup
|
|
}
|
|
|
|
type partitionWrapper struct {
|
|
// refCount is the number of active references to p.
|
|
// When it reaches zero, then the p is closed.
|
|
refCount int32
|
|
|
|
// The flag, which is set when the partition must be deleted after refCount reaches zero.
|
|
mustBeDeleted uint32
|
|
|
|
// day is the day for the partition in the unix timestamp divided by the number of seconds in the day.
|
|
day int64
|
|
|
|
// pt is the wrapped partition.
|
|
pt *partition
|
|
}
|
|
|
|
func newPartitionWrapper(pt *partition, day int64) *partitionWrapper {
|
|
pw := &partitionWrapper{
|
|
day: day,
|
|
pt: pt,
|
|
}
|
|
pw.incRef()
|
|
return pw
|
|
}
|
|
|
|
func (ptw *partitionWrapper) incRef() {
|
|
atomic.AddInt32(&ptw.refCount, 1)
|
|
}
|
|
|
|
func (ptw *partitionWrapper) decRef() {
|
|
n := atomic.AddInt32(&ptw.refCount, -1)
|
|
if n > 0 {
|
|
return
|
|
}
|
|
|
|
deletePath := ""
|
|
if atomic.LoadUint32(&ptw.mustBeDeleted) != 0 {
|
|
deletePath = ptw.pt.path
|
|
}
|
|
|
|
// Close pw.pt, since nobody refers to it.
|
|
mustClosePartition(ptw.pt)
|
|
ptw.pt = nil
|
|
|
|
// Delete partition if needed.
|
|
if deletePath != "" {
|
|
mustDeletePartition(deletePath)
|
|
}
|
|
}
|
|
|
|
func (ptw *partitionWrapper) canAddAllRows(lr *LogRows) bool {
|
|
minTimestamp := ptw.day * nsecPerDay
|
|
maxTimestamp := minTimestamp + nsecPerDay - 1
|
|
for _, ts := range lr.timestamps {
|
|
if ts < minTimestamp || ts > maxTimestamp {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// mustCreateStorage creates Storage at the given path.
|
|
func mustCreateStorage(path string) {
|
|
fs.MustMkdirFailIfExist(path)
|
|
|
|
partitionsPath := filepath.Join(path, partitionsDirname)
|
|
fs.MustMkdirFailIfExist(partitionsPath)
|
|
}
|
|
|
|
// MustOpenStorage opens Storage at the given path.
|
|
//
|
|
// MustClose must be called on the returned Storage when it is no longer needed.
|
|
func MustOpenStorage(path string, cfg *StorageConfig) *Storage {
|
|
flushInterval := cfg.FlushInterval
|
|
if flushInterval < time.Second {
|
|
flushInterval = time.Second
|
|
}
|
|
|
|
retention := cfg.Retention
|
|
if retention < 24*time.Hour {
|
|
retention = 24 * time.Hour
|
|
}
|
|
|
|
futureRetention := cfg.FutureRetention
|
|
if futureRetention < 24*time.Hour {
|
|
futureRetention = 24 * time.Hour
|
|
}
|
|
|
|
if !fs.IsPathExist(path) {
|
|
mustCreateStorage(path)
|
|
}
|
|
|
|
flockF := fs.MustCreateFlockFile(path)
|
|
|
|
// Load caches
|
|
mem := memory.Allowed()
|
|
streamIDCachePath := filepath.Join(path, cacheDirname, streamIDCacheFilename)
|
|
streamIDCache := workingsetcache.Load(streamIDCachePath, mem/16)
|
|
|
|
streamTagsCache := workingsetcache.New(mem / 10)
|
|
|
|
streamFilterCache := workingsetcache.New(mem / 10)
|
|
|
|
s := &Storage{
|
|
path: path,
|
|
retention: retention,
|
|
flushInterval: flushInterval,
|
|
futureRetention: futureRetention,
|
|
logNewStreams: cfg.LogNewStreams,
|
|
logIngestedRows: cfg.LogIngestedRows,
|
|
flockF: flockF,
|
|
stopCh: make(chan struct{}),
|
|
|
|
streamIDCache: streamIDCache,
|
|
streamTagsCache: streamTagsCache,
|
|
streamFilterCache: streamFilterCache,
|
|
}
|
|
|
|
partitionsPath := filepath.Join(path, partitionsDirname)
|
|
fs.MustMkdirIfNotExist(partitionsPath)
|
|
des := fs.MustReadDir(partitionsPath)
|
|
ptws := make([]*partitionWrapper, len(des))
|
|
for i, de := range des {
|
|
fname := de.Name()
|
|
|
|
// Parse the day for the partition
|
|
t, err := time.Parse(partitionNameFormat, fname)
|
|
if err != nil {
|
|
logger.Panicf("FATAL: cannot parse partition filename %q at %q; it must be in the form YYYYMMDD: %s", fname, partitionsPath, err)
|
|
}
|
|
day := t.UTC().UnixNano() / nsecPerDay
|
|
|
|
partitionPath := filepath.Join(partitionsPath, fname)
|
|
pt := mustOpenPartition(s, partitionPath)
|
|
ptws[i] = newPartitionWrapper(pt, day)
|
|
}
|
|
sort.Slice(ptws, func(i, j int) bool {
|
|
return ptws[i].day < ptws[j].day
|
|
})
|
|
|
|
// Delete partitions from the future if needed
|
|
maxAllowedDay := s.getMaxAllowedDay()
|
|
j := len(ptws) - 1
|
|
for j >= 0 {
|
|
ptw := ptws[j]
|
|
if ptw.day <= maxAllowedDay {
|
|
break
|
|
}
|
|
logger.Infof("the partition %s is scheduled to be deleted because it is outside the -futureRetention=%dd", ptw.pt.path, durationToDays(s.futureRetention))
|
|
atomic.StoreUint32(&ptw.mustBeDeleted, 1)
|
|
ptw.decRef()
|
|
j--
|
|
}
|
|
j++
|
|
for i := j; i < len(ptws); i++ {
|
|
ptws[i] = nil
|
|
}
|
|
ptws = ptws[:j]
|
|
|
|
s.partitions = ptws
|
|
s.runRetentionWatcher()
|
|
s.startFreeDiskSpaceWatcher(uint64(cfg.MinFreeDiskSpaceBytes))
|
|
return s
|
|
}
|
|
|
|
const partitionNameFormat = "20060102"
|
|
|
|
func (s *Storage) runRetentionWatcher() {
|
|
s.wg.Add(1)
|
|
go func() {
|
|
s.watchRetention()
|
|
s.wg.Done()
|
|
}()
|
|
}
|
|
|
|
func (s *Storage) watchRetention() {
|
|
ticker := time.NewTicker(time.Hour)
|
|
defer ticker.Stop()
|
|
for {
|
|
var ptwsToDelete []*partitionWrapper
|
|
minAllowedDay := s.getMinAllowedDay()
|
|
|
|
s.partitionsLock.Lock()
|
|
|
|
// Delete outdated partitions.
|
|
// s.partitions are sorted by day, so the partitions, which can become outdated, are located at the beginning of the list
|
|
for _, ptw := range s.partitions {
|
|
if ptw.day >= minAllowedDay {
|
|
break
|
|
}
|
|
ptwsToDelete = append(ptwsToDelete, ptw)
|
|
if ptw == s.ptwHot {
|
|
s.ptwHot = nil
|
|
}
|
|
}
|
|
for i := range ptwsToDelete {
|
|
s.partitions[i] = nil
|
|
}
|
|
s.partitions = s.partitions[len(ptwsToDelete):]
|
|
|
|
s.partitionsLock.Unlock()
|
|
|
|
for _, ptw := range ptwsToDelete {
|
|
logger.Infof("the partition %s is scheduled to be deleted because it is outside the -retentionPeriod=%dd", ptw.pt.path, durationToDays(s.retention))
|
|
atomic.StoreUint32(&ptw.mustBeDeleted, 1)
|
|
ptw.decRef()
|
|
}
|
|
|
|
select {
|
|
case <-s.stopCh:
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *Storage) getMinAllowedDay() int64 {
|
|
return time.Now().UTC().Add(-s.retention).UnixNano() / nsecPerDay
|
|
}
|
|
|
|
func (s *Storage) getMaxAllowedDay() int64 {
|
|
return time.Now().UTC().Add(s.futureRetention).UnixNano() / nsecPerDay
|
|
}
|
|
|
|
// MustClose closes s.
|
|
//
|
|
// It is expected that nobody uses the storage at the close time.
|
|
func (s *Storage) MustClose() {
|
|
// Stop background workers
|
|
close(s.stopCh)
|
|
s.wg.Wait()
|
|
s.freeDiskSpaceWatcherWG.Wait()
|
|
|
|
// Close partitions
|
|
for _, pw := range s.partitions {
|
|
pw.decRef()
|
|
if pw.refCount != 0 {
|
|
logger.Panicf("BUG: there are %d users of partition", pw.refCount)
|
|
}
|
|
}
|
|
s.partitions = nil
|
|
s.ptwHot = nil
|
|
|
|
// Save caches
|
|
streamIDCachePath := filepath.Join(s.path, cacheDirname, streamIDCacheFilename)
|
|
if err := s.streamIDCache.Save(streamIDCachePath); err != nil {
|
|
logger.Panicf("FATAL: cannot save streamID cache to %q: %s", streamIDCachePath, err)
|
|
}
|
|
s.streamIDCache.Stop()
|
|
s.streamIDCache = nil
|
|
|
|
s.streamTagsCache.Stop()
|
|
s.streamTagsCache = nil
|
|
|
|
s.streamFilterCache.Stop()
|
|
s.streamFilterCache = nil
|
|
|
|
// release lock file
|
|
fs.MustClose(s.flockF)
|
|
s.flockF = nil
|
|
|
|
s.path = ""
|
|
}
|
|
|
|
// AddRows adds lr to s.
|
|
func (s *Storage) AddRows(lr *LogRows) error {
|
|
if s.IsReadOnly() {
|
|
return errReadOnly
|
|
}
|
|
|
|
// Fast path - try adding all the rows to the hot partition
|
|
s.partitionsLock.Lock()
|
|
ptwHot := s.ptwHot
|
|
if ptwHot != nil {
|
|
ptwHot.incRef()
|
|
}
|
|
s.partitionsLock.Unlock()
|
|
|
|
if ptwHot != nil {
|
|
if ptwHot.canAddAllRows(lr) {
|
|
ptwHot.pt.mustAddRows(lr)
|
|
ptwHot.decRef()
|
|
return nil
|
|
}
|
|
ptwHot.decRef()
|
|
}
|
|
|
|
// Slow path - rows cannot be added to the hot partition, so split rows among available partitions
|
|
minAllowedDay := s.getMinAllowedDay()
|
|
maxAllowedDay := s.getMaxAllowedDay()
|
|
m := make(map[int64]*LogRows)
|
|
for i, ts := range lr.timestamps {
|
|
day := ts / nsecPerDay
|
|
if day < minAllowedDay {
|
|
rf := RowFormatter(lr.rows[i])
|
|
tsf := TimeFormatter(ts)
|
|
minAllowedTsf := TimeFormatter(minAllowedDay * nsecPerDay)
|
|
tooSmallTimestampLogger.Warnf("skipping log entry with too small timestamp=%s; it must be bigger than %s according "+
|
|
"to the configured -retentionPeriod=%dd. See https://docs.victoriametrics.com/VictoriaLogs/#retention ; "+
|
|
"log entry: %s", &tsf, &minAllowedTsf, durationToDays(s.retention), &rf)
|
|
atomic.AddUint64(&s.rowsDroppedTooSmallTimestamp, 1)
|
|
continue
|
|
}
|
|
if day > maxAllowedDay {
|
|
rf := RowFormatter(lr.rows[i])
|
|
tsf := TimeFormatter(ts)
|
|
maxAllowedTsf := TimeFormatter(maxAllowedDay * nsecPerDay)
|
|
tooBigTimestampLogger.Warnf("skipping log entry with too big timestamp=%s; it must be smaller than %s according "+
|
|
"to the configured -futureRetention=%dd; see https://docs.victoriametrics.com/VictoriaLogs/#retention ; "+
|
|
"log entry: %s", &tsf, &maxAllowedTsf, durationToDays(s.futureRetention), &rf)
|
|
atomic.AddUint64(&s.rowsDroppedTooBigTimestamp, 1)
|
|
continue
|
|
}
|
|
lrPart := m[day]
|
|
if lrPart == nil {
|
|
lrPart = GetLogRows(nil, nil)
|
|
m[day] = lrPart
|
|
}
|
|
lrPart.mustAddInternal(lr.streamIDs[i], ts, lr.rows[i], lr.streamTagsCanonicals[i])
|
|
}
|
|
for day, lrPart := range m {
|
|
ptw := s.getPartitionForDay(day)
|
|
ptw.pt.mustAddRows(lrPart)
|
|
ptw.decRef()
|
|
PutLogRows(lrPart)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
var tooSmallTimestampLogger = logger.WithThrottler("too_small_timestamp", 5*time.Second)
|
|
var tooBigTimestampLogger = logger.WithThrottler("too_big_timestamp", 5*time.Second)
|
|
|
|
const nsecPerDay = 24 * 3600 * 1e9
|
|
|
|
// TimeFormatter implements fmt.Stringer for timestamp in nanoseconds
|
|
type TimeFormatter int64
|
|
|
|
// String returns human-readable representation for tf.
|
|
func (tf *TimeFormatter) String() string {
|
|
ts := int64(*tf)
|
|
t := time.Unix(0, ts).UTC()
|
|
return t.Format(time.RFC3339Nano)
|
|
}
|
|
|
|
func (s *Storage) getPartitionForDay(day int64) *partitionWrapper {
|
|
s.partitionsLock.Lock()
|
|
|
|
// Search for the partition using binary search
|
|
ptws := s.partitions
|
|
n := sort.Search(len(ptws), func(i int) bool {
|
|
return ptws[i].day >= day
|
|
})
|
|
var ptw *partitionWrapper
|
|
if n < len(ptws) {
|
|
ptw = ptws[n]
|
|
if ptw.day != day {
|
|
ptw = nil
|
|
}
|
|
}
|
|
if ptw == nil {
|
|
// Missing partition for the given day. Create it.
|
|
fname := time.Unix(0, day*nsecPerDay).UTC().Format(partitionNameFormat)
|
|
partitionPath := filepath.Join(s.path, partitionsDirname, fname)
|
|
mustCreatePartition(partitionPath)
|
|
|
|
pt := mustOpenPartition(s, partitionPath)
|
|
ptw = newPartitionWrapper(pt, day)
|
|
if n == len(ptws) {
|
|
ptws = append(ptws, ptw)
|
|
} else {
|
|
ptws = append(ptws[:n+1], ptws[n:]...)
|
|
ptws[n] = ptw
|
|
}
|
|
s.partitions = ptws
|
|
}
|
|
|
|
s.ptwHot = ptw
|
|
ptw.incRef()
|
|
|
|
s.partitionsLock.Unlock()
|
|
|
|
return ptw
|
|
}
|
|
|
|
// UpdateStats updates ss for the given s.
|
|
func (s *Storage) UpdateStats(ss *StorageStats) {
|
|
ss.RowsDroppedTooBigTimestamp += atomic.LoadUint64(&s.rowsDroppedTooBigTimestamp)
|
|
ss.RowsDroppedTooSmallTimestamp += atomic.LoadUint64(&s.rowsDroppedTooSmallTimestamp)
|
|
|
|
s.partitionsLock.Lock()
|
|
ss.PartitionsCount += uint64(len(s.partitions))
|
|
for _, ptw := range s.partitions {
|
|
ptw.pt.updateStats(&ss.PartitionStats)
|
|
}
|
|
s.partitionsLock.Unlock()
|
|
ss.IsReadOnly = s.IsReadOnly()
|
|
}
|
|
|
|
// IsReadOnly returns information is storage in read only mode
|
|
func (s *Storage) IsReadOnly() bool {
|
|
return atomic.LoadUint32(&s.isReadOnly) == 1
|
|
}
|
|
|
|
func (s *Storage) startFreeDiskSpaceWatcher(freeDiskSpaceLimitBytes uint64) {
|
|
f := func() {
|
|
freeSpaceBytes := fs.MustGetFreeSpace(s.path)
|
|
if freeSpaceBytes < freeDiskSpaceLimitBytes {
|
|
// Switch the storage to readonly mode if there is no enough free space left at s.path
|
|
logger.Warnf("switching the storage at %s to read-only mode, since it has less than -storage.minFreeDiskSpaceBytes=%d of free space: %d bytes left",
|
|
s.path, freeDiskSpaceLimitBytes, freeSpaceBytes)
|
|
atomic.StoreUint32(&s.isReadOnly, 1)
|
|
return
|
|
}
|
|
if atomic.CompareAndSwapUint32(&s.isReadOnly, 1, 0) {
|
|
logger.Warnf("enabling writing to the storage at %s, since it has more than -storage.minFreeDiskSpaceBytes=%d of free space: %d bytes left",
|
|
s.path, freeDiskSpaceLimitBytes, freeSpaceBytes)
|
|
}
|
|
}
|
|
f()
|
|
s.freeDiskSpaceWatcherWG.Add(1)
|
|
go func() {
|
|
defer s.freeDiskSpaceWatcherWG.Done()
|
|
ticker := time.NewTicker(time.Second)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-s.stopCh:
|
|
return
|
|
case <-ticker.C:
|
|
f()
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (s *Storage) debugFlush() {
|
|
s.partitionsLock.Lock()
|
|
ptws := append([]*partitionWrapper{}, s.partitions...)
|
|
for _, ptw := range ptws {
|
|
ptw.incRef()
|
|
}
|
|
s.partitionsLock.Unlock()
|
|
|
|
for _, ptw := range ptws {
|
|
ptw.pt.debugFlush()
|
|
ptw.decRef()
|
|
}
|
|
}
|
|
|
|
func durationToDays(d time.Duration) int64 {
|
|
return int64(d / (time.Hour * 24))
|
|
}
|