package logstorage

import (
	"context"
	"fmt"
	"math"
	"slices"
	"sort"
	"strings"
	"sync"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)

// genericSearchOptions contains options used for search.
type genericSearchOptions struct {
	// tenantIDs must contain the list of tenantIDs for the search.
	tenantIDs []TenantID

	// streamIDs is an optional sorted list of streamIDs for the search.
	// If it is empty, then the search is performed by tenantIDs
	streamIDs []streamID

	// minTimestamp is the minimum timestamp for the search
	minTimestamp int64

	// maxTimestamp is the maximum timestamp for the search
	maxTimestamp int64

	// filter is the filter to use for the search
	filter filter

	// neededColumnNames contains names of columns to return in the result
	neededColumnNames []string

	// unneededColumnNames contains names of columns that mustn't be returned in the result.
	//
	// This list is consulted if needAllColumns=true
	unneededColumnNames []string

	// needAllColumns is set to true when all the columns except unneededColumnNames must be returned in the result
	needAllColumns bool
}

type searchOptions struct {
	// Optional sorted list of tenantIDs for the search.
	// If it is empty, then the search is performed by streamIDs
	tenantIDs []TenantID

	// Optional sorted list of streamIDs for the search.
	// If it is empty, then the search is performed by tenantIDs
	streamIDs []streamID

	// minTimestamp is the minimum timestamp for the search
	minTimestamp int64

	// maxTimestamp is the maximum timestamp for the search
	maxTimestamp int64

	// filter is the filter to use for the search
	filter filter

	// neededColumnNames contains names of columns to return in the result
	neededColumnNames []string

	// unneededColumnNames contains names of columns that mustn't be returned in the result.
	//
	// This list is consulted when needAllColumns=true.
	unneededColumnNames []string

	// needAllColumns is set to true when all the columns except unneededColumnNames must be returned in the result
	needAllColumns bool
}

// WriteBlockFunc must write a block with the given timestamps and columns.
//
// WriteBlockFunc cannot hold references to timestamps and columns after returning.
type WriteBlockFunc func(workerID uint, timestamps []int64, columns []BlockColumn)

// RunQuery runs the given q and calls writeBlock for results.
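//
// A minimal usage sketch (how q and tenantIDs are obtained is outside the scope of this file):
//
//	err := s.RunQuery(ctx, tenantIDs, q, func(workerID uint, timestamps []int64, columns []BlockColumn) {
//		// Copy any needed data here, since timestamps and columns cannot be retained after the callback returns.
//	})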
func (s *Storage) RunQuery(ctx context.Context, tenantIDs []TenantID, q *Query, writeBlock WriteBlockFunc) error {
	writeBlockResult := func(workerID uint, br *blockResult) {
		if br.rowsLen == 0 {
			return
		}

		brs := getBlockRows()
		csDst := brs.cs

		cs := br.getColumns()
		for _, c := range cs {
			values := c.getValues(br)
			csDst = append(csDst, BlockColumn{
				Name:   c.name,
				Values: values,
			})
		}
		timestamps := br.getTimestamps()
		writeBlock(workerID, timestamps, csDst)

		brs.cs = csDst
		putBlockRows(brs)
	}

	return s.runQuery(ctx, tenantIDs, q, writeBlockResult)
}

func (s *Storage) runQuery(ctx context.Context, tenantIDs []TenantID, q *Query, writeBlockResultFunc func(workerID uint, br *blockResult)) error {
	qNew, err := s.initFilterInValues(ctx, tenantIDs, q)
	if err != nil {
		return err
	}
	qNew, err = s.initJoinMaps(ctx, tenantIDs, qNew)
	if err != nil {
		return err
	}
	q = qNew

	streamIDs := q.getStreamIDs()
	sort.Slice(streamIDs, func(i, j int) bool {
		return streamIDs[i].less(&streamIDs[j])
	})

	minTimestamp, maxTimestamp := q.GetFilterTimeRange()

	neededColumnNames, unneededColumnNames := q.getNeededColumns()
	so := &genericSearchOptions{
		tenantIDs:           tenantIDs,
		streamIDs:           streamIDs,
		minTimestamp:        minTimestamp,
		maxTimestamp:        maxTimestamp,
		filter:              q.f,
		neededColumnNames:   neededColumnNames,
		unneededColumnNames: unneededColumnNames,
		needAllColumns:      slices.Contains(neededColumnNames, "*"),
	}

	workersCount := cgroup.AvailableCPUs()

	ppMain := newDefaultPipeProcessor(writeBlockResultFunc)
	pp := ppMain
	stopCh := ctx.Done()
	cancels := make([]func(), len(q.pipes))
	pps := make([]pipeProcessor, len(q.pipes))

	var errPipe error
	for i := len(q.pipes) - 1; i >= 0; i-- {
		p := q.pipes[i]
		ctxChild, cancel := context.WithCancel(ctx)
		pp = p.newPipeProcessor(workersCount, stopCh, cancel, pp)
		pcp, ok := pp.(*pipeStreamContextProcessor)
		if ok {
			pcp.init(s, neededColumnNames, unneededColumnNames)
			if i > 0 {
				errPipe = fmt.Errorf("[%s] pipe must go after [%s] filter; now it goes after the [%s] pipe", p, q.f, q.pipes[i-1])
			}
		}
		stopCh = ctxChild.Done()
		ctx = ctxChild

		cancels[i] = cancel
		pps[i] = pp
	}

	if errPipe == nil {
		s.search(workersCount, so, stopCh, pp.writeBlock)
	}

	var errFlush error
	for i, pp := range pps {
		if err := pp.flush(); err != nil && errFlush == nil {
			errFlush = err
		}
		cancel := cancels[i]
		cancel()
	}
	if err := ppMain.flush(); err != nil && errFlush == nil {
		errFlush = err
	}
	if errPipe != nil {
		return errPipe
	}
	return errFlush
}

// GetFieldNames returns field names from q results for the given tenantIDs.
func (s *Storage) GetFieldNames(ctx context.Context, tenantIDs []TenantID, q *Query) ([]ValueWithHits, error) {
	pipes := append([]pipe{}, q.pipes...)
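	// Append a 'field_names' pipe to the original query pipeline below, so the query itself computes per-field hit counts.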
pipeStr := "field_names" lex := newLexer(pipeStr) pf, err := parsePipeFieldNames(lex) if err != nil { logger.Panicf("BUG: unexpected error when parsing 'field_names' pipe at [%s]: %s", pipeStr, err) } pf.isFirstPipe = len(pipes) == 0 if !lex.isEnd() { logger.Panicf("BUG: unexpected tail left after parsing pipes [%s]: %q", pipeStr, lex.s) } pipes = append(pipes, pf) q = &Query{ f: q.f, pipes: pipes, } return s.runValuesWithHitsQuery(ctx, tenantIDs, q) } func (s *Storage) getJoinMap(ctx context.Context, tenantIDs []TenantID, q *Query, byFields []string, prefix string) (map[string][][]Field, error) { // TODO: track memory usage m := make(map[string][][]Field) var mLock sync.Mutex writeBlockResult := func(_ uint, br *blockResult) { if br.rowsLen == 0 { return } cs := br.getColumns() columnNames := make([]string, len(cs)) byValuesIdxs := make([]int, len(cs)) for i := range cs { name := strings.Clone(cs[i].name) idx := slices.Index(byFields, name) if prefix != "" && idx < 0 { name = prefix + name } columnNames[i] = name byValuesIdxs[i] = idx } byValues := make([]string, len(byFields)) var tmpBuf []byte for rowIdx := 0; rowIdx < br.rowsLen; rowIdx++ { fields := make([]Field, 0, len(cs)) clear(byValues) for j := range cs { name := columnNames[j] v := cs[j].getValueAtRow(br, rowIdx) if cIdx := byValuesIdxs[j]; cIdx >= 0 { byValues[cIdx] = v continue } if v == "" { continue } value := strings.Clone(v) fields = append(fields, Field{ Name: name, Value: value, }) } tmpBuf = marshalStrings(tmpBuf[:0], byValues) k := string(tmpBuf) mLock.Lock() m[k] = append(m[k], fields) mLock.Unlock() } } if err := s.runQuery(ctx, tenantIDs, q, writeBlockResult); err != nil { return nil, err } return m, nil } func marshalStrings(dst []byte, a []string) []byte { for _, v := range a { dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(v)) } return dst } func (s *Storage) getFieldValuesNoHits(ctx context.Context, tenantIDs []TenantID, q *Query, fieldName string) ([]string, error) { // TODO: track memory usage pipes := append([]pipe{}, q.pipes...) quotedFieldName := quoteTokenIfNeeded(fieldName) pipeStr := fmt.Sprintf("uniq by (%s)", quotedFieldName) lex := newLexer(pipeStr) pu, err := parsePipeUniq(lex) if err != nil { logger.Panicf("BUG: unexpected error when parsing 'uniq' pipe at [%s]: %s", pipeStr, err) } if !lex.isEnd() { logger.Panicf("BUG: unexpected tail left after parsing pipes [%s]: %q", pipeStr, lex.s) } pipes = append(pipes, pu) q = &Query{ f: q.f, pipes: pipes, } var values []string var valuesLock sync.Mutex writeBlockResult := func(_ uint, br *blockResult) { if br.rowsLen == 0 { return } cs := br.getColumns() if len(cs) != 1 { logger.Panicf("BUG: expecting one column; got %d columns", len(cs)) } columnValues := cs[0].getValues(br) columnValuesCopy := make([]string, len(columnValues)) for i := range columnValues { columnValuesCopy[i] = strings.Clone(columnValues[i]) } valuesLock.Lock() values = append(values, columnValuesCopy...) valuesLock.Unlock() } if err := s.runQuery(ctx, tenantIDs, q, writeBlockResult); err != nil { return nil, err } return values, nil } // GetFieldValues returns unique values with the number of hits for the given fieldName returned by q for the given tenantIDs. // // If limit > 0, then up to limit unique values are returned. func (s *Storage) GetFieldValues(ctx context.Context, tenantIDs []TenantID, q *Query, fieldName string, limit uint64) ([]ValueWithHits, error) { pipes := append([]pipe{}, q.pipes...) 
	quotedFieldName := quoteTokenIfNeeded(fieldName)
	pipeStr := fmt.Sprintf("field_values %s limit %d", quotedFieldName, limit)
	lex := newLexer(pipeStr)

	pu, err := parsePipeFieldValues(lex)
	if err != nil {
		logger.Panicf("BUG: unexpected error when parsing 'field_values' pipe at [%s]: %s", pipeStr, err)
	}

	if !lex.isEnd() {
		logger.Panicf("BUG: unexpected tail left after parsing pipes [%s]: %q", pipeStr, lex.s)
	}

	pipes = append(pipes, pu)

	q = &Query{
		f:     q.f,
		pipes: pipes,
	}

	return s.runValuesWithHitsQuery(ctx, tenantIDs, q)
}

// ValueWithHits contains value and hits.
type ValueWithHits struct {
	Value string
	Hits  uint64
}

func toValuesWithHits(m map[string]*uint64) []ValueWithHits {
	results := make([]ValueWithHits, 0, len(m))
	for k, pHits := range m {
		results = append(results, ValueWithHits{
			Value: k,
			Hits:  *pHits,
		})
	}
	sortValuesWithHits(results)
	return results
}

func sortValuesWithHits(results []ValueWithHits) {
	slices.SortFunc(results, func(a, b ValueWithHits) int {
		if a.Hits == b.Hits {
			if a.Value == b.Value {
				return 0
			}
			if lessString(a.Value, b.Value) {
				return -1
			}
			return 1
		}
		// Sort in descending order of hits
		if a.Hits < b.Hits {
			return 1
		}
		return -1
	})
}

// GetStreamFieldNames returns stream field names from q results for the given tenantIDs.
func (s *Storage) GetStreamFieldNames(ctx context.Context, tenantIDs []TenantID, q *Query) ([]ValueWithHits, error) {
	streams, err := s.GetStreams(ctx, tenantIDs, q, math.MaxUint64)
	if err != nil {
		return nil, err
	}

	m := make(map[string]*uint64)
	forEachStreamField(streams, func(f Field, hits uint64) {
		pHits := m[f.Name]
		if pHits == nil {
			nameCopy := strings.Clone(f.Name)
			hitsLocal := uint64(0)
			pHits = &hitsLocal
			m[nameCopy] = pHits
		}
		*pHits += hits
	})
	names := toValuesWithHits(m)
	return names, nil
}

// GetStreamFieldValues returns stream field values for the given fieldName from q results for the given tenantIDs.
//
// If limit > 0, then up to limit unique values are returned.
func (s *Storage) GetStreamFieldValues(ctx context.Context, tenantIDs []TenantID, q *Query, fieldName string, limit uint64) ([]ValueWithHits, error) {
	streams, err := s.GetStreams(ctx, tenantIDs, q, math.MaxUint64)
	if err != nil {
		return nil, err
	}

	m := make(map[string]*uint64)
	forEachStreamField(streams, func(f Field, hits uint64) {
		if f.Name != fieldName {
			return
		}
		pHits := m[f.Value]
		if pHits == nil {
			valueCopy := strings.Clone(f.Value)
			hitsLocal := uint64(0)
			pHits = &hitsLocal
			m[valueCopy] = pHits
		}
		*pHits += hits
	})

	values := toValuesWithHits(m)
	if limit > 0 && uint64(len(values)) > limit {
		values = values[:limit]
	}
	return values, nil
}

// GetStreams returns streams from q results for the given tenantIDs.
//
// If limit > 0, then up to limit unique streams are returned.
func (s *Storage) GetStreams(ctx context.Context, tenantIDs []TenantID, q *Query, limit uint64) ([]ValueWithHits, error) {
	return s.GetFieldValues(ctx, tenantIDs, q, "_stream", limit)
}

// GetStreamIDs returns _stream_id field values from q results for the given tenantIDs.
//
// If limit > 0, then up to limit unique streams are returned.
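//
// A minimal usage sketch (ParseQuery and the LogsQL query string are illustrative assumptions, not defined in this file):
//
//	q, err := ParseQuery(`error _time:5m`)
//	if err != nil {
//		// handle the malformed query
//	}
//	streamIDs, err := s.GetStreamIDs(ctx, tenantIDs, q, 100)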
func (s *Storage) GetStreamIDs(ctx context.Context, tenantIDs []TenantID, q *Query, limit uint64) ([]ValueWithHits, error) {
	return s.GetFieldValues(ctx, tenantIDs, q, "_stream_id", limit)
}

func (s *Storage) runValuesWithHitsQuery(ctx context.Context, tenantIDs []TenantID, q *Query) ([]ValueWithHits, error) {
	var results []ValueWithHits
	var resultsLock sync.Mutex
	writeBlockResult := func(_ uint, br *blockResult) {
		if br.rowsLen == 0 {
			return
		}

		cs := br.getColumns()
		if len(cs) != 2 {
			logger.Panicf("BUG: expecting two columns; got %d columns", len(cs))
		}

		columnValues := cs[0].getValues(br)
		columnHits := cs[1].getValues(br)

		valuesWithHits := make([]ValueWithHits, len(columnValues))
		for i := range columnValues {
			x := &valuesWithHits[i]
			hits, _ := tryParseUint64(columnHits[i])
			x.Value = strings.Clone(columnValues[i])
			x.Hits = hits
		}

		resultsLock.Lock()
		results = append(results, valuesWithHits...)
		resultsLock.Unlock()
	}

	err := s.runQuery(ctx, tenantIDs, q, writeBlockResult)
	if err != nil {
		return nil, err
	}

	sortValuesWithHits(results)

	return results, nil
}

func (s *Storage) initFilterInValues(ctx context.Context, tenantIDs []TenantID, q *Query) (*Query, error) {
	if !hasFilterInWithQueryForFilter(q.f) && !hasFilterInWithQueryForPipes(q.pipes) {
		return q, nil
	}

	getFieldValues := func(q *Query, fieldName string) ([]string, error) {
		return s.getFieldValuesNoHits(ctx, tenantIDs, q, fieldName)
	}
	cache := make(map[string][]string)
	fNew, err := initFilterInValuesForFilter(cache, q.f, getFieldValues)
	if err != nil {
		return nil, err
	}
	pipesNew, err := initFilterInValuesForPipes(cache, q.pipes, getFieldValues)
	if err != nil {
		return nil, err
	}
	qNew := &Query{
		f:     fNew,
		pipes: pipesNew,
	}
	return qNew, nil
}

type getJoinMapFunc func(q *Query, byFields []string, prefix string) (map[string][][]Field, error)

func (s *Storage) initJoinMaps(ctx context.Context, tenantIDs []TenantID, q *Query) (*Query, error) {
	if !hasJoinPipes(q.pipes) {
		return q, nil
	}

	getJoinMap := func(q *Query, byFields []string, prefix string) (map[string][][]Field, error) {
		return s.getJoinMap(ctx, tenantIDs, q, byFields, prefix)
	}

	pipesNew := make([]pipe, len(q.pipes))
	for i := range q.pipes {
		p := q.pipes[i]
		if pj, ok := p.(*pipeJoin); ok {
			pNew, err := pj.initJoinMap(getJoinMap)
			if err != nil {
				return nil, err
			}
			p = pNew
		}
		pipesNew[i] = p
	}
	qNew := &Query{
		f:     q.f,
		pipes: pipesNew,
	}
	return qNew, nil
}

func hasJoinPipes(pipes []pipe) bool {
	for _, p := range pipes {
		if _, ok := p.(*pipeJoin); ok {
			return true
		}
	}
	return false
}

func (iff *ifFilter) hasFilterInWithQuery() bool {
	if iff == nil {
		return false
	}
	return hasFilterInWithQueryForFilter(iff.f)
}

func hasFilterInWithQueryForFilter(f filter) bool {
	if f == nil {
		return false
	}
	visitFunc := func(f filter) bool {
		switch t := f.(type) {
		case *filterIn:
			return t.needExecuteQuery
		case *filterStreamID:
			return t.needExecuteQuery
		default:
			return false
		}
	}
	return visitFilter(f, visitFunc)
}

func hasFilterInWithQueryForPipes(pipes []pipe) bool {
	for _, p := range pipes {
		if p.hasFilterInWithQuery() {
			return true
		}
	}
	return false
}

type getFieldValuesFunc func(q *Query, fieldName string) ([]string, error)

func (iff *ifFilter) initFilterInValues(cache map[string][]string, getFieldValuesFunc getFieldValuesFunc) (*ifFilter, error) {
	if iff == nil {
		return nil, nil
	}

	f, err := initFilterInValuesForFilter(cache, iff.f, getFieldValuesFunc)
	if err != nil {
		return nil, err
	}

	iffNew := *iff
	iffNew.f = f
	return &iffNew, nil
}

func initFilterInValuesForFilter(cache map[string][]string, f filter,
	getFieldValuesFunc getFieldValuesFunc) (filter, error) {
	if f == nil {
		return nil, nil
	}

	visitFunc := func(f filter) bool {
		switch t := f.(type) {
		case *filterIn:
			return t.needExecuteQuery
		case *filterStreamID:
			return t.needExecuteQuery
		default:
			return false
		}
	}
	copyFunc := func(f filter) (filter, error) {
		switch t := f.(type) {
		case *filterIn:
			values, err := getValuesForQuery(t.q, t.qFieldName, cache, getFieldValuesFunc)
			if err != nil {
				return nil, fmt.Errorf("cannot obtain unique values for %s: %w", t, err)
			}

			fiNew := &filterIn{
				fieldName: t.fieldName,
				q:         t.q,
				values:    values,
			}
			return fiNew, nil
		case *filterStreamID:
			values, err := getValuesForQuery(t.q, t.qFieldName, cache, getFieldValuesFunc)
			if err != nil {
				return nil, fmt.Errorf("cannot obtain unique values for %s: %w", t, err)
			}

			// convert values to streamID list
			streamIDs := make([]streamID, 0, len(values))
			for _, v := range values {
				var sid streamID
				if sid.tryUnmarshalFromString(v) {
					streamIDs = append(streamIDs, sid)
				}
			}

			fsNew := &filterStreamID{
				streamIDs: streamIDs,
				q:         t.q,
			}
			return fsNew, nil
		default:
			return f, nil
		}
	}
	return copyFilter(f, visitFunc, copyFunc)
}

func getValuesForQuery(q *Query, qFieldName string, cache map[string][]string, getFieldValuesFunc getFieldValuesFunc) ([]string, error) {
	qStr := q.String()
	values, ok := cache[qStr]
	if ok {
		return values, nil
	}

	vs, err := getFieldValuesFunc(q, qFieldName)
	if err != nil {
		return nil, err
	}
	cache[qStr] = vs
	return vs, nil
}

func initFilterInValuesForPipes(cache map[string][]string, pipes []pipe, getFieldValuesFunc getFieldValuesFunc) ([]pipe, error) {
	pipesNew := make([]pipe, len(pipes))
	for i, p := range pipes {
		pNew, err := p.initFilterInValues(cache, getFieldValuesFunc)
		if err != nil {
			return nil, err
		}
		pipesNew[i] = pNew
	}
	return pipesNew, nil
}

type blockRows struct {
	cs []BlockColumn
}

func (brs *blockRows) reset() {
	cs := brs.cs
	for i := range cs {
		cs[i].reset()
	}
	brs.cs = cs[:0]
}

func getBlockRows() *blockRows {
	v := blockRowsPool.Get()
	if v == nil {
		return &blockRows{}
	}
	return v.(*blockRows)
}

func putBlockRows(brs *blockRows) {
	brs.reset()
	blockRowsPool.Put(brs)
}

var blockRowsPool sync.Pool

// BlockColumn is a single column of a block of data.
type BlockColumn struct {
	// Name is the column name
	Name string

	// Values contains column values
	Values []string
}

func (c *BlockColumn) reset() {
	c.Name = ""
	c.Values = nil
}

// searchResultFunc must process the given br.
//
// The callback is called at the worker with the given workerID.
type searchResultFunc func(workerID uint, br *blockResult)

// search searches for the matching rows according to so.
//
// It calls processBlockResult for each matching block.
func (s *Storage) search(workersCount int, so *genericSearchOptions, stopCh <-chan struct{}, processBlockResult searchResultFunc) {
	// Spin up workers
	var wgWorkers sync.WaitGroup
	workCh := make(chan *blockSearchWorkBatch, workersCount)
	wgWorkers.Add(workersCount)
	for i := 0; i < workersCount; i++ {
		go func(workerID uint) {
			bs := getBlockSearch()
			bm := getBitmap(0)
			for bswb := range workCh {
				bsws := bswb.bsws
				for i := range bsws {
					bsw := &bsws[i]
					if needStop(stopCh) {
						// The search has been canceled. Just skip all the scheduled work in order to save CPU time.
						bsw.reset()
						continue
					}
					bs.search(bsw, bm)
					if bs.br.rowsLen > 0 {
						processBlockResult(workerID, &bs.br)
					}
					bsw.reset()
				}
				bswb.bsws = bswb.bsws[:0]
				putBlockSearchWorkBatch(bswb)
			}
			putBlockSearch(bs)
			putBitmap(bm)
			wgWorkers.Done()
		}(uint(i))
	}

	// Select partitions according to the selected time range
	s.partitionsLock.Lock()
	ptws := s.partitions

	minDay := so.minTimestamp / nsecsPerDay
	n := sort.Search(len(ptws), func(i int) bool {
		return ptws[i].day >= minDay
	})
	ptws = ptws[n:]

	maxDay := so.maxTimestamp / nsecsPerDay
	n = sort.Search(len(ptws), func(i int) bool {
		return ptws[i].day > maxDay
	})
	ptws = ptws[:n]

	// Copy the selected partitions, so they don't interfere with s.partitions.
	ptws = append([]*partitionWrapper{}, ptws...)
	for _, ptw := range ptws {
		ptw.incRef()
	}
	s.partitionsLock.Unlock()

	// Obtain common filterStream from f
	sf, f := getCommonStreamFilter(so.filter)

	// Schedule concurrent search across matching partitions.
	psfs := make([]partitionSearchFinalizer, len(ptws))
	var wgSearchers sync.WaitGroup
	for i, ptw := range ptws {
		partitionSearchConcurrencyLimitCh <- struct{}{}
		wgSearchers.Add(1)
		go func(idx int, pt *partition) {
			psfs[idx] = pt.search(sf, f, so, workCh, stopCh)
			wgSearchers.Done()
			<-partitionSearchConcurrencyLimitCh
		}(i, ptw.pt)
	}
	wgSearchers.Wait()

	// Wait until workers finish their work
	close(workCh)
	wgWorkers.Wait()

	// Finalize partition search
	for _, psf := range psfs {
		psf()
	}

	// Decrement references to partitions
	for _, ptw := range ptws {
		ptw.decRef()
	}
}

// partitionSearchConcurrencyLimitCh limits the number of concurrent searches across partitions.
//
// This is needed for limiting memory usage under high load.
var partitionSearchConcurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())

type partitionSearchFinalizer func()

func (pt *partition) search(sf *StreamFilter, f filter, so *genericSearchOptions, workCh chan<- *blockSearchWorkBatch, stopCh <-chan struct{}) partitionSearchFinalizer {
	if needStop(stopCh) {
		// Do not spend CPU time on search, since it is already stopped.
		return func() {}
	}

	tenantIDs := so.tenantIDs
	var streamIDs []streamID
	if sf != nil {
		streamIDs = pt.idb.searchStreamIDs(tenantIDs, sf)
		if len(so.streamIDs) > 0 {
			streamIDs = intersectStreamIDs(streamIDs, so.streamIDs)
		}
		tenantIDs = nil
	} else if len(so.streamIDs) > 0 {
		streamIDs = getStreamIDsForTenantIDs(so.streamIDs, tenantIDs)
		tenantIDs = nil
	}

	if hasStreamFilters(f) {
		f = initStreamFilters(tenantIDs, pt.idb, f)
	}
	soInternal := &searchOptions{
		tenantIDs:           tenantIDs,
		streamIDs:           streamIDs,
		minTimestamp:        so.minTimestamp,
		maxTimestamp:        so.maxTimestamp,
		filter:              f,
		neededColumnNames:   so.neededColumnNames,
		unneededColumnNames: so.unneededColumnNames,
		needAllColumns:      so.needAllColumns,
	}
	return pt.ddb.search(soInternal, workCh, stopCh)
}

func intersectStreamIDs(a, b []streamID) []streamID {
	m := make(map[streamID]struct{}, len(b))
	for _, streamID := range b {
		m[streamID] = struct{}{}
	}
	result := make([]streamID, 0, len(a))
	for _, streamID := range a {
		if _, ok := m[streamID]; ok {
			result = append(result, streamID)
		}
	}
	return result
}

func getStreamIDsForTenantIDs(streamIDs []streamID, tenantIDs []TenantID) []streamID {
	m := make(map[TenantID]struct{}, len(tenantIDs))
	for _, tenantID := range tenantIDs {
		m[tenantID] = struct{}{}
	}
	result := make([]streamID, 0, len(streamIDs))
	for _, streamID := range streamIDs {
		if _, ok := m[streamID.tenantID]; ok {
			result = append(result, streamID)
		}
	}
	return result
}

func hasStreamFilters(f filter) bool {
	visitFunc := func(f filter) bool {
		_, ok := f.(*filterStream)
		return ok
	}
	return visitFilter(f, visitFunc)
}

func initStreamFilters(tenantIDs []TenantID, idb *indexdb, f filter) filter {
	visitFunc := func(f filter) bool {
		_, ok := f.(*filterStream)
		return ok
	}
	copyFunc := func(f filter) (filter, error) {
		fs := f.(*filterStream)
		fsNew := &filterStream{
			f:         fs.f,
			tenantIDs: tenantIDs,
			idb:       idb,
		}
		return fsNew, nil
	}
	f, err := copyFilter(f, visitFunc, copyFunc)
	if err != nil {
		logger.Panicf("BUG: unexpected error: %s", err)
	}
	return f
}

func (ddb *datadb) search(so *searchOptions, workCh chan<- *blockSearchWorkBatch, stopCh <-chan struct{}) partitionSearchFinalizer {
	// Select parts with data for the given time range
	ddb.partsLock.Lock()
	pws := appendPartsInTimeRange(nil, ddb.bigParts, so.minTimestamp, so.maxTimestamp)
	pws = appendPartsInTimeRange(pws, ddb.smallParts, so.minTimestamp, so.maxTimestamp)
	pws = appendPartsInTimeRange(pws, ddb.inmemoryParts, so.minTimestamp, so.maxTimestamp)

	// Increase references to the searched parts, so they aren't deleted during search.
	// References to the searched parts must be decremented by calling the returned partitionSearchFinalizer.
	for _, pw := range pws {
		pw.incRef()
	}
	ddb.partsLock.Unlock()

	// Apply search to matching parts
	for _, pw := range pws {
		pw.p.search(so, workCh, stopCh)
	}

	return func() {
		for _, pw := range pws {
			pw.decRef()
		}
	}
}

func (p *part) search(so *searchOptions, workCh chan<- *blockSearchWorkBatch, stopCh <-chan struct{}) {
	bhss := getBlockHeaders()
	if len(so.tenantIDs) > 0 {
		p.searchByTenantIDs(so, bhss, workCh, stopCh)
	} else {
		p.searchByStreamIDs(so, bhss, workCh, stopCh)
	}
	putBlockHeaders(bhss)
}

func getBlockHeaders() *blockHeaders {
	v := blockHeadersPool.Get()
	if v == nil {
		return &blockHeaders{}
	}
	return v.(*blockHeaders)
}

func putBlockHeaders(bhss *blockHeaders) {
	bhss.reset()
	blockHeadersPool.Put(bhss)
}

var blockHeadersPool sync.Pool

type blockHeaders struct {
	bhs []blockHeader
}

func (bhss *blockHeaders) reset() {
	bhs := bhss.bhs
	for i := range bhs {
		bhs[i].reset()
	}
	bhss.bhs = bhs[:0]
}

func (p *part) searchByTenantIDs(so *searchOptions, bhss *blockHeaders, workCh chan<- *blockSearchWorkBatch, stopCh <-chan struct{}) {
	// it is assumed that tenantIDs are sorted
	tenantIDs := so.tenantIDs

	bswb := getBlockSearchWorkBatch()
	scheduleBlockSearch := func(bh *blockHeader) bool {
		if bswb.appendBlockSearchWork(p, so, bh) {
			return true
		}
		select {
		case <-stopCh:
			return false
		case workCh <- bswb:
			bswb = getBlockSearchWorkBatch()
			return true
		}
	}

	// it is assumed that ibhs are sorted
	ibhs := p.indexBlockHeaders
	for len(ibhs) > 0 && len(tenantIDs) > 0 {
		if needStop(stopCh) {
			return
		}

		// locate tenantID equal or bigger than the tenantID in ibhs[0]
		tenantID := &tenantIDs[0]
		if tenantID.less(&ibhs[0].streamID.tenantID) {
			tenantID = &ibhs[0].streamID.tenantID
			n := sort.Search(len(tenantIDs), func(i int) bool {
				return !tenantIDs[i].less(tenantID)
			})
			if n == len(tenantIDs) {
				tenantIDs = nil
				break
			}
			tenantID = &tenantIDs[n]
			tenantIDs = tenantIDs[n:]
		}

		// locate indexBlockHeader with equal or bigger tenantID than the given tenantID
		n := 0
		if ibhs[0].streamID.tenantID.less(tenantID) {
			n = sort.Search(len(ibhs), func(i int) bool {
				return !ibhs[i].streamID.tenantID.less(tenantID)
			})
			// The end of ibhs[n-1] may contain blocks for the given tenantID, so move it backwards
			n--
		}
		ibh := &ibhs[n]
		ibhs = ibhs[n+1:]

		if so.minTimestamp > ibh.maxTimestamp || so.maxTimestamp < ibh.minTimestamp {
			// Skip the ibh, since it doesn't contain entries on the requested time range
			continue
		}

		bhss.bhs = ibh.mustReadBlockHeaders(bhss.bhs[:0], p)

		bhs := bhss.bhs
		for len(bhs) > 0 {
			// search for blocks with the given tenantID
			n = sort.Search(len(bhs), func(i int) bool {
				return !bhs[i].streamID.tenantID.less(tenantID)
			})
			bhs = bhs[n:]
			for len(bhs) > 0 && bhs[0].streamID.tenantID.equal(tenantID) {
				bh := &bhs[0]
				bhs = bhs[1:]
				th := &bh.timestampsHeader
				if so.minTimestamp > th.maxTimestamp || so.maxTimestamp < th.minTimestamp {
					continue
				}
				if !scheduleBlockSearch(bh) {
					return
				}
			}
			if len(bhs) == 0 {
				break
			}

			// search for the next tenantID, which can potentially match tenantID from bhs[0]
			tenantID = &bhs[0].streamID.tenantID
			n = sort.Search(len(tenantIDs), func(i int) bool {
				return !tenantIDs[i].less(tenantID)
			})
			if n == len(tenantIDs) {
				tenantIDs = nil
				break
			}
			tenantID = &tenantIDs[n]
			tenantIDs = tenantIDs[n:]
		}
	}

	// Flush the remaining work
	select {
	case <-stopCh:
	case workCh <- bswb:
	}
}

func (p *part) searchByStreamIDs(so *searchOptions, bhss *blockHeaders, workCh chan<- *blockSearchWorkBatch, stopCh <-chan struct{}) {
	// it is assumed that streamIDs are sorted
	streamIDs := so.streamIDs

	bswb := getBlockSearchWorkBatch()
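	// scheduleBlockSearch adds bh to the current work batch and flushes full batches to workCh, starting a fresh batch afterwards.
	// It returns false if the search has been stopped via stopCh.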
	scheduleBlockSearch := func(bh *blockHeader) bool {
		if bswb.appendBlockSearchWork(p, so, bh) {
			return true
		}
		select {
		case <-stopCh:
			return false
		case workCh <- bswb:
			bswb = getBlockSearchWorkBatch()
			return true
		}
	}

	// it is assumed that ibhs are sorted
	ibhs := p.indexBlockHeaders
	for len(ibhs) > 0 && len(streamIDs) > 0 {
		if needStop(stopCh) {
			return
		}

		// locate streamID equal or bigger than the streamID in ibhs[0]
		streamID := &streamIDs[0]
		if streamID.less(&ibhs[0].streamID) {
			streamID = &ibhs[0].streamID
			n := sort.Search(len(streamIDs), func(i int) bool {
				return !streamIDs[i].less(streamID)
			})
			if n == len(streamIDs) {
				streamIDs = nil
				break
			}
			streamID = &streamIDs[n]
			streamIDs = streamIDs[n:]
		}

		// locate indexBlockHeader with equal or bigger streamID than the given streamID
		n := 0
		if ibhs[0].streamID.less(streamID) {
			n = sort.Search(len(ibhs), func(i int) bool {
				return !ibhs[i].streamID.less(streamID)
			})
			// The end of ibhs[n-1] may contain blocks for the given streamID, so move it backwards.
			n--
		}
		ibh := &ibhs[n]
		ibhs = ibhs[n+1:]

		if so.minTimestamp > ibh.maxTimestamp || so.maxTimestamp < ibh.minTimestamp {
			// Skip the ibh, since it doesn't contain entries on the requested time range
			continue
		}

		bhss.bhs = ibh.mustReadBlockHeaders(bhss.bhs[:0], p)

		bhs := bhss.bhs
		for len(bhs) > 0 {
			// search for blocks with the given streamID
			n = sort.Search(len(bhs), func(i int) bool {
				return !bhs[i].streamID.less(streamID)
			})
			bhs = bhs[n:]
			for len(bhs) > 0 && bhs[0].streamID.equal(streamID) {
				bh := &bhs[0]
				bhs = bhs[1:]
				th := &bh.timestampsHeader
				if so.minTimestamp > th.maxTimestamp || so.maxTimestamp < th.minTimestamp {
					continue
				}
				if !scheduleBlockSearch(bh) {
					return
				}
			}
			if len(bhs) == 0 {
				break
			}

			// search for the next streamID, which can potentially match streamID from bhs[0]
			streamID = &bhs[0].streamID
			n = sort.Search(len(streamIDs), func(i int) bool {
				return !streamIDs[i].less(streamID)
			})
			if n == len(streamIDs) {
				streamIDs = nil
				break
			}
			streamID = &streamIDs[n]
			streamIDs = streamIDs[n:]
		}
	}

	// Flush the remaining work
	select {
	case <-stopCh:
	case workCh <- bswb:
	}
}

func appendPartsInTimeRange(dst, src []*partWrapper, minTimestamp, maxTimestamp int64) []*partWrapper {
	for _, pw := range src {
		if maxTimestamp < pw.p.ph.MinTimestamp || minTimestamp > pw.p.ph.MaxTimestamp {
			continue
		}
		dst = append(dst, pw)
	}
	return dst
}

func getCommonStreamFilter(f filter) (*StreamFilter, filter) {
	switch t := f.(type) {
	case *filterAnd:
		filters := t.filters
		for i, filter := range filters {
			sf, ok := filter.(*filterStream)
			if ok && !sf.f.isEmpty() {
				// Remove sf from filters, since it doesn't filter out anything then.
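				// The extracted sf.f is applied separately via the stream ID lookup in partition.search,
				// so keeping it in the remaining filterAnd would only duplicate work.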
fa := &filterAnd{ filters: append(filters[:i:i], filters[i+1:]...), } return sf.f, fa } } case *filterStream: return t.f, &filterNoop{} } return nil, f } func forEachStreamField(streams []ValueWithHits, f func(f Field, hits uint64)) { var fields []Field for i := range streams { var err error fields, err = parseStreamFields(fields[:0], streams[i].Value) if err != nil { continue } hits := streams[i].Hits for j := range fields { f(fields[j], hits) } } } func parseStreamFields(dst []Field, s string) ([]Field, error) { if len(s) == 0 || s[0] != '{' { return dst, fmt.Errorf("missing '{' at the beginning of stream name") } s = s[1:] if len(s) == 0 || s[len(s)-1] != '}' { return dst, fmt.Errorf("missing '}' at the end of stream name") } s = s[:len(s)-1] if len(s) == 0 { return dst, nil } for { n := strings.Index(s, `="`) if n < 0 { return dst, fmt.Errorf("cannot find field value in double quotes at [%s]", s) } name := s[:n] s = s[n+1:] value, nOffset := tryUnquoteString(s, "") if nOffset < 0 { return dst, fmt.Errorf("cannot find parse field value in double quotes at [%s]", s) } s = s[nOffset:] dst = append(dst, Field{ Name: name, Value: value, }) if len(s) == 0 { return dst, nil } if s[0] != ',' { return dst, fmt.Errorf("missing ',' after %s=%q", name, value) } s = s[1:] } }