lib/logstorage: optimize performance for queries that select all the log fields of logs containing hundreds of fields (aka "wide events")

Unpack the full columnsHeader block instead of unpacking meta-information for each individual column
when a query that selects all the columns is executed. This improves performance when scanning
logs with a big number of fields.

(cherry picked from commit 2023f017b1)
Aliaksandr Valialkin 2024-10-18 00:21:20 +02:00 committed by hagen1778
parent 5d541322c6
commit 92b9b13df1
3 changed files with 19 additions and 55 deletions
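The change boils down to a single caching pattern: unmarshal the per-block columnsHeader once and serve every per-column lookup from the cached copy, instead of re-resolving metadata column by column. Below is a minimal, self-contained Go sketch of that pattern; the type names mirror the ones in the diff, but the simplified fields and the unmarshalColumnsHeader helper are stand-ins for illustration, not the actual VictoriaLogs implementation.

```go
package main

import "fmt"

// Field is a simplified stand-in for a constant column (name=value pair).
type Field struct {
	Name  string
	Value string
}

// columnsHeader is a simplified stand-in for the per-block header that lists
// const columns and the headers of regular columns.
type columnsHeader struct {
	constColumns  []Field
	columnHeaders []string
}

// blockSearch caches the unpacked header for the current block.
type blockSearch struct {
	raw      []byte         // serialized columnsHeader block
	cshCache *columnsHeader // lazily initialized by getColumnsHeader()
}

// getColumnsHeader unpacks the whole header at most once per block and
// reuses it for every subsequent column lookup.
func (bs *blockSearch) getColumnsHeader() *columnsHeader {
	if bs.cshCache == nil {
		bs.cshCache = unmarshalColumnsHeader(bs.raw)
	}
	return bs.cshCache
}

// unmarshalColumnsHeader is a hypothetical decoder used only for this sketch.
func unmarshalColumnsHeader(b []byte) *columnsHeader {
	_ = b // a real implementation would decode the serialized header here
	return &columnsHeader{
		constColumns:  []Field{{Name: "host", Value: "h1"}},
		columnHeaders: []string{"level", "_msg"},
	}
}

func main() {
	bs := &blockSearch{raw: []byte("...")}

	// Selecting all columns now touches the serialized header exactly once,
	// instead of re-resolving metadata for each of hundreds of columns.
	csh := bs.getColumnsHeader()
	for _, cc := range csh.constColumns {
		fmt.Printf("const column %q = %q\n", cc.Name, cc.Value)
	}
	for _, name := range csh.columnHeaders {
		fmt.Printf("column %q\n", name)
	}
}
```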


@@ -18,6 +18,7 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta
* FEATURE: add basic [alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vlogs.yml) for VictoriaLogs process. See details at [monitoring docs](https://docs.victoriametrics.com/victorialogs/index.html#monitoring).
* FEATURE: improve [`stats` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#stats-pipe) performance on systems with many CPU cores when `by(...)` fields contain big number of unique values. For example, `_time:1d | stats by (user_id) count() x` should be executed much faster when `user_id` field contains millions of unique values.
* FEATURE: improve performance for [`top`](https://docs.victoriametrics.com/victorialogs/logsql/#top-pipe), [`uniq`](https://docs.victoriametrics.com/victorialogs/logsql/#uniq-pipe) and [`field_values`](https://docs.victoriametrics.com/victorialogs/logsql/#field_values-pipe) pipes on systems with many CPU cores when it is applied to [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) with big number of unique values. For example, `_time:1d | top 5 (user_id)` should be executed much faster when `user_id` field contains millions of unique values.
* FEATURE: improve performance for [`field_names` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#field_names-pipe) when it is applied to logs with hundreds of [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model).
## [v0.36.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.36.0-victorialogs)


@@ -316,8 +316,8 @@ func (br *blockResult) initAllColumns() {
 	}
 	// Add other const columns
-	ccs := br.bs.getConstColumns()
-	for _, cc := range ccs {
+	csh := br.bs.getColumnsHeader()
+	for _, cc := range csh.constColumns {
 		if cc.Name == "" {
 			continue
 		}
@@ -327,7 +327,7 @@ func (br *blockResult) initAllColumns() {
 	}
 	// Add other non-const columns
-	chs := br.bs.getColumnHeaders()
+	chs := csh.columnHeaders
 	for i := range chs {
 		ch := &chs[i]
 		if ch.name == "" {


@@ -139,7 +139,7 @@ type blockSearch struct {
 	// cshCache is the columnsHeader associated with the given block.
 	//
-	// It is initialized lazily by calling getColumnsHeaderV0().
+	// It is initialized lazily by calling getColumnsHeader().
 	cshCache *columnsHeader
 	// seenStreams contains seen streamIDs for the recent searches.
@@ -240,7 +240,7 @@ func (bs *blockSearch) getConstColumnValue(name string) string {
 	}
 	if bs.partFormatVersion() < 1 {
-		csh := bs.getColumnsHeaderV0()
+		csh := bs.getColumnsHeader()
 		for _, cc := range csh.constColumns {
 			if cc.Name == name {
 				return cc.Value
@@ -288,7 +288,7 @@ func (bs *blockSearch) getColumnHeader(name string) *columnHeader {
 	}
 	if bs.partFormatVersion() < 1 {
-		csh := bs.getColumnsHeaderV0()
+		csh := bs.getColumnsHeader()
 		chs := csh.columnHeaders
 		for i := range chs {
 			ch := &chs[i]
@@ -337,48 +337,6 @@ func (bs *blockSearch) getColumnNameID(name string) (uint64, bool) {
 	return id, ok
 }
-func (bs *blockSearch) getColumnNameByID(id uint64) (string, bool) {
-	columnNames := bs.bsw.p.columnNames
-	if id >= uint64(len(columnNames)) {
-		return "", false
-	}
-	return columnNames[id], true
-}
-func (bs *blockSearch) getConstColumns() []Field {
-	if bs.partFormatVersion() < 1 {
-		csh := bs.getColumnsHeaderV0()
-		return csh.constColumns
-	}
-	chsIndex := bs.getColumnsHeaderIndex()
-	for _, cr := range chsIndex.constColumnsRefs {
-		columnName, ok := bs.getColumnNameByID(cr.columnNameID)
-		if !ok {
-			logger.Panicf("FATAL: %s: missing column name for id=%d", bs.bsw.p.path, cr.columnNameID)
-		}
-		_ = bs.getConstColumnValue(columnName)
-	}
-	return bs.ccsCache
-}
-func (bs *blockSearch) getColumnHeaders() []columnHeader {
-	if bs.partFormatVersion() < 1 {
-		csh := bs.getColumnsHeaderV0()
-		return csh.columnHeaders
-	}
-	chsIndex := bs.getColumnsHeaderIndex()
-	for _, cr := range chsIndex.columnHeadersRefs {
-		columnName, ok := bs.getColumnNameByID(cr.columnNameID)
-		if !ok {
-			logger.Panicf("FATAL: %s: missing column name for id=%d", bs.bsw.p.path, cr.columnNameID)
-		}
-		_ = bs.getColumnHeader(columnName)
-	}
-	return bs.chsCache
-}
 func (bs *blockSearch) getColumnsHeaderIndex() *columnsHeaderIndex {
 	if bs.partFormatVersion() < 1 {
 		logger.Panicf("BUG: getColumnsHeaderIndex() can be called only for part encoding v1+, while it has been called for v%d", bs.partFormatVersion())
@@ -395,18 +353,23 @@ func (bs *blockSearch) getColumnsHeaderIndex() *columnsHeaderIndex {
 	return bs.cshIndexCache
 }
-func (bs *blockSearch) getColumnsHeaderV0() *columnsHeader {
-	if bs.partFormatVersion() >= 1 {
-		logger.Panicf("BUG: getColumnsHeaderV0() can be called only for part encoding v0, while it has been called for v%d", bs.partFormatVersion())
-	}
+func (bs *blockSearch) getColumnsHeader() *columnsHeader {
 	if bs.cshCache == nil {
 		b := bs.getColumnsHeaderBlock()
-		bs.cshCache = getColumnsHeader()
-		if err := bs.cshCache.unmarshalNoArena(b, 0); err != nil {
+		csh := getColumnsHeader()
+		partFormatVersion := bs.partFormatVersion()
+		if err := csh.unmarshalNoArena(b, partFormatVersion); err != nil {
 			logger.Panicf("FATAL: %s: cannot unmarshal columns header: %s", bs.bsw.p.path, err)
 		}
+		if partFormatVersion >= 1 {
+			cshIndex := bs.getColumnsHeaderIndex()
+			if err := csh.setColumnNames(cshIndex, bs.bsw.p.columnNames); err != nil {
+				logger.Panicf("FATAL: %s: %s", bs.bsw.p.path, err)
+			}
+		}
+		bs.cshCache = csh
 	}
 	return bs.cshCache
 }
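For v1+ parts, column names are stored once per part in bs.bsw.p.columnNames and referenced by ID from the columnsHeaderIndex (see the removed getColumnNameByID, getConstColumns and getColumnHeaders helpers above). The new setColumnNames call presumably resolves those IDs into names on the freshly unpacked header in a single pass. The body of setColumnNames is not part of this diff, so the following is only a rough, hypothetical sketch inferred from its call site and the removed helpers; the real implementation may differ.

```go
package main

import "fmt"

// Simplified stand-ins for the real types; only the fields needed for the sketch.
type Field struct {
	Name  string
	Value string
}

type columnHeader struct {
	name string
}

type columnHeaderRef struct {
	columnNameID uint64
}

type columnsHeaderIndex struct {
	columnHeadersRefs []columnHeaderRef
	constColumnsRefs  []columnHeaderRef
}

type columnsHeader struct {
	columnHeaders []columnHeader
	constColumns  []Field
}

// setColumnNames resolves column name IDs from the header index into names
// taken from the per-part columnNames table. This is a guess at the behavior,
// inferred only from the call site in the diff above.
func (csh *columnsHeader) setColumnNames(cshIndex *columnsHeaderIndex, columnNames []string) error {
	if len(cshIndex.columnHeadersRefs) != len(csh.columnHeaders) ||
		len(cshIndex.constColumnsRefs) != len(csh.constColumns) {
		return fmt.Errorf("mismatching number of column references and columns")
	}
	for i, cr := range cshIndex.columnHeadersRefs {
		if cr.columnNameID >= uint64(len(columnNames)) {
			return fmt.Errorf("missing column name for id=%d", cr.columnNameID)
		}
		csh.columnHeaders[i].name = columnNames[cr.columnNameID]
	}
	for i, cr := range cshIndex.constColumnsRefs {
		if cr.columnNameID >= uint64(len(columnNames)) {
			return fmt.Errorf("missing column name for id=%d", cr.columnNameID)
		}
		csh.constColumns[i].Name = columnNames[cr.columnNameID]
	}
	return nil
}

func main() {
	csh := &columnsHeader{
		columnHeaders: make([]columnHeader, 2),
		constColumns:  make([]Field, 1),
	}
	cshIndex := &columnsHeaderIndex{
		columnHeadersRefs: []columnHeaderRef{{columnNameID: 0}, {columnNameID: 2}},
		constColumnsRefs:  []columnHeaderRef{{columnNameID: 1}},
	}
	columnNames := []string{"level", "host", "_msg"}
	if err := csh.setColumnNames(cshIndex, columnNames); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", csh)
}
```

Resolving all names in one pass over the cached header keeps the per-column lookups out of the hot path, which is what makes selecting hundreds of fields cheaper.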