lib/storage: optimize matching speed for non-trivial regexp filters

Wrap re.Match into bytesutil.FastStringMatcher.

This increases performance for `{foo=~"complex_regex_here"}` filters
by up to 4x.
This commit is contained in:
Aliaksandr Valialkin 2022-10-01 11:55:47 +03:00
parent 9e8fbef27e
commit db16759c68
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
2 changed files with 13 additions and 4 deletions

View file

@ -25,7 +25,8 @@ at [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMe
See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy-via-labels) for details.
* FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): support specifying tenant ids via `vm_account_id` and `vm_project_id` labels. See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy-via-labels) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2970).
* FEATURE: improve [relabeling](https://docs.victoriametrics.com/vmagent.html#relabeling) performance by up to 3x for non-trivial `regex` values.
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): improve [relabeling](https://docs.victoriametrics.com/vmagent.html#relabeling) performance by up to 3x for non-trivial `regex` values such as `([^:]+):.+`, which can be used for extracting a `host` part from `host:port` label value.
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): improve performance by up to 4x for queries containing non-trivial `regex` filters such as `{path=~"/foo/.+|/bar"}`.
* FEATURE: sanitize metric names for data ingested via [DataDog protocol](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) according to [DataDog metric naming](https://docs.datadoghq.com/metrics/custom_metrics/#naming-custom-metrics). The behaviour can be disabled by passing `-datadog.sanitizeMetricName=false` command-line flag. Thanks to @PerGon for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3105).
* FEATURE: add `-usePromCompatibleNaming` command-line flag to [vmagent](https://docs.victoriametrics.com/vmagent.html), to single-node VictoriaMetrics and to `vminsert` component of VictoriaMetrics cluster. This flag can be used for normalizing the ingested metric names and label names to [Prometheus-compatible form](https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels). If this flag is set, then all the chars unsupported by Prometheus are replaced with `_` chars in metric names and labels of the ingested samples. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3113).
* FEATURE: accept whitespace in metric names and tags ingested via [Graphite plaintext protocol](https://docs.victoriametrics.com/#how-to-send-data-from-graphite-compatible-agents-such-as-statsd) according to [the specs](https://graphite.readthedocs.io/en/latest/tags.html). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3102).

View file

@ -588,13 +588,21 @@ func getOptimizedReMatchFunc(reMatch func(b []byte) bool, expr string) (func(b [
if err != nil {
logger.Panicf("BUG: unexpected error when parsing verified expr=%q: %s", expr, err)
}
if matchFunc, literalSuffix, reCost := getOptimizedReMatchFuncExt(reMatch, sre); matchFunc != nil {
// Prepare fast string matcher for reMatch.
fsm := bytesutil.NewFastStringMatcher(func(s string) bool {
return reMatch(bytesutil.ToUnsafeBytes(s))
})
reMatchFast := func(b []byte) bool {
return fsm.Match(bytesutil.ToUnsafeString(b))
}
if matchFunc, literalSuffix, reCost := getOptimizedReMatchFuncExt(reMatchFast, sre); matchFunc != nil {
// Found optimized function for matching the expr.
suffixUnescaped := tagCharsReverseRegexpEscaper.Replace(literalSuffix)
return matchFunc, suffixUnescaped, reCost
}
// Fall back to un-optimized reMatch.
return reMatch, "", reMatchCost
// Fall back to reMatchFast.
return reMatchFast, "", reMatchCost
}
// These cost values are used for sorting tag filters in ascending order of the required CPU time for execution.