app/vmauth: properly release memory during config reload (#5805)

* app/vmauth: properly release memory during config reload
previously metrics package hold a refrence for channels for users concurrent requests.
it case of churn at `name`  field of users configuration, new metric was created. But previous one wasn't deleted.
It prevented full parsed configuration from being garbace collected.

now all config related metrics are bound to corresponding metrics.Set and unregistered during config reload process.

It also must fix an issue with incorrect values for current concurrent user requests

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4690

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
This commit is contained in:
Nikolay 2024-02-13 21:49:17 +03:00 committed by Aliaksandr Valialkin
parent 89aceb2b2f
commit 0a2cc0e873
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
2 changed files with 28 additions and 15 deletions

View file

@ -43,6 +43,9 @@ var (
type AuthConfig struct {
Users []UserInfo `yaml:"users,omitempty"`
UnauthorizedUser *UserInfo `yaml:"unauthorized_user,omitempty"`
// ms holds all the metrics for the given AuthConfig
ms *metrics.Set
}
// UserInfo is user information read from authConfigPath
@ -503,6 +506,11 @@ func loadAuthConfig() (bool, error) {
}
logger.Infof("loaded information about %d users from -auth.config=%q", len(m), *authConfigPath)
prevAc := authConfig.Load()
if prevAc != nil {
metrics.UnregisterSet(prevAc.ms)
}
metrics.RegisterSet(ac.ms)
authConfig.Store(ac)
authConfigData.Store(&data)
authUsers.Store(&m)
@ -515,10 +523,13 @@ func parseAuthConfig(data []byte) (*AuthConfig, error) {
if err != nil {
return nil, fmt.Errorf("cannot expand environment vars: %w", err)
}
var ac AuthConfig
if err = yaml.UnmarshalStrict(data, &ac); err != nil {
ac := &AuthConfig{
ms: metrics.NewSet(),
}
if err = yaml.UnmarshalStrict(data, ac); err != nil {
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
}
ui := ac.UnauthorizedUser
if ui != nil {
if ui.Username != "" {
@ -541,15 +552,15 @@ func parseAuthConfig(data []byte) (*AuthConfig, error) {
if err != nil {
return nil, fmt.Errorf("cannot parse metric_labels for unauthorized_user: %w", err)
}
ui.requests = metrics.GetOrCreateCounter(`vmauth_unauthorized_user_requests_total` + metricLabels)
ui.backendErrors = metrics.GetOrCreateCounter(`vmauth_unauthorized_user_request_backend_errors_total` + metricLabels)
ui.requestsDuration = metrics.GetOrCreateSummary(`vmauth_unauthorized_user_request_duration_seconds` + metricLabels)
ui.requests = ac.ms.NewCounter(`vmauth_unauthorized_user_requests_total` + metricLabels)
ui.backendErrors = ac.ms.NewCounter(`vmauth_unauthorized_user_request_backend_errors_total` + metricLabels)
ui.requestsDuration = ac.ms.NewSummary(`vmauth_unauthorized_user_request_duration_seconds` + metricLabels)
ui.concurrencyLimitCh = make(chan struct{}, ui.getMaxConcurrentRequests())
ui.concurrencyLimitReached = metrics.GetOrCreateCounter(`vmauth_unauthorized_user_concurrent_requests_limit_reached_total` + metricLabels)
_ = metrics.GetOrCreateGauge(`vmauth_unauthorized_user_concurrent_requests_capacity`+metricLabels, func() float64 {
ui.concurrencyLimitReached = ac.ms.NewCounter(`vmauth_unauthorized_user_concurrent_requests_limit_reached_total` + metricLabels)
_ = ac.ms.NewGauge(`vmauth_unauthorized_user_concurrent_requests_capacity`+metricLabels, func() float64 {
return float64(cap(ui.concurrencyLimitCh))
})
_ = metrics.GetOrCreateGauge(`vmauth_unauthorized_user_concurrent_requests_current`+metricLabels, func() float64 {
_ = ac.ms.NewGauge(`vmauth_unauthorized_user_concurrent_requests_current`+metricLabels, func() float64 {
return float64(len(ui.concurrencyLimitCh))
})
@ -559,7 +570,7 @@ func parseAuthConfig(data []byte) (*AuthConfig, error) {
}
ui.httpTransport = tr
}
return &ac, nil
return ac, nil
}
func parseAuthConfigUsers(ac *AuthConfig) (map[string]*UserInfo, error) {
@ -601,16 +612,16 @@ func parseAuthConfigUsers(ac *AuthConfig) (map[string]*UserInfo, error) {
if err != nil {
return nil, fmt.Errorf("cannot parse metric_labels: %w", err)
}
ui.requests = metrics.GetOrCreateCounter(`vmauth_user_requests_total` + metricLabels)
ui.backendErrors = metrics.GetOrCreateCounter(`vmauth_user_request_backend_errors_total` + metricLabels)
ui.requestsDuration = metrics.GetOrCreateSummary(`vmauth_user_request_duration_seconds` + metricLabels)
ui.requests = ac.ms.GetOrCreateCounter(`vmauth_user_requests_total` + metricLabels)
ui.backendErrors = ac.ms.GetOrCreateCounter(`vmauth_user_request_backend_errors_total` + metricLabels)
ui.requestsDuration = ac.ms.GetOrCreateSummary(`vmauth_user_request_duration_seconds` + metricLabels)
mcr := ui.getMaxConcurrentRequests()
ui.concurrencyLimitCh = make(chan struct{}, mcr)
ui.concurrencyLimitReached = metrics.GetOrCreateCounter(`vmauth_user_concurrent_requests_limit_reached_total` + metricLabels)
_ = metrics.GetOrCreateGauge(`vmauth_user_concurrent_requests_capacity`+metricLabels, func() float64 {
ui.concurrencyLimitReached = ac.ms.GetOrCreateCounter(`vmauth_user_concurrent_requests_limit_reached_total` + metricLabels)
_ = ac.ms.GetOrCreateGauge(`vmauth_user_concurrent_requests_capacity`+metricLabels, func() float64 {
return float64(cap(ui.concurrencyLimitCh))
})
_ = metrics.GetOrCreateGauge(`vmauth_user_concurrent_requests_current`+metricLabels, func() float64 {
_ = ac.ms.GetOrCreateGauge(`vmauth_user_concurrent_requests_current`+metricLabels, func() float64 {
return float64(len(ui.concurrencyLimitCh))
})

View file

@ -46,6 +46,8 @@ The sandbox cluster installation is running under the constant load generated by
* FEATURE: [vmalert](https://docs.victoriametrics.com/#vmalert): support [filtering](https://prometheus.io/docs/prometheus/2.49/querying/api/#rules) for `/api/v1/rules` API. See [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5749) by @victoramsantos.
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): support client-side TLS configuration for creating and deleting snapshots via `-snapshot.tls*` cmd-line flags. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5724). Thanks to @khushijain21 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5738).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth.html): properly release memory during config reload. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4690).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth.html): properly expose `vmauth_unauthorized_user_concurrent_requests_capacity`, `vmauth_unauthorized_user_concurrent_requests_current`, `vmauth_user_concurrent_requests_capacity` and `vmauth_user_concurrent_requests_current` [metrics](https://docs.victoriametrics.com/vmauth.html#monitoring) after [config reload](https://docs.victoriametrics.com/vmauth.html#config-reload). Previously these metrics didn't work after config reload.
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly propagate [label filters](https://docs.victoriametrics.com/keyconcepts/#filtering) from multiple arguments passed to [aggregate functions](https://docs.victoriametrics.com/metricsql/#aggregate-functions). For example, `sum({job="foo"}, {job="bar"}) by (job) + a` was improperly optimized to `sum({job="foo"}, {job="bar"}) by (job) + a{job="foo"}` before being executed. This could lead to unexpected results. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5604).
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle precision errors when calculating [changes](https://docs.victoriametrics.com/metricsql/#changes), [changes_prometheus](https://docs.victoriametrics.com/metricsql/#changes_prometheus), [increases_over_time](https://docs.victoriametrics.com/metricsql/#increases_over_time) and [resets](https://docs.victoriametrics.com/metricsql/#resets) functions. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/767).
* BUGFIX: all VictoriaMetrics components: consistently return 200 http status code from [`/-/reload` endpoint](https://docs.victoriametrics.com/vmagent/#configuration-update). Previously [single-node VictoriaMetrics](https://docs.victoriametrics.com/) was returning 204 http status code. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5774).