app/vmselect: properly cancel long running requests on client connection close

At this time `bufferedwriter` [silently ignores connection close
errors](78eaa056c0/lib/bufferedwriter/bufferedwriter.go (L67)).
It may be very convenient in some situations (to not log such
unimportant errors), but it's too implicit and unsafe for the others.
For example, if you close [export
API](https://docs.victoriametrics.com/#how-to-export-time-series) client
connection in the middle of communication, VictoriaMetrics won't notice
it and will start to hog CPU by exporting all the data into nowhere
until it process all of them. If you'll make a few retries, it will be
effectively a DoS on the server.

This commit replaces this implicit error suppressing with explicit error
handling which fixes the issue with export API.

Issue was introduced at e78f3ac8ac
This commit is contained in:
Dmitry Konishchev 2025-01-29 18:09:32 +03:00 committed by GitHub
parent 34204cc597
commit 17fd53bd37
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 40 additions and 17 deletions
app/vmselect/prometheus
docs/changelog
lib/bufferedwriter

View file

@ -29,6 +29,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/querytracer"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
@ -142,10 +143,13 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
WriteFederate(bb, rs)
return sw.maybeFlushBuffer(bb)
})
if err != nil {
if err == nil {
err = sw.flush()
}
if err != nil && !netutil.IsTrivialNetworkError(err) {
return fmt.Errorf("error during sending data to remote client: %w", err)
}
return sw.flush()
return nil
}
var federateDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/federate"}`)
@ -226,10 +230,13 @@ func ExportCSVHandler(startTime time.Time, w http.ResponseWriter, r *http.Reques
}()
}
err = <-doneCh
if err != nil {
if err == nil {
err = sw.flush()
}
if err != nil && !netutil.IsTrivialNetworkError(err) {
return fmt.Errorf("error during sending the exported csv data to remote client: %w", err)
}
return sw.flush()
return nil
}
var exportCSVDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/export/csv"}`)
@ -281,10 +288,13 @@ func ExportNativeHandler(startTime time.Time, w http.ResponseWriter, r *http.Req
bb.B = dst
return sw.maybeFlushBuffer(bb)
})
if err != nil {
if err == nil {
err = sw.flush()
}
if err != nil && !netutil.IsTrivialNetworkError(err) {
return fmt.Errorf("error during sending native data to remote client: %w", err)
}
return sw.flush()
return nil
}
var exportNativeDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/export/native"}`)
@ -441,16 +451,19 @@ func exportHandler(qt *querytracer.Tracer, w http.ResponseWriter, cp *commonPara
}()
}
err := <-doneCh
if err != nil {
if err == nil {
err = sw.flush()
}
if err == nil {
if format == "promapi" {
WriteExportPromAPIFooter(bw, qt)
}
err = bw.Flush()
}
if err != nil && !netutil.IsTrivialNetworkError(err) {
return fmt.Errorf("cannot send data to remote client: %w", err)
}
if err := sw.flush(); err != nil {
return fmt.Errorf("cannot send data to remote client: %w", err)
}
if format == "promapi" {
WriteExportPromAPIFooter(bw, qt)
}
return bw.Flush()
return nil
}
type exportBlock struct {

View file

@ -51,7 +51,7 @@ Released at 2025-01-24
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/metricsql/): allow executing queries with `$__interval` and `$__rate_interval` - these placeholders are automatically replaced with `1i` (e.g. `step` arg value at [`/api/v1/query_range`](https://docs.victoriametrics.com/keyconcepts/#range-query)) during query execution. This simplifies copying queries from Grafana dashboards.
* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): add command-line flag `-search.maxDeleteDuration(default 5m)` to limit the duration of the `/api/v1/admin/tsdb/delete_series` call. Previously, the call is limited by `-search.maxQueryDuration`.
* FEATURE: [dashboards](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards): all dashboards that use [VictoriaMetrics Grafana datasource](https://github.com/VictoriaMetrics/victoriametrics-datasource) were updated to use a [new datasource ID](https://github.com/VictoriaMetrics/victoriametrics-datasource/releases/tag/v0.12.0).
* FEATURE: [dashboards](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards): all dashboards that use [VictoriaMetrics Grafana datasource](https://github.com/VictoriaMetrics/victoriametrics-datasource) were updated to use a [new datasource ID](https://github.com/VictoriaMetrics/victoriametrics-datasource/releases/tag/v0.12.0).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): reflect column settings for the table view in URL, so the table view can be shared via link. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7662).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): fix possible runtime panic during requests processing under heavy load. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8051) for details.
@ -61,6 +61,7 @@ Released at 2025-01-24
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and [vmselect](https://docs.victoriametrics.com/cluster-victoriametrics/): respect staleness detection in increase, increase_pure and delta functions when time series has gaps and `-search.maxStalenessInterval` is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8072) for details.
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/): allow ingesting histograms with missing `_sum` metric via [OpenTelemetry ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) in the same way as Prometheus does.
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix an issue where pressing the "Enter" key in the query editor did not execute the query. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8058).
* BUGFIX: [export API](https://docs.victoriametrics.com/#how-to-export-time-series): cancel export process on client connection close. Previously client connection close was ignored and VictoriaMetrics started to hog CPU by exporting metrics to nowhere until it export all of them.
## [v1.102.11](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.11)

View file

@ -64,21 +64,30 @@ func (bw *Writer) Write(p []byte) (int, error) {
return 0, bw.err
}
n, err := bw.bw.Write(p)
if err != nil && !netutil.IsTrivialNetworkError(err) {
if err != nil {
bw.err = fmt.Errorf("cannot send %d bytes to client: %w", len(p), err)
}
return n, bw.err
}
// Flush flushes bw to the underlying writer.
//
// Connection close errors are ignored to not trigger on them and to not write to logs, but Write method doesn't ignore
// them since it may lead to an unexpected behaviour (see https://github.com/VictoriaMetrics/VictoriaMetrics/pull/8157)
func (bw *Writer) Flush() error {
bw.lock.Lock()
defer bw.lock.Unlock()
if bw.err != nil {
if netutil.IsTrivialNetworkError(bw.err) {
return nil
}
return bw.err
}
if err := bw.bw.Flush(); err != nil && !netutil.IsTrivialNetworkError(err) {
if err := bw.bw.Flush(); err != nil {
bw.err = fmt.Errorf("cannot flush data to client: %w", err)
if netutil.IsTrivialNetworkError(err) {
return nil
}
}
return bw.err
}