From 3121d76bee9c5f8003ef52d998da00eacdb194a8 Mon Sep 17 00:00:00 2001 From: Nikolay Date: Tue, 14 Nov 2023 20:58:28 +0100 Subject: [PATCH] lib/querytracer: makes package concurrent safe to use (#5322) * lib/querytracer: makes package concurrent safe to use it must fix various issues with concurrent code usage. Especially, when it's not reasonable to wait for all goroutines to be finished * wip --------- Co-authored-by: Aliaksandr Valialkin --- docs/CHANGELOG.md | 1 + lib/querytracer/tracer.go | 46 +++++++++++++++++++++------------- lib/querytracer/tracer_test.go | 40 +++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index a3f96236c..3065e99b8 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -118,6 +118,7 @@ The sandbox cluster installation is running under the constant load generated by * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): correctly display query errors on [Explore Prometheus Metrics](https://docs.victoriametrics.com/#metrics-explorer) page. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5202) for details. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): properly handle trailing slash in the server URL. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5203). * BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): correctly print error in logs when copying backup fails. Previously, error was displayed in metrics but was missing in logs. +* BUGFIX: fix panic, which could occur when [query tracing](https://docs.victoriametrics.com/#query-tracing) is enabled. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5319). ## [v1.94.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.94.0) diff --git a/lib/querytracer/tracer.go b/lib/querytracer/tracer.go index 1d7e00e5c..8b4d228c9 100644 --- a/lib/querytracer/tracer.go +++ b/lib/querytracer/tracer.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "strings" + "sync/atomic" "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" @@ -22,6 +23,14 @@ var denyQueryTracing = flag.Bool("denyQueryTracing", false, "Whether to disable // Tracer may contain sub-tracers (branches) in order to build tree-like execution order. // Call Tracer.NewChild func for adding sub-tracer. type Tracer struct { + // isDone is set to true after Done* call. + // + // It is used for determining whether it is safe to print the trace. + // It is unsafe to print the trace when it isn't closed yet, since it may be modified + // by concurrently running goroutines. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5319 + isDone atomic.Bool + // startTime is the time when Tracer was created startTime time.Time // doneTime is the time when Done or Donef was called @@ -30,7 +39,7 @@ type Tracer struct { message string // children is a list of children Tracer objects children []*Tracer - // span contains span for the given Tracer. It is added via Tracer.AddSpan(). + // span contains span for the given Tracer. It is added via Tracer.AddJSON(). // If span is non-nil, then the remaining fields aren't used. span *span } @@ -68,7 +77,7 @@ func (t *Tracer) NewChild(format string, args ...interface{}) *Tracer { if t == nil { return nil } - if !t.doneTime.IsZero() { + if t.isDone.Load() { panic(fmt.Errorf("BUG: NewChild() cannot be called after Donef(%q) call", t.message)) } child := &Tracer{ @@ -87,10 +96,11 @@ func (t *Tracer) Done() { if t == nil { return } - if !t.doneTime.IsZero() { + if t.isDone.Load() { panic(fmt.Errorf("BUG: Donef(%q) already called", t.message)) } t.doneTime = time.Now() + t.isDone.Store(true) } // Donef appends the given fmt.Sprintf(format, args..) message to t and finished it. @@ -101,11 +111,12 @@ func (t *Tracer) Donef(format string, args ...interface{}) { if t == nil { return } - if !t.doneTime.IsZero() { + if t.isDone.Load() { panic(fmt.Errorf("BUG: Donef(%q) already called", t.message)) } t.message += ": " + fmt.Sprintf(format, args...) t.doneTime = time.Now() + t.isDone.Store(true) } // Printf adds new fmt.Sprintf(format, args...) message to t. @@ -115,7 +126,7 @@ func (t *Tracer) Printf(format string, args ...interface{}) { if t == nil { return } - if !t.doneTime.IsZero() { + if t.isDone.Load() { panic(fmt.Errorf("BUG: Printf() cannot be called after Done(%q) call", t.message)) } now := time.Now() @@ -124,6 +135,7 @@ func (t *Tracer) Printf(format string, args ...interface{}) { doneTime: now, message: fmt.Sprintf(format, args...), } + child.isDone.Store(true) t.children = append(t.children, child) } @@ -153,6 +165,9 @@ func (t *Tracer) AddJSON(jsonTrace []byte) error { // String returns string representation of t. // // String must be called when t methods aren't called by other goroutines. +// +// It is safe calling String() when child tracers aren't finished yet. +// In this case they will contain the corresponding message. func (t *Tracer) String() string { if t == nil { return "" @@ -166,6 +181,9 @@ func (t *Tracer) String() string { // ToJSON returns JSON representation of t. // // ToJSON must be called when t methods aren't called by other goroutines. + +// It is safe calling ToJSON() when child tracers aren't finished yet. +// In this case they will contain the corresponding message. func (t *Tracer) ToJSON() string { if t == nil { return "" @@ -187,6 +205,12 @@ func (t *Tracer) toSpanInternal(prevTime time.Time) (*span, time.Time) { if t.span != nil { return t.span, prevTime } + if !t.isDone.Load() { + s := &span{ + Message: fmt.Sprintf("missing Tracer.Done() call for the trace with message=%s", t.message), + } + return s, prevTime + } if t.doneTime == t.startTime { // a single-line trace d := t.startTime.Sub(prevTime) @@ -199,10 +223,6 @@ func (t *Tracer) toSpanInternal(prevTime time.Time) (*span, time.Time) { // tracer with children msg := t.message doneTime := t.doneTime - if doneTime.IsZero() { - msg += ": missing Tracer.Done() call" - doneTime = t.getLastChildDoneTime(t.startTime) - } d := doneTime.Sub(t.startTime) var children []*span var sChild *span @@ -219,14 +239,6 @@ func (t *Tracer) toSpanInternal(prevTime time.Time) (*span, time.Time) { return s, doneTime } -func (t *Tracer) getLastChildDoneTime(defaultTime time.Time) time.Time { - if len(t.children) == 0 { - return defaultTime - } - lastChild := t.children[len(t.children)-1] - return lastChild.getLastChildDoneTime(lastChild.startTime) -} - // span represents a single trace span type span struct { // DurationMsec is the duration for the current trace span in milliseconds. diff --git a/lib/querytracer/tracer_test.go b/lib/querytracer/tracer_test.go index a20fd6a14..d67336f91 100644 --- a/lib/querytracer/tracer_test.go +++ b/lib/querytracer/tracer_test.go @@ -1,7 +1,9 @@ package querytracer import ( + "fmt" "regexp" + "sync" "testing" ) @@ -137,11 +139,39 @@ func TestTraceMissingDonef(t *testing.T) { qtChild.Printf("child printf") qt.Printf("another parent printf") s := qt.String() - sExpected := `- 0ms: : parent: missing Tracer.Done() call -| - 0ms: parent printf -| - 0ms: child: missing Tracer.Done() call -| | - 0ms: child printf -| - 0ms: another parent printf + sExpected := `- 0.000ms: missing Tracer.Done() call for the trace with message=: parent +` + if !areEqualTracesSkipDuration(s, sExpected) { + t.Fatalf("unexpected trace\ngot\n%s\nwant\n%s", s, sExpected) + } +} + +func TestTraceConcurrent(t *testing.T) { + qt := New(true, "parent") + childLocal := qt.NewChild("local") + childLocal.Printf("abc") + childLocal.Done() + var wg sync.WaitGroup + for i := 0; i < 3; i++ { + child := qt.NewChild(fmt.Sprintf("child %d", i)) + wg.Add(1) + go func() { + for j := 0; j < 100; j++ { + child.Printf(fmt.Sprintf("message %d", j)) + } + wg.Done() + }() + } + qt.Done() + // Verify that it is safe to call qt.String() when child traces aren't done yet + s := qt.String() + wg.Wait() + sExpected := `- 0.008ms: : parent +| - 0.002ms: local +| | - 0.000ms: abc +| - 0.000ms: missing Tracer.Done() call for the trace with message=child 0 +| - 0.000ms: missing Tracer.Done() call for the trace with message=child 1 +| - 0.000ms: missing Tracer.Done() call for the trace with message=child 2 ` if !areEqualTracesSkipDuration(s, sExpected) { t.Fatalf("unexpected trace\ngot\n%s\nwant\n%s", s, sExpected)