From 2f28e945b8fd97f775e5d021b7b8f8189e344776 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 7 May 2020 15:20:06 +0300 Subject: [PATCH] lib/httpserver: add `-http.shutdownDelay` flag for a grace period before http server shutdown The http server returns 503 non-OK error at `/health` page during grace period, so load balancers in front of the http server could re-route incoming requests to other servers. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/463 --- lib/httpserver/httpserver.go | 60 ++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/lib/httpserver/httpserver.go b/lib/httpserver/httpserver.go index 7b9c4c4e5..6d7a9f397 100644 --- a/lib/httpserver/httpserver.go +++ b/lib/httpserver/httpserver.go @@ -14,6 +14,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" @@ -29,11 +30,20 @@ var ( disableResponseCompression = flag.Bool("http.disableResponseCompression", false, "Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth") maxGracefulShutdownDuration = flag.Duration("http.maxGracefulShutdownDuration", 7*time.Second, "The maximum duration for graceful shutdown of HTTP server. "+ "Highly loaded server may require increased value for graceful shutdown") + shutdownDelay = flag.Duration("http.shutdownDelay", 0, "Optional delay before http server shutdown. During this delay the server returns non-OK responses "+ + "from /health page, so load balancers can route new requests to other servers") +) - servers = make(map[string]*http.Server) +var ( + servers = make(map[string]*server) serversLock sync.Mutex ) +type server struct { + shutdownDelayDeadline int64 + s *http.Server +} + // RequestHandler must serve the given request r and write response to w. // // RequestHandler must return true if the request has been served (successfully or not). 
@@ -60,8 +70,9 @@ func Serve(addr string, rh RequestHandler) { } func serveWithListener(addr string, ln net.Listener, rh RequestHandler) { - s := &http.Server{ - Handler: gzipHandler(rh), + var s server + s.s = &http.Server{ + Handler: gzipHandler(&s, rh), // Disable http/2, since it doesn't give any advantages for VictoriaMetrics services. TLSNextProto: make(map[string]func(*http.Server, *tls.Conn, http.Handler)), @@ -75,9 +86,9 @@ func serveWithListener(addr string, ln net.Listener, rh RequestHandler) { ErrorLog: logger.StdErrorLogger(), } serversLock.Lock() - servers[addr] = s + servers[addr] = &s serversLock.Unlock() - if err := s.Serve(ln); err != nil { + if err := s.s.Serve(ln); err != nil { if err == http.ErrServerClosed { // The server gracefully closed. return @@ -96,19 +107,31 @@ func Stop(addr string) error { if s == nil { logger.Panicf("BUG: there is no http server at %q", addr) } - ctx, cancelFunc := context.WithTimeout(context.Background(), *maxGracefulShutdownDuration) - defer cancelFunc() - if err := s.Shutdown(ctx); err != nil { + + deadline := time.Now().Add(*shutdownDelay).UnixNano() + atomic.StoreInt64(&s.shutdownDelayDeadline, deadline) + if *shutdownDelay > 0 { + // Sleep for a while until load balancer in front of the server + // notices that "/health" endpoint returns non-OK responses. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/463 . 
+ logger.Infof("Waiting for %.3fs before shutdown of http server %q, so load balancers could re-route requests to other servers", shutdownDelay.Seconds(), addr) + time.Sleep(*shutdownDelay) + logger.Infof("Starting shutdown for http server %q", addr) + } + + ctx, cancel := context.WithTimeout(context.Background(), *maxGracefulShutdownDuration) + defer cancel() + if err := s.s.Shutdown(ctx); err != nil { return fmt.Errorf("cannot gracefully shutdown http server at %q in %.3fs; "+ "probably, `-http.maxGracefulShutdownDuration` command-line flag value must be increased; error: %s", addr, maxGracefulShutdownDuration.Seconds(), err) } return nil } -func gzipHandler(rh RequestHandler) http.HandlerFunc { +func gzipHandler(s *server, rh RequestHandler) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { w = maybeGzipResponseWriter(w, r) - handlerWrapper(w, r, rh) + handlerWrapper(s, w, r, rh) if zrw, ok := w.(*gzipResponseWriter); ok { if err := zrw.Close(); err != nil && !isTrivialNetworkError(err) { logger.Warnf("gzipResponseWriter.Close: %s", err) @@ -119,7 +142,7 @@ func gzipHandler(rh RequestHandler) http.HandlerFunc { var metricsHandlerDuration = metrics.NewHistogram(`vm_http_request_duration_seconds{path="/metrics"}`) -func handlerWrapper(w http.ResponseWriter, r *http.Request, rh RequestHandler) { +func handlerWrapper(s *server, w http.ResponseWriter, r *http.Request, rh RequestHandler) { requestsTotal.Inc() path, err := getCanonicalPath(r.URL.Path) if err != nil { @@ -131,7 +154,20 @@ func handlerWrapper(w http.ResponseWriter, r *http.Request, rh RequestHandler) { switch r.URL.Path { case "/health": w.Header().Set("Content-Type", "text/plain") - w.Write([]byte("OK")) + deadline := atomic.LoadInt64(&s.shutdownDelayDeadline) + if deadline <= 0 { + w.Write([]byte("OK")) + return + } + // Return non-OK response during grace period before shutting down the server. 
+ // Load balancers must notice these responses and re-route new requests to other servers. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/463 . + d := time.Until(time.Unix(0, deadline)) + if d < 0 { + d = 0 + } + errMsg := fmt.Sprintf("The server is in delayed shutdown mode, which will end in %.3fs", d.Seconds()) + http.Error(w, errMsg, http.StatusServiceUnavailable) return case "/ping": // This is needed for compatibility with Influx agents.