mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
f392913d00
- Document the bugfix at docs/CHANGELOG.md - Wait until all the worker goroutines are done in consulWatcher.mustStop() - Do not log `context canceled` errors when discovering consul serviceNames - Removed explicit handling of gzipped responses at lib/promscrape/discoveryutils.Client, since this handling is automatically performed by net/http.Transport. See DisableCompression option at https://pkg.go.dev/net/http#Transport . - Remove explicit handling of the proxyURL, since it is automatically handled by net/http.Transport. See Proxy option at https://pkg.go.dev/net/http#Transport . - Explicitly set MaxIdleConnsPerHost, since its default value equals to 2. Such a small value may result in excess tcp connection churn when more than 2 concurrent requests are processed by lib/promscrape/discoveryutils.Client. - Do not set explicitly the `Host` request header, since it is automatically set by net/http.Client. - Backport the bugfix to the recently added nomad_sd_configs - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3367 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3468
320 lines
8.6 KiB
Go
320 lines
8.6 KiB
Go
package nomad
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils"
|
|
"github.com/VictoriaMetrics/metrics"
|
|
)
|
|
|
|
// SDCheckInterval is the `-promscrape.nomadSDCheckInterval` command-line flag.
//
// It holds the interval for checking for changes in Nomad and defaults to 30 seconds.
var SDCheckInterval = flag.Duration("promscrape.nomadSDCheckInterval", 30*time.Second, "Interval for checking for changes in Nomad. "+
	"This works only if nomad_sd_configs is configured in '-promscrape.config' file. "+
	"See https://docs.victoriametrics.com/sd_configs.html#nomad_sd_configs for details")
|
|
|
|
// nomadWatcher is a watcher for nomad api, updates services map in background with long-polling.
|
|
type nomadWatcher struct {
|
|
client *discoveryutils.Client
|
|
|
|
serviceNamesQueryArgs string
|
|
watchServices []string
|
|
watchTags []string
|
|
|
|
// servicesLock protects services
|
|
servicesLock sync.Mutex
|
|
services map[string]*serviceWatcher
|
|
|
|
servicesWG sync.WaitGroup
|
|
wg sync.WaitGroup
|
|
stopCh chan struct{}
|
|
}
|
|
|
|
type serviceWatcher struct {
|
|
serviceName string
|
|
services []Service
|
|
stopCh chan struct{}
|
|
}
|
|
|
|
// newNomadWatcher creates new watcher and starts background service discovery for Nomad.
|
|
func newNomadWatcher(client *discoveryutils.Client, sdc *SDConfig, datacenter, namespace string) *nomadWatcher {
|
|
baseQueryArgs := "?dc=" + url.QueryEscape(datacenter)
|
|
if sdc.AllowStale == nil || *sdc.AllowStale {
|
|
baseQueryArgs += "&stale"
|
|
}
|
|
if namespace != "" {
|
|
baseQueryArgs += "&namespace=" + url.QueryEscape(namespace)
|
|
}
|
|
serviceNodesQueryArgs := baseQueryArgs
|
|
|
|
cw := &nomadWatcher{
|
|
client: client,
|
|
serviceNamesQueryArgs: serviceNodesQueryArgs,
|
|
watchServices: sdc.Services,
|
|
watchTags: sdc.Tags,
|
|
services: make(map[string]*serviceWatcher),
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
initCh := make(chan struct{})
|
|
cw.wg.Add(1)
|
|
go func() {
|
|
cw.watchForServicesUpdates(initCh)
|
|
cw.wg.Done()
|
|
}()
|
|
// wait for initialization to complete
|
|
<-initCh
|
|
return cw
|
|
}
|
|
|
|
func (cw *nomadWatcher) mustStop() {
|
|
close(cw.stopCh)
|
|
cw.client.Stop()
|
|
cw.wg.Wait()
|
|
}
|
|
|
|
func (cw *nomadWatcher) updateServices(serviceNames []string) {
|
|
var initWG sync.WaitGroup
|
|
// Start watchers for new services.
|
|
cw.servicesLock.Lock()
|
|
for _, serviceName := range serviceNames {
|
|
if _, ok := cw.services[serviceName]; ok {
|
|
// The watcher for serviceName already exists.
|
|
continue
|
|
}
|
|
sw := &serviceWatcher{
|
|
serviceName: serviceName,
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
cw.services[serviceName] = sw
|
|
cw.servicesWG.Add(1)
|
|
serviceWatchersCreated.Inc()
|
|
initWG.Add(1)
|
|
go func() {
|
|
serviceWatchersCount.Inc()
|
|
sw.watchForServiceAddressUpdates(cw, &initWG)
|
|
serviceWatchersCount.Dec()
|
|
cw.servicesWG.Done()
|
|
}()
|
|
}
|
|
|
|
// Stop watchers for removed services.
|
|
newServiceNamesMap := make(map[string]struct{}, len(serviceNames))
|
|
for _, serviceName := range serviceNames {
|
|
newServiceNamesMap[serviceName] = struct{}{}
|
|
}
|
|
for serviceName, sw := range cw.services {
|
|
if _, ok := newServiceNamesMap[serviceName]; ok {
|
|
continue
|
|
}
|
|
close(sw.stopCh)
|
|
delete(cw.services, serviceName)
|
|
serviceWatchersStopped.Inc()
|
|
|
|
// Do not wait for the watcher goroutine to exit, since this may take for up to maxWaitTime
|
|
// if it is blocked in Nomad API request.
|
|
}
|
|
cw.servicesLock.Unlock()
|
|
|
|
// Wait for initialization to complete.
|
|
initWG.Wait()
|
|
}
|
|
|
|
// watchForServicesUpdates watches for new services and updates it in cw.
|
|
//
|
|
// watchForServicesUpdates closes the initCh once the initialization is complete and first discovery iteration is done.
|
|
func (cw *nomadWatcher) watchForServicesUpdates(initCh chan struct{}) {
|
|
index := int64(0)
|
|
apiServer := cw.client.APIServer()
|
|
f := func() {
|
|
serviceNames, newIndex, err := cw.getBlockingServiceNames(index)
|
|
if err != nil {
|
|
if !errors.Is(err, context.Canceled) {
|
|
logger.Errorf("cannot obtain Nomad serviceNames from %q: %s", apiServer, err)
|
|
}
|
|
return
|
|
}
|
|
if index == newIndex {
|
|
// Nothing changed.
|
|
return
|
|
}
|
|
cw.updateServices(serviceNames)
|
|
index = newIndex
|
|
}
|
|
|
|
logger.Infof("started Nomad service watcher for %q", apiServer)
|
|
f()
|
|
|
|
// send signal that initialization is complete
|
|
close(initCh)
|
|
|
|
checkInterval := getCheckInterval()
|
|
ticker := time.NewTicker(checkInterval / 2)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
f()
|
|
case <-cw.stopCh:
|
|
logger.Infof("stopping Nomad service watchers for %q", apiServer)
|
|
startTime := time.Now()
|
|
cw.servicesLock.Lock()
|
|
for _, sw := range cw.services {
|
|
close(sw.stopCh)
|
|
}
|
|
cw.servicesLock.Unlock()
|
|
cw.servicesWG.Wait()
|
|
logger.Infof("stopped Nomad service watcher for %q in %.3f seconds", apiServer, time.Since(startTime).Seconds())
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
var (
|
|
serviceWatchersCreated = metrics.NewCounter("vm_promscrape_discovery_nomad_service_watchers_created_total")
|
|
serviceWatchersStopped = metrics.NewCounter("vm_promscrape_discovery_nomad_service_watchers_stopped_total")
|
|
serviceWatchersCount = metrics.NewCounter("vm_promscrape_discovery_nomad_service_watchers")
|
|
)
|
|
|
|
// getBlockingServiceNames obtains service names via blocking request to Nomad.
|
|
//
|
|
// It returns an empty serviceNames list if response contains the same index.
|
|
func (cw *nomadWatcher) getBlockingServiceNames(index int64) ([]string, int64, error) {
|
|
path := "/v1/services" + cw.serviceNamesQueryArgs
|
|
data, newIndex, err := getBlockingAPIResponse(cw.client, path, index)
|
|
if err != nil {
|
|
return nil, index, err
|
|
}
|
|
if index == newIndex {
|
|
// Nothing changed - return an empty serviceNames list.
|
|
return nil, index, nil
|
|
}
|
|
|
|
var svcs []ServiceList
|
|
if err := json.Unmarshal(data, &svcs); err != nil {
|
|
return nil, index, fmt.Errorf("cannot parse response from %q: %w; data=%q", path, err, data)
|
|
}
|
|
|
|
serviceNames := make([]string, 0, len(svcs))
|
|
for _, svc := range svcs {
|
|
for _, s := range svc.Services {
|
|
if !shouldCollectServiceByName(cw.watchServices, s.ServiceName) {
|
|
continue
|
|
}
|
|
if !shouldCollectServiceByTags(cw.watchTags, s.Tags) {
|
|
continue
|
|
}
|
|
serviceNames = append(serviceNames, s.ServiceName)
|
|
}
|
|
}
|
|
|
|
return serviceNames, newIndex, nil
|
|
}
|
|
|
|
// getServiceSnapshot returns a snapshot of discovered Services.
|
|
func (cw *nomadWatcher) getServiceSnapshot() map[string][]Service {
|
|
cw.servicesLock.Lock()
|
|
sns := make(map[string][]Service, len(cw.services))
|
|
for svc, sw := range cw.services {
|
|
sns[svc] = sw.services
|
|
}
|
|
cw.servicesLock.Unlock()
|
|
return sns
|
|
}
|
|
|
|
// watchForServiceNodesUpdates watches for Nomad serviceNode changes for the given serviceName.
|
|
//
|
|
// watchForServiceNodesUpdates calls initWG.Done() once the initialization is complete and the first discovery iteration is done.
|
|
func (sw *serviceWatcher) watchForServiceAddressUpdates(nw *nomadWatcher, initWG *sync.WaitGroup) {
|
|
apiServer := nw.client.APIServer()
|
|
index := int64(0)
|
|
// TODO: Maybe use a different query arg.
|
|
path := "/v1/service/" + sw.serviceName + nw.serviceNamesQueryArgs
|
|
f := func() {
|
|
data, newIndex, err := getBlockingAPIResponse(nw.client, path, index)
|
|
if err != nil {
|
|
if !errors.Is(err, context.Canceled) {
|
|
logger.Errorf("cannot obtain Nomad services for serviceName=%q from %q: %s", sw.serviceName, apiServer, err)
|
|
}
|
|
return
|
|
}
|
|
if index == newIndex {
|
|
// Nothing changed.
|
|
return
|
|
}
|
|
sns, err := parseServices(data)
|
|
if err != nil {
|
|
logger.Errorf("cannot parse Nomad services response for serviceName=%q from %q: %s", sw.serviceName, apiServer, err)
|
|
return
|
|
}
|
|
|
|
nw.servicesLock.Lock()
|
|
sw.services = sns
|
|
nw.servicesLock.Unlock()
|
|
|
|
index = newIndex
|
|
}
|
|
|
|
f()
|
|
// Notify caller that initialization is complete
|
|
initWG.Done()
|
|
|
|
checkInterval := getCheckInterval()
|
|
ticker := time.NewTicker(checkInterval / 2)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
f()
|
|
case <-sw.stopCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// shouldCollectServiceByName returns true if serviceName matches any entry in filterServices.
//
// Every serviceName is accepted if filterServices is empty.
func shouldCollectServiceByName(filterServices []string, serviceName string) bool {
	for _, wanted := range filterServices {
		// Use case-insensitive comparison for service names according to https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1422
		if strings.EqualFold(wanted, serviceName) {
			return true
		}
	}
	// Nothing matched, so the service is accepted only if there is no filter at all.
	return len(filterServices) == 0
}
|
|
|
|
// shouldCollectServiceByTags returns true if tags contain every tag from filterTags.
//
// Tags are compared in a case-sensitive manner. Every set of tags is accepted if filterTags is empty.
func shouldCollectServiceByTags(filterTags, tags []string) bool {
nextFilterTag:
	for _, required := range filterTags {
		for _, tag := range tags {
			if tag == required {
				continue nextFilterTag
			}
		}
		// The required tag is missing.
		return false
	}
	return true
}
|
|
|
|
func getCheckInterval() time.Duration {
|
|
d := *SDCheckInterval
|
|
if d <= time.Second {
|
|
return time.Second
|
|
}
|
|
return d
|
|
}
|