Merge branch 'public-single-node' into pmm-6401-read-prometheus-data-files

This commit is contained in:
Aliaksandr Valialkin 2021-01-11 13:04:58 +02:00
commit 9e10d5083e
11 changed files with 189 additions and 30 deletions

View file

@ -1279,6 +1279,7 @@ The most interesting metrics are:
VictoriaMetrics also exposes currently running queries with their execution times at `/api/v1/status/active_queries` page. VictoriaMetrics also exposes currently running queries with their execution times at `/api/v1/status/active_queries` page.
See the example of alerting rules for VM components [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml).
## Troubleshooting ## Troubleshooting

View file

@ -5,9 +5,9 @@ import (
"net" "net"
"strings" "strings"
"sync/atomic" "sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics" "github.com/VictoriaMetrics/metrics"
) )
@ -15,11 +15,10 @@ func statDial(network, addr string) (conn net.Conn, err error) {
if !strings.HasPrefix(network, "tcp") { if !strings.HasPrefix(network, "tcp") {
return nil, fmt.Errorf("unexpected network passed to statDial: %q; it must start from `tcp`", network) return nil, fmt.Errorf("unexpected network passed to statDial: %q; it must start from `tcp`", network)
} }
if netutil.TCP6Enabled() { if !netutil.TCP6Enabled() {
conn, err = fasthttp.DialDualStack(addr) network = "tcp4"
} else {
conn, err = fasthttp.Dial(addr)
} }
conn, err = net.DialTimeout(network, addr, 5*time.Second)
dialsTotal.Inc() dialsTotal.Inc()
if err != nil { if err != nil {
dialErrors.Inc() dialErrors.Inc()

View file

@ -17,6 +17,7 @@ groups:
(up == 1) (up == 1)
labels: labels:
job: '{{ $labels.job }}' job: '{{ $labels.job }}'
dynamic: '{{ $x := query "up" | first | value }}{{ if eq 1.0 $x }}one{{ else }}unknown{{ end }}'
annotations: annotations:
description: Job {{ $labels.job }} is up! description: Job {{ $labels.job }} is up!
summary: All instances up {{ range query "up" }} summary: All instances up {{ range query "up" }}

View file

@ -178,7 +178,9 @@ func InitTemplateFunc(externalURL *url.URL) {
// it is present here only for validation purposes, when there is no // it is present here only for validation purposes, when there is no
// provided datasource. // provided datasource.
"query": func(q string) ([]datasource.Metric, error) { "query": func(q string) ([]datasource.Metric, error) {
return nil, nil // return non-empty slice to pass validation with chained functions in template
// see issue #989 for details
return []datasource.Metric{{}}, nil
}, },
"first": func(metrics []datasource.Metric) (datasource.Metric, error) { "first": func(metrics []datasource.Metric) (datasource.Metric, error) {
if len(metrics) > 0 { if len(metrics) > 0 {

View file

@ -1,23 +1,174 @@
# File contains default list of alerts for vm-single and vmagent services.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups: groups:
- name: groupGorSingleAlert - name: serviceHealth
rules: rules:
- alert: VMRows # note the `job` filter and update accordingly to your setup
for: 10s - alert: TooManyRestarts
expr: vm_rows > 0 expr: changes(process_start_time_seconds{job=~"victoriametrics|vmagent|vmalert"}[15m]) > 2
labels: labels:
label: bar severity: critical
host: "{{ $labels.instance }}"
annotations: annotations:
summary: "{{ $value|humanize }}" summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
description: "{{$labels}}" description: "Job {{ $labels.job }} has restarted more than twice in the last 15 minutes.
- name: TestGroup It might be crashlooping."
# Alerts group for VM single assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/10229 is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmsingle
interval: 30s
concurrency: 2
rules: rules:
- alert: Conns - alert: DiskRunsOutOfSpaceIn3Days
expr: sum(vm_tcplistener_conns) by(instance) > 1 expr: |
for: 5s vm_free_disk_space_bytes / ignoring(path) (
(
sum(rate(vm_rows_added_to_storage_total[1d])) -
sum(rate(vm_deduplicated_samples_total[1d])) without(type)
)
*
(
sum(vm_data_size_bytes{type!="indexdb"}) /
sum(vm_rows{type!="indexdb"})
)
) < 3 * 24 * 3600
for: 30m
labels:
severity: critical
annotations: annotations:
summary: "Too high connection number for {{$labels.instance}}" dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=73&var-instance={{ $labels.instance }}"
description: "It is {{ $value }} connections for {{$labels.instance}}" summary: "Instance {{ $labels.instance }} will run out of disk space soon"
- alert: ExampleAlertAlwaysFiring description: "Taking into account current ingestion rate, free disk space will be enough only
expr: sum by(job) for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
(up == 1) Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
- alert: RequestErrorsToAPI
expr: increase(vm_http_request_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}"
summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"
description: "Requests to path {{ $labels.path }} are receiving errors.
Please verify if clients are sending correct requests."
- alert: ConcurrentFlushesHitTheLimit
expr: vm_concurrent_addrows_current >= vm_concurrent_addrows_capacity
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=59&var-instance={{ $labels.instance }}"
summary: "VictoriMetrics on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"
description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n
When VictoriaMetrics constantly hits the limit it means that storage is overloaded and requires more CPU."
- alert: TooManyLogs
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=67&var-instance={{ $labels.instance }}"
summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
Worth to check logs for specific error messages."
- alert: RowsRejectedOnIngestion
expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=58&var-instance={{ $labels.instance }}"
summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt"
description: "VM is rejecting to ingest rows on \"{{ $labels.instance }}\" due to the
following reason: \"{{ $labels.reason }}\""
- alert: TooHighChurnRate
expr: |
(
sum(rate(vm_new_timeseries_created_total[5m])) by(instance)
/
sum(rate(vm_rows_inserted_total[5m])) by (instance)
) > 0.1
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
summary: "Churn rate is more than 10% on \"{{ $labels.instance }}\" for the last 15m"
description: "VM constantly creates new time series on \"{{ $labels.instance }}\".\n
This effect is known as Churn Rate.\n
High Churn Rate tightly connected with database performance and may
result in unexpected OOM's or slow queries."
- alert: TooHighSlowInsertsRate
expr: |
(
sum(rate(vm_slow_row_inserts_total[5m])) by(instance)
/
sum(rate(vm_rows_inserted_total[5m])) by (instance)
) > 0.5
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}"
summary: "Percentage of slow inserts is more than 50% on \"{{ $labels.instance }}\" for the last 15m"
description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."
# Alerts group for vmagent assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/12683 is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmagent
interval: 30s
concurrency: 2
rules:
- alert: PersistentQueueIsDroppingData
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
for: 10m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
on instance {{ $labels.instance }} for the last 10m."
- alert: TooManyScrapeErrors
expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
- alert: TooManyWriteErrors
expr: |
(sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
+
sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
- alert: TooManyRemoteWriteErrors
expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
Ensure that destination is up and reachable."

View file

@ -2,6 +2,8 @@
# tip # tip
* BUGFIX: vmagent: prevent from `dialing to the given TCP address time out` error when scraping big number of unavailable targets. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/987
* FEATURE: disable final merge for data for the previous month at the beginning of new month, since it may result in high disk IO and CPU usage. Final merge can be enabled by setting `-finalMergeDelay` command-line flag to positive duration. * FEATURE: disable final merge for data for the previous month at the beginning of new month, since it may result in high disk IO and CPU usage. Final merge can be enabled by setting `-finalMergeDelay` command-line flag to positive duration.

2
go.mod
View file

@ -7,7 +7,7 @@ require (
// Do not use the original github.com/valyala/fasthttp because of issues // Do not use the original github.com/valyala/fasthttp because of issues
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b // like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
github.com/VictoriaMetrics/fasthttp v1.0.9 github.com/VictoriaMetrics/fasthttp v1.0.11
github.com/VictoriaMetrics/metrics v1.12.3 github.com/VictoriaMetrics/metrics v1.12.3
github.com/VictoriaMetrics/metricsql v0.9.1 github.com/VictoriaMetrics/metricsql v0.9.1
github.com/aws/aws-sdk-go v1.36.23 github.com/aws/aws-sdk-go v1.36.23

4
go.sum
View file

@ -81,8 +81,8 @@ github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWX
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
github.com/VictoriaMetrics/fastcache v1.5.7 h1:4y6y0G8PRzszQUYIQHHssv/jgPHAb5qQuuDNdCbyAgw= github.com/VictoriaMetrics/fastcache v1.5.7 h1:4y6y0G8PRzszQUYIQHHssv/jgPHAb5qQuuDNdCbyAgw=
github.com/VictoriaMetrics/fastcache v1.5.7/go.mod h1:ptDBkNMQI4RtmVo8VS/XwRY6RoTu1dAWCbrk+6WsEM8= github.com/VictoriaMetrics/fastcache v1.5.7/go.mod h1:ptDBkNMQI4RtmVo8VS/XwRY6RoTu1dAWCbrk+6WsEM8=
github.com/VictoriaMetrics/fasthttp v1.0.9 h1:Fja1tfcNMNoUD7RJDYpjGx2CsSfXkUbISKY4kNafdN4= github.com/VictoriaMetrics/fasthttp v1.0.11 h1:6XOvE1pF/EhW8qoi7V5qJQJ2rhNV+UGrb1/a9vMbTiw=
github.com/VictoriaMetrics/fasthttp v1.0.9/go.mod h1:3SeUL4zwB/p/a9aEeRc6gdlbrtNHXBJR6N376EgiSHU= github.com/VictoriaMetrics/fasthttp v1.0.11/go.mod h1:3SeUL4zwB/p/a9aEeRc6gdlbrtNHXBJR6N376EgiSHU=
github.com/VictoriaMetrics/metrics v1.12.2/go.mod h1:Z1tSfPfngDn12bTfZSCqArT3OPY3u88J12hSoOhuiRE= github.com/VictoriaMetrics/metrics v1.12.2/go.mod h1:Z1tSfPfngDn12bTfZSCqArT3OPY3u88J12hSoOhuiRE=
github.com/VictoriaMetrics/metrics v1.12.3 h1:Fe6JHC6MSEKa+BtLhPN8WIvS+HKPzMc2evEpNeCGy7I= github.com/VictoriaMetrics/metrics v1.12.3 h1:Fe6JHC6MSEKa+BtLhPN8WIvS+HKPzMc2evEpNeCGy7I=
github.com/VictoriaMetrics/metrics v1.12.3/go.mod h1:Z1tSfPfngDn12bTfZSCqArT3OPY3u88J12hSoOhuiRE= github.com/VictoriaMetrics/metrics v1.12.3/go.mod h1:Z1tSfPfngDn12bTfZSCqArT3OPY3u88J12hSoOhuiRE=

View file

@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"net" "net"
"net/url" "net/url"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/fasthttp" "github.com/VictoriaMetrics/fasthttp"
@ -80,10 +81,12 @@ func (u *URL) NewDialFunc(tlsConfig *tls.Config) (fasthttp.DialFunc, error) {
} }
func defaultDialFunc(addr string) (net.Conn, error) { func defaultDialFunc(addr string) (net.Conn, error) {
network := "tcp4"
if netutil.TCP6Enabled() { if netutil.TCP6Enabled() {
return fasthttp.DialDualStack(addr) network = "tcp"
} }
return fasthttp.Dial(addr) // Do not use fasthttp.Dial because of https://github.com/VictoriaMetrics/VictoriaMetrics/issues/987
return net.DialTimeout(network, addr, 5*time.Second)
} }
// sendConnectRequest sends CONNECT request to proxyConn for the given addr and authHeader and returns the established connection to dstAddr. // sendConnectRequest sends CONNECT request to proxyConn for the given addr and authHeader and returns the established connection to dstAddr.

2
vendor/modules.txt vendored
View file

@ -10,7 +10,7 @@ cloud.google.com/go/internal/version
cloud.google.com/go/storage cloud.google.com/go/storage
# github.com/VictoriaMetrics/fastcache v1.5.7 # github.com/VictoriaMetrics/fastcache v1.5.7
github.com/VictoriaMetrics/fastcache github.com/VictoriaMetrics/fastcache
# github.com/VictoriaMetrics/fasthttp v1.0.9 # github.com/VictoriaMetrics/fasthttp v1.0.11
github.com/VictoriaMetrics/fasthttp github.com/VictoriaMetrics/fasthttp
github.com/VictoriaMetrics/fasthttp/fasthttputil github.com/VictoriaMetrics/fasthttp/fasthttputil
github.com/VictoriaMetrics/fasthttp/stackless github.com/VictoriaMetrics/fasthttp/stackless