# Default health alerts for VM components.
# Treat these rules as recommendations: thresholds and selectors may need
# updates and calibration for every specific setup.
groups:
  - name: vm-health
    # The first two rules select targets with a `job` regex; adjust it so it
    # matches the job names used in your setup.
    rules:
      # Start time of a listed job changed more than twice within 15 minutes.
      - alert: TooManyRestarts
        expr: >-
          changes(process_start_time_seconds{job=~"victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
        labels:
          severity: critical
        annotations:
          summary: '{{ $labels.job }} too many restarts (instance {{ $labels.instance }})'
          description: >-
            Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted
            more than twice in the last 15 minutes. It might be crashlooping.

      # `up` of a listed job has been 0 for 2 minutes.
      - alert: ServiceDown
        expr: >-
          up{job=~"victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down on {{ $labels.instance }}'
          description: >-
            {{ $labels.instance }} of job {{ $labels.job }} has been down for
            more than 2 minutes.

      # Fewer than 100 file descriptors left (max minus open) for 5 minutes.
      - alert: ProcessNearFDLimits
        expr: >-
          (process_max_fds - process_open_fds) < 100
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Number of free file descriptors is less than 100 for "{{ $labels.job }}"("{{ $labels.instance }}") for the last 5m'
          description: >-
            Exhausting OS file descriptors limit can cause severe degradation of
            the process. Consider to increase the limit as fast as possible.

      # Anonymous resident memory exceeds 80% of available memory for 5 minutes.
      - alert: TooHighMemoryUsage
        expr: >-
          (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.8
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'It is more than 80% of memory used by "{{ $labels.job }}"("{{ $labels.instance }}") during the last 5m'
          description: >-
            Too high memory usage may result into multiple issues such as OOMs or
            degraded performance. Consider to either increase available memory or
            decrease the load on the process.

      # CPU seconds rate over 5m, divided by available cores, stays above 0.9
      # for 5 minutes.
      - alert: TooHighCPUUsage
        expr: >-
          rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'More than 90% of CPU is used by "{{ $labels.job }}"("{{ $labels.instance }}") during the last 5m'
          description: >-
            Too high CPU usage may be a sign of insufficient resources and make
            process unstable. Consider to either increase available CPU resources
            or decrease the load on the process.

      # Error-level log messages kept appearing (per job and instance) in every
      # 5m window for 15 minutes.
      # NOTE(review): {{ $value }} is the 5m increase of error-level messages,
      # while the description words it as a logging rate "for last 15m" -
      # consider rewording; text intentionally left unchanged here.
      - alert: TooManyLogs
        expr: >-
          sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Too many logs printed for job "{{ $labels.job }}" ({{ $labels.instance }})'
          # Kept double-quoted: the value relies on the `\n` escape sequence.
          description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n Worth to check logs for specific error messages."