# PingDown: fires when a target's `up` series has been 0 for 2 minutes.
ALERT PingDown
  IF up == 0
  FOR 2m
  LABELS { severity="warning" }
  ANNOTATIONS {
    summary = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes"
  }
# PingCheck: fires when probe_success for job "PingCheck" has been 0 for 1 minute.
# NOTE(review): annotation keys are upper-case here but lower-case in PingDown;
# left as-is because notification templates may reference these exact names.
ALERT PingCheck
  IF probe_success{job="PingCheck"} == 0
  FOR 1m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{ $labels.instance }} down",
    DESCRIPTION = "{{ $labels.instance }}: Node has been down for more than 1 minutes"
  }
# CPUUsage: per-instance CPU usage, computed as 100 minus the average idle
# rate (as a percentage) over a 5m window, above 80 for 2 minutes.
# Original note on this rule: "Windows" (kept on its own line so the `#`
# comment does not swallow the rest of the rule).
ALERT CPUUsage
  IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 80
  FOR 2m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High CPU usage detected",
    DESCRIPTION = "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }})"
  }

# LoadAverage: node_load5 divided by the count of node_cpu{mode="system"}
# series (aggregated without cpu and mode) above 1 for 2 minutes.
# Original note on this rule: "Linux".
ALERT LoadAverage
  IF ((node_load5 / count without (cpu, mode) (node_cpu{mode="system"})) > 1)
  FOR 2m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High LoadAverage detected",
    DESCRIPTION = "{{$labels.instance}}: LoadAverage is high"
  }

# SwapUsage: used swap as a percentage of total swap above 75 for 2 minutes.
ALERT SwapUsage
  IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
  FOR 2m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Swap usage detected",
    DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
  }

# MemoryUsage: (MemTotal - MemFree - Cached) as a percentage of MemTotal
# above 75 for 2 minutes.
ALERT MemoryUsage
  IF (((node_memory_MemTotal-node_memory_MemFree-node_memory_Cached)/(node_memory_MemTotal)*100)) > 75
  FOR 2m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High memory usage detected",
    DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
  }

# LowRootDisk: used space on mountpoint "/" as a percentage of its size
# above 75 for 2 minutes.
ALERT LowRootDisk
  IF ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"} ) / node_filesystem_size{mountpoint="/"} * 100) > 75
  FOR 2m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Low root disk space",
    DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
  }

# HttpCheckDown: probe_success for job "HttpCheck" has been 0 for 3 minutes.
ALERT HttpCheckDown
  IF probe_success{job="HttpCheck"} == 0
  FOR 3m
  LABELS { severity="critical" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Http service is down",
    DESCRIPTION = "{{$labels.instance}}: Http request no response in 3 minutes"
  }

# DNSCheck: probe_success for job "DNSCheck" has been 0 for 1 minute.
ALERT DNSCheck
  IF probe_success{job="DNSCheck"} == 0
  FOR 1m
  LABELS { severity="warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: DNS service is down",
    DESCRIPTION = "{{$labels.instance}}: DNS resolution failed in 1 minutes"
  }
修改 Prometheus 服务项
增加 Alertmanager 模块（通过 -alertmanager.url 启动参数指定 Alertmanager 地址）
1 2
$ vi /usr/lib/systemd/system/prometheus.service
ExecStart=/var/prometheus/prometheus-1.7.1.linux-amd64/prometheus -config.file=/var/prometheus/prometheus-1.7.1.linux-amd64/prometheus.yml -storage.local.path=/var/lib/prometheus -alertmanager.url=http://prometheus:9093