grafana:
  adminPassword: {{ .StateValues.grafana.ADMIN_PASSWORD }}
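
# The .StateValues lookups in this file assume it is rendered by helmfile as a
# *.yaml.gotmpl values template. A minimal sketch of matching state values in
# helmfile.yaml (names are illustrative; real secrets would come from an
# encrypted source rather than plain text):
#
#   environments:
#     default:
#       values:
#         - grafana:
#             ADMIN_PASSWORD: "change-me"
#           discord:
#             WEBHOOK_URL: "https://discord.com/api/webhooks/..."
#             USERNAME: "alertmanager"
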
alertmanager:
  config:
    global:
      resolve_timeout: 1m
    route:
      receiver: 'discord'
      group_by: ['alertname', 'cluster', 'job', 'env']
      group_wait: 15s
      group_interval: 15s
      repeat_interval: 1m
      routes:
        - match: # port not exposed?
            alertname: KubeProxyDown
          receiver: "null"
        - match: # port not exposed?
            alertname: KubeSchedulerDown
          receiver: "null"
        - match: # port not exposed?
            alertname: KubeControllerManagerDown
          receiver: "null"
        - match: # dummy alert
            alertname: Watchdog
          receiver: "null"
        # Discord sometimes has delivery problems when the message volume is high;
        # better to mute this alert temporarily via a silence in Grafana.
        #- match:
        #    alertname: AlertmanagerFailedToSendAlerts
        #  receiver: "null"
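
        # Alternative to null-routing the control-plane alerts above: depending on
        # the kube-prometheus-stack version in use, scraping of these components
        # (and, via defaultRules.rules, the matching default alerts) can be turned
        # off through chart values instead, e.g.
        #
        #   kubeProxy:
        #     enabled: false
        #   kubeScheduler:
        #     enabled: false
        #   kubeControllerManager:
        #     enabled: false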
    receivers:
      - name: 'null'
      - name: 'discord'
        discord_configs:
          - webhook_url: {{ .StateValues.discord.WEBHOOK_URL }}
            username: {{ .StateValues.discord.USERNAME }}
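
# One way to smoke-test the 'discord' route after deploying (assumes amtool is
# installed and the Alertmanager API is reachable, e.g. via a kubectl
# port-forward, on localhost:9093; the alert name below is made up):
#
#   amtool alert add TestDiscordRoute severity=warning \
#     --alertmanager.url=http://localhost:9093
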
additionalPrometheusRulesMap:
  high-cpu-usage:
    groups:
      - name: Node
        rules:
          - alert: HighCPUUsage
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 10s
            labels:
              severity: warning
            annotations:
              summary: "High CPU usage detected"
              description: 'CPU usage is above 80% on {{ "{{" }} $labels.instance {{ "}}" }}'
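
  # How the HighCPUUsage expression works: rate(node_cpu_seconds_total{mode="idle"}[5m])
  # is the per-core fraction of time spent idle over the last 5 minutes, avg by(instance)
  # averages it across a node's cores, * 100 turns it into percent idle, and 100 - idle%
  # gives the busy percentage compared against the 80% threshold. The Prometheus template
  # braces are written as {{ "{{" }} ... {{ "}}" }} because this file is itself rendered
  # as a Go template first, and they must reach Prometheus literally.
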
  node-not-ready:
    groups:
      - name: Node
        rules:
          - alert: NodeNotReady
            # Group by node so each not-ready node fires its own alert and the
            # node name is available to the description below.
            expr: sum by (node) (kube_node_status_condition{condition="Ready",status!="true"}) > 0
            labels:
              severity: critical
            annotations:
              summary: "Node not ready"
              description: 'Node {{ "{{" }} $labels.node {{ "}}" }} is not ready'
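
# kube_node_status_condition is exported by kube-state-metrics, which the
# kube-prometheus-stack chart deploys by default; no extra exporter is needed
# for the NodeNotReady rule above.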