M prometheus.nomad.hcl => prometheus.nomad.hcl +5 -0
@@ 53,6 53,11 @@ job "prometheus" {
destination = "local/nomad_rules.yml"
mode = "file"
}
+      artifact {
+        source      = "https://git.sr.ht/~tretinha/prometheus/blob/main/traefik_rules.yml"  # Traefik alert rules
+        mode        = "file"  # per Nomad docs: destination is treated as a file path, not a directory
+        destination = "local/traefik_rules.yml"
+      }
}
}
}
M prometheus.yml => prometheus.yml +1 -0
@@ 6,6 6,7 @@ global:
rule_files:
- "node_rules.yml"
- "nomad_rules.yml"
+ - "traefik_rules.yml"  # Traefik latency alerts (defined in traefik_rules.yml)
scrape_configs:
- job_name: 'nomad_metrics'
A traefik_rules.yml => traefik_rules.yml +40 -0
@@ 0,0 1,40 @@
+---
+# Alerting rules for Traefik service latency.
+# The three alerts evaluate the same expression (anchored on the first rule)
+# and differ only in their `for` duration and `severity` label.
+groups:
+- name: traefik
+  rules:
+  - alert: Slow response times
+    # Mean duration (seconds) of 2xx/3xx responses over the last 2m:
+    # rate(_sum) / rate(_count) of the request-duration histogram; 0.3 s = 300 ms.
+    # Backslashes are literal in a block scalar, so PromQL sees "\\d" -> regex \d.
+    expr: &res_time_gt_300ms >-
+      rate(traefik_service_request_duration_seconds_sum{code=~"[2-3]\\d\\d"}[2m])
+      / rate(traefik_service_request_duration_seconds_count{code=~"[2-3]\\d\\d"}[2m])
+      > 0.3
+    # Merge key: folds `for` and `labels` into this rule; the anchor names the bundle.
+    <<: &brief
+      for: 5m
+      labels:
+        severity: interesting
+    annotations:
+      summary: "Service {{ $labels.service }} on instance {{ $labels.instance }} has slow response times"
+
+  - alert: Sustained slow response times
+    expr: *res_time_gt_300ms
+    <<: &sustained
+      for: 20m
+      labels:
+        severity: important
+    annotations:
+      summary: "Service {{ $labels.service }} on instance {{ $labels.instance }} has sustained slow response times"
+
+  - alert: Prolonged slow response times
+    expr: *res_time_gt_300ms
+    <<: &prolonged
+      for: 60m
+      labels:
+        severity: urgent
+    annotations:
+      summary: "Service {{ $labels.service }} on instance {{ $labels.instance }} has prolonged slow response times"