~tretinha/prometheus

8eeb19f102f250441b00562ecc218685733760e7 — Gustavo Colombini 1 year, 8 months ago a40b4cf
Add traefik_rules.yml
3 files changed, 37 insertions(+), 0 deletions(-)

M prometheus.nomad.hcl
M prometheus.yml
A traefik_rules.yml
M prometheus.nomad.hcl => prometheus.nomad.hcl +5 -0
@@ 53,6 53,11 @@ job "prometheus" {
        destination = "local/nomad_rules.yml"
        mode = "file"
      }
      # Download the Traefik alerting rules from the repository and place them
      # at local/traefik_rules.yml.
      artifact {
        mode        = "file"
        source      = "https://git.sr.ht/~tretinha/prometheus/blob/main/traefik_rules.yml"
        destination = "local/traefik_rules.yml"
      }
    }
  }
}

M prometheus.yml => prometheus.yml +1 -0
@@ 6,6 6,7 @@ global:
# Rule files to load; one entry per rules file.
rule_files:
  - 'node_rules.yml'
  - 'nomad_rules.yml'
  - 'traefik_rules.yml'

scrape_configs:
  - job_name: 'nomad_metrics'

A traefik_rules.yml => traefik_rules.yml +31 -0
@@ 0,0 1,31 @@
---
# Alerting rules for Traefik service latency.
#
# The three alerts share a single expression (anchored as res_time_gt_300ms)
# and differ only in how long the condition must hold (`for`) and in the
# severity label they attach.
groups:
- name: traefik
  rules:
  - alert: Slow response times
    # Mean duration of 2xx/3xx requests over the last 2 minutes, per series.
    # The histogram's _sum is a cumulative counter in seconds, so the average
    # is rate(_sum) / rate(_count); 0.3 s is the 300 ms threshold.
    # Series with no traffic yield NaN, which never satisfies the comparison.
    expr: &res_time_gt_300ms >-
      rate(traefik_service_request_duration_seconds_sum{code=~"[2-3]\\d\\d"}[2m])
      / rate(traefik_service_request_duration_seconds_count{code=~"[2-3]\\d\\d"}[2m])
      > 0.3
    for: 5m
    labels:
      severity: interesting
    annotations:
      summary: "Service {{ $labels.service }} on instance {{ $labels.instance }} has slow response times"

  - alert: Sustained slow response times
    expr: *res_time_gt_300ms
    for: 20m
    labels:
      severity: important
    annotations:
      summary: "Service {{ $labels.service }} on instance {{ $labels.instance }} has sustained slow response times"

  - alert: Prolonged slow response times
    expr: *res_time_gt_300ms
    for: 60m
    labels:
      severity: urgent
    annotations:
      summary: "Service {{ $labels.service }} on instance {{ $labels.instance }} has prolonged slow response times"