Alerting

Configure Prometheus alerting rules and Alertmanager to get notified when metrics cross their thresholds.

Alertmanager Setup

Install Alertmanager

yaml
# docker-compose.yml

services:
  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager-data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'

volumes:
  alertmanager-data:
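
Before wiring up receivers, confirm the container came up cleanly. A quick check, assuming the Compose file above and the default port mapping:

bash
# Start Alertmanager in the background
docker compose up -d alertmanager

# Alertmanager exposes a health endpoint on its HTTP port
curl -s http://localhost:9093/-/healthy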

Configure Alertmanager

yaml
# alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'slack-notifications'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
    - match:
        severity: warning
      receiver: 'slack-notifications'

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#alerts'
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
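
Alertmanager only receives alerts if Prometheus is configured to send them (the alerting.alertmanagers block and rule_files in prometheus.yml). Before reloading, validate the file and preview the routing with amtool, which ships with Alertmanager:

bash
# Validate the configuration file for syntax and schema errors
amtool check-config alertmanager.yml

# Preview which receiver a given label set would be routed to
amtool config routes test --config.file=alertmanager.yml severity=critical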

Alert Rules

Redis Performance Alerts

yaml
# prometheus/rules/redis.yml
groups:
  - name: redis_alerts
    interval: 15s
    rules:
      # High latency
      - alert: HighRedisLatency
        expr: |
          histogram_quantile(0.95,
            rate(myapp_redis_command_duration_seconds_bucket[5m])
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Redis latency detected"
          description: "P95 latency is {{ $value }}s (threshold: 0.05s)"

      # High error rate
      - alert: HighRedisErrorRate
        expr: |
          sum(rate(myapp_redis_errors_total[5m])) /
          sum(rate(myapp_redis_commands_total[5m])) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Redis error rate"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"

      # Redis down
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis instance {{ $labels.instance }} is unreachable"

Cache Alerts

yaml
groups:
  - name: cache_alerts
    rules:
      # Low hit rate
      - alert: LowCacheHitRate
        expr: |
          sum by (layer) (rate(myapp_redis_cache_hits_total[5m])) /
          (
            sum by (layer) (rate(myapp_redis_cache_hits_total[5m])) +
            sum by (layer) (rate(myapp_redis_cache_misses_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Cache hit rate is low"
          description: "Hit rate is {{ $value | humanizePercentage }} for layer {{ $labels.layer }}"

      # Cache full
      - alert: CacheFull
        expr: myapp_redis_cache_size > 50000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Cache is nearly full"
          description: "Cache size is {{ $value }} (threshold: 50000)"

Lock Alerts

yaml
groups:
  - name: lock_alerts
    rules:
      # High lock contention
      - alert: HighLockContention
        expr: myapp_redis_locks_active > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High lock contention"
          description: "{{ $value }} active locks (threshold: 100)"

      # Long lock wait time
      - alert: LongLockWaitTime
        expr: |
          histogram_quantile(0.95,
            rate(myapp_redis_lock_wait_duration_seconds_bucket[5m])
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Long lock wait times"
          description: "P95 wait time is {{ $value }}s (threshold: 1s)"

      # Lock acquisition failures
      - alert: LockAcquisitionFailures
        expr: |
          sum(rate(myapp_redis_lock_acquisitions_total{status="failed"}[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High lock acquisition failure rate"
          description: "{{ $value }} failures/sec"

Stream Alerts

yaml
groups:
  - name: stream_alerts
    rules:
      # Processing errors
      - alert: StreamProcessingErrors
        expr: |
          sum by (stream) (rate(myapp_redis_stream_messages_consumed_total{status="error"}[5m])) /
          sum by (stream) (rate(myapp_redis_stream_messages_consumed_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High stream processing error rate"
          description: "{{ $value | humanizePercentage }} errors for {{ $labels.stream }}"

      # Publish errors
      - alert: StreamPublishErrors
        expr: |
          sum by (stream) (rate(myapp_redis_stream_publish_errors_total[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Stream publish errors detected"
          description: "{{ $value }} publish errors/sec for {{ $labels.stream }}"

Rate Limit Alerts

yaml
groups:
  - name: rate_limit_alerts
    rules:
      # High block rate
      - alert: HighRateLimitBlockRate
        expr: |
          sum(rate(myapp_redis_ratelimit_requests_total{status="rejected"}[5m])) /
          sum(rate(myapp_redis_ratelimit_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate limit block rate"
          description: "{{ $value | humanizePercentage }} of requests blocked"

Application Alerts

API Health

yaml
groups:
  - name: api_alerts
    rules:
      # High error rate
      - alert: HighAPIErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High API error rate"
          description: "{{ $value | humanizePercentage }} error rate"

      # Slow response times
      - alert: SlowAPIResponses
        expr: |
          histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow API responses"
          description: "P95 latency is {{ $value }}s"

      # High request rate
      - alert: HighRequestRate
        expr: |
          sum(rate(http_requests_total[5m])) > 10000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Very high request rate"
          description: "{{ $value }} requests/sec"

System Resources

yaml
groups:
  - name: system_alerts
    rules:
      # High CPU
      - alert: HighCPUUsage
        expr: |
          100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is {{ $value }}%"

      # High memory
      - alert: HighMemoryUsage
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}%"

      # Disk full
      - alert: DiskSpaceLow
        expr: |
          (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space running low"
          description: "Disk usage is {{ $value }}% on {{ $labels.mountpoint }}"

Notification Channels

Slack

yaml
receivers:
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK'
        channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Severity:* {{ .Labels.severity }}
          {{ end }}
        send_resolved: true
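
If Slack messages never arrive, test the webhook on its own before debugging Alertmanager. A sketch using the placeholder URL from above:

bash
# Post directly to the incoming webhook, bypassing Alertmanager
curl -X POST 'https://hooks.slack.com/services/YOUR/WEBHOOK' \
  -H 'Content-Type: application/json' \
  -d '{"text": "Webhook connectivity test"}'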

PagerDuty

yaml
receivers:
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_INTEGRATION_KEY'
        description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'

Email

yaml
receivers:
  - name: 'email'
    email_configs:
      - to: 'team@example.com'
        from: 'alerts@example.com'
        smarthost: 'smtp.gmail.com:587'
        auth_username: 'alerts@example.com'
        auth_password: 'YOUR_APP_PASSWORD'  # for Gmail, use an app password
        headers:
          Subject: 'Alert: {{ .GroupLabels.alertname }}'

Webhook

yaml
receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'https://your-webhook-endpoint.com/alerts'
        send_resolved: true

Alert Routing

By Severity

yaml
route:
  routes:
    # Critical → PagerDuty
    - match:
        severity: critical
      receiver: 'pagerduty'
      group_wait: 10s
      group_interval: 5m
      repeat_interval: 4h

    # Warning → Slack
    - match:
        severity: warning
      receiver: 'slack'
      group_wait: 30s
      group_interval: 10m
      repeat_interval: 12h

By Team

yaml
route:
  routes:
    # Backend team
    - match:
        team: backend
      receiver: 'backend-slack'

    # Infrastructure team
    - match:
        team: infrastructure
      receiver: 'infra-pagerduty'
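
Routing trees get hard to reason about as they grow. amtool can render the tree and show where a given label set lands:

bash
# Print the routing tree defined in the config
amtool config routes show --config.file=alertmanager.yml

# Check which receiver a backend warning would reach
amtool config routes test --config.file=alertmanager.yml team=backend severity=warning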

Silencing Alerts

CLI

bash
# Silence for 1 hour
amtool silence add \
  alertname=HighRedisLatency \
  --duration=1h \
  --comment="Planned maintenance"

# List silences
amtool silence query

# Expire silence
amtool silence expire <silence-id>
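
Matchers also accept regular expressions, which is handy for silencing a whole family of alerts during maintenance:

bash
# Silence every Redis alert for two hours
amtool silence add 'alertname=~"HighRedis.*|RedisDown"' \
  --duration=2h \
  --comment="Redis upgrade"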

API

bash
# Create silence
curl -X POST http://localhost:9093/api/v2/silences \
  -H "Content-Type: application/json" \
  -d '{
    "matchers": [
      {
        "name": "alertname",
        "value": "HighRedisLatency",
        "isRegex": false
      }
    ],
    "startsAt": "2025-01-28T10:00:00Z",
    "endsAt": "2025-01-28T11:00:00Z",
    "createdBy": "admin",
    "comment": "Planned maintenance"
  }'
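
The same API lists and expires silences; the ID comes from the create response or the list output:

bash
# List current silences
curl -s http://localhost:9093/api/v2/silences

# Expire a silence by ID
curl -X DELETE http://localhost:9093/api/v2/silence/<silence-id>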

Testing Alerts

bash
# Trigger test alert
curl -X POST http://localhost:9093/api/v2/alerts \
  -H "Content-Type: application/json" \
  -d '[{
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning"
    },
    "annotations": {
      "summary": "This is a test alert"
    }
  }]'
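
After posting, confirm the alert is registered and watch it reach the configured receiver. amtool offers a shorter way to fire test alerts:

bash
# List alerts currently held by Alertmanager
curl -s http://localhost:9093/api/v2/alerts

# Equivalent test alert via amtool
amtool alert add TestAlert severity=warning \
  --annotation=summary="This is a test alert"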

Best Practices

1. Set thresholds from observed baselines, not guesses

2. Use a for: duration so transient spikes don't cause flapping

3. Group related alerts to cut notification noise

4. Write descriptions that say what happened and where to look

5. Route by severity so critical alerts page and warnings stay in chat

6. Test alert delivery end to end, not just rule syntax

7. Document escalation procedures
