# docker-stacks/stacks/mllogwatcher/alert_runbook.yaml
# Grafana alert triage playbook for the HomeLab telemetry stack.
# Each entry contains the alert metadata, what the signal means,
# the evidence to capture automatically, and the manual / scripted steps.
metadata:
  generated: "2025-09-22T00:00:00Z"
  grafana_url: "http://casper:3000"
  datasource: "InfluxDB telegraf (uid=P951FEA4DE68E13C5)"
  llm_provider: "OpenRouter"
alerts:
- name: "Data Stale"
rule_uid: "fdk9orif6fytcf"
description: "No CPU usage_user metrics have arrived for non-unit hosts within 5 minutes."
signal:
metric: "cpu.usage_user"
condition: "count(host samples over 5m) < 1"
impact: "Host is no longer reporting to Telegraf/Influx -> monitoring blind spot."
evidence_to_collect:
- "Influx: `from(bucket:\"telegraf\") |> range(start:-10m) |> filter(fn:(r)=>r._measurement==\"cpu\" and r.host==\"{{ host }}\") |> count()`"
- "Telegraf log tail"
- "System journal for network/auth errors"
triage:
- summary: "Verify Telegraf agent health"
linux: "sudo systemctl status telegraf && sudo journalctl -u telegraf -n 100"
windows: "Get-Service telegraf; Get-Content 'C:\\Program Files\\telegraf\\telegraf.log' -Tail 100"
- summary: "Check connectivity from host to Influx (`casper:8086`)"
linux: "curl -sSf http://casper:8086/ping"
windows: "Invoke-WebRequest -UseBasicParsing http://casper:8086/ping"
- summary: "Confirm host clock drift <5s (important for Influx line protocol timestamps)"
linux: "chronyc tracking"
windows: "w32tm /query /status"
remediation:
- "Restart Telegraf after config validation: `sudo telegraf --test --config /etc/telegraf/telegraf.conf` then `sudo systemctl restart telegraf`."
- "Re-apply Ansible telemetry playbook if multiple hosts fail."
llm_prompt: >
Alert {{ alertname }} fired for {{ host }}. Telegraf stopped sending cpu.usage_user metrics. Given the collected logs and command output, identify root causes (agent down, auth failures, firewall, time skew) and list the next action.
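  # A minimal Flux sketch of the staleness check described above, assuming the same
  # "telegraf" bucket and cpu measurement; the actual Grafana rule query may differ.
  #   from(bucket: "telegraf")
  #     |> range(start: -5m)
  #     |> filter(fn: (r) => r._measurement == "cpu" and r._field == "usage_user")
  #     |> group(columns: ["host"])
  #     |> count()
  #   // hosts absent from the result (count < 1) are the ones that went silent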
- name: "High CPU"
rule_uid: "fdkms407ubmdcc"
description: "Mean CPU usage_system over the last 10 minutes exceeds 85%."
signal:
metric: "cpu.usage_system"
condition: "mean over 10m > 85%"
impact: "Host is near saturation; scheduler latency and queueing likely."
evidence_to_collect:
- "Top CPU processes snapshot (Linux: `ps -eo pid,cmd,%cpu --sort=-%cpu | head -n 15`; Windows: `Get-Process | Sort-Object CPU -Descending | Select -First 15`)"
- "Load vs CPU core count"
- "Recent deploys / cron jobs metadata"
triage:
- summary: "Confirm sustained CPU pressure"
linux: "uptime && mpstat 1 5"
windows: "typeperf \"\\Processor(_Total)\\% Processor Time\" -sc 15"
- summary: "Check offending processes/services"
linux: "sudo ps -eo pid,user,comm,%cpu,%mem --sort=-%cpu | head"
windows: "Get-Process | Sort-Object CPU -Descending | Select -First 10 Name,CPU"
- summary: "Inspect cgroup / VM constraints if on Proxmox"
linux: "sudo pct status {{ vmid }} && sudo pct config {{ vmid }}"
remediation:
- "Throttle or restart runaway service; scale workload or tune limits."
- "Consider moving noisy neighbors off shared hypervisor."
llm_prompt: >
High CPU alert for {{ host }}. Review process table, recent deploys, and virtualization context; determine why cpu.usage_system stayed above 85% and recommend mitigation.
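  # A rough Flux equivalent of the 10-minute mean check above (assumed, not copied from the
  # Grafana rule); it averages the aggregate cpu-total series per host against the 85% threshold.
  #   from(bucket: "telegraf")
  #     |> range(start: -10m)
  #     |> filter(fn: (r) => r._measurement == "cpu" and r._field == "usage_system" and r.cpu == "cpu-total")
  #     |> group(columns: ["host"])
  #     |> mean()
  #     |> filter(fn: (r) => r._value > 85.0)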
- name: "High Mem."
rule_uid: "edkmsdmlay2o0c"
description: "Mean memory used_percent over 10 minutes > 95% (excluding hosts jhci/nerv*/magi*)."
signal:
metric: "mem.used_percent"
condition: "mean over 10m > 95%"
impact: "OOM risk and swap thrash."
evidence_to_collect:
- "Free/available memory snapshot"
- "Top consumers (Linux: `sudo smem -rt rss | head`; Windows: `Get-Process | Sort-Object WorkingSet -Descending`)"
- "Swap in/out metrics"
triage:
- summary: "Validate actual memory pressure"
linux: "free -m && vmstat -SM 5 5"
windows: "Get-Counter '\\Memory\\Available MBytes'"
- summary: "Identify leaking services"
linux: "sudo ps -eo pid,user,comm,%mem,rss --sort=-%mem | head"
windows: "Get-Process | Sort-Object WS -Descending | Select -First 10 ProcessName,WS"
- summary: "Check recent kernel/OOM logs"
linux: "sudo dmesg | tail -n 50"
windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Message -match 'memory' }"
remediation:
- "Restart or reconfigure offender; add swap as stop-gap; increase VM memory allocation."
llm_prompt: >
High Mem alert for {{ host }}. After reviewing free memory, swap activity, and top processes, explain the likely cause and propose remediation steps with priority.
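  # Hedged Flux sketch of the memory condition, assuming the host exclusions from the
  # description are expressed as a prefix regex; adjust to match the real rule's filters.
  #   from(bucket: "telegraf")
  #     |> range(start: -10m)
  #     |> filter(fn: (r) => r._measurement == "mem" and r._field == "used_percent")
  #     |> filter(fn: (r) => r.host !~ /^(jhci|nerv|magi)/)
  #     |> group(columns: ["host"])
  #     |> mean()
  #     |> filter(fn: (r) => r._value > 95.0)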
- name: "High Disk IO"
rule_uid: "bdkmtaru7ru2od"
description: "Mean merged_reads/writes per second converted to GB/s exceeds 10."
signal:
metric: "diskio.merged_reads + merged_writes"
condition: "mean over 10m > 10 GB/s"
impact: "Storage controller saturated; latency spikes, possible backlog."
evidence_to_collect:
- "iostat extended output"
- "Process level IO (pidstat/nethogs equivalent)"
- "ZFS/MDADM status for relevant pools"
triage:
- summary: "Inspect device queues"
linux: "iostat -xzd 5 3"
windows: "Get-WmiObject -Class Win32_PerfFormattedData_PerfDisk_LogicalDisk | Format-Table Name,DiskWritesPersec,DiskReadsPersec,AvgDisksecPerTransfer"
- summary: "Correlate to filesystem / VM"
linux: "sudo lsof +D /mnt/critical -u {{ user }}"
- summary: "Check backup or replication windows"
linux: "journalctl -u pvebackup -n 50"
remediation:
- "Pause heavy jobs, move backups off-peak, evaluate faster storage tiers."
llm_prompt: >
High Disk IO on {{ host }}. With iostat/pidstat output provided, decide whether activity is expected (backup, scrub) or abnormal and list mitigations.
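  # A literal Flux reading of the condition above, assuming the rule takes the per-second
  # derivative of the merged_reads/merged_writes counters and divides by 1e9 for "GB/s";
  # verify against the actual rule before relying on this sketch.
  #   from(bucket: "telegraf")
  #     |> range(start: -10m)
  #     |> filter(fn: (r) => r._measurement == "diskio" and
  #                          (r._field == "merged_reads" or r._field == "merged_writes"))
  #     |> derivative(unit: 1s, nonNegative: true)
  #     |> group(columns: ["host"])
  #     |> mean()
  #     |> map(fn: (r) => ({ r with _value: r._value / 1000000000.0 }))
  #     |> filter(fn: (r) => r._value > 10.0)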
- name: "Low Uptime"
rule_uid: "ddkmuadxvkm4ge"
description: "System uptime converted to minutes is below 10 -> host rebooted recently."
signal:
metric: "system.uptime"
condition: "last uptime_minutes < 10"
impact: "Unexpected reboot or crash; may need RCA."
evidence_to_collect:
- "Boot reason logs"
- "Last patch/maintenance window from Ansible inventory"
- "Smart log excerpt for power events"
triage:
- summary: "Confirm uptime and reason"
linux: "uptime && last -x | head"
windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Id -in 41,6006,6008 }"
- summary: "Check kernel panic or watchdog traces"
linux: "sudo journalctl -k -b -1 | tail -n 200"
- summary: "Validate patch automation logs"
linux: "sudo tail -n 100 /var/log/ansible-pull.log"
remediation:
- "Schedule deeper diagnostics if crash; reschedule workloads once stable."
llm_prompt: >
Low Uptime alert: host restarted within 10 minutes. Inspect boot reason logs and recommend whether this is maintenance or a fault needing follow-up.
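  # Sketch of the seconds -> minutes conversion the rule implies (last uptime value per host);
  # assumes the field names exposed by the telegraf system plugin.
  #   from(bucket: "telegraf")
  #     |> range(start: -15m)
  #     |> filter(fn: (r) => r._measurement == "system" and r._field == "uptime")
  #     |> group(columns: ["host"])
  #     |> last()
  #     |> map(fn: (r) => ({ r with _value: float(v: r._value) / 60.0 }))
  #     |> filter(fn: (r) => r._value < 10.0)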
- name: "High Load"
rule_uid: "ddkmul9x8gcn4d"
description: "system.load5 > 6 for 5 minutes."
signal:
metric: "system.load5"
condition: "last value > 6"
impact: "Runnable queue more than CPU threads -> latency growth."
evidence_to_collect:
- "Load vs CPU count (`nproc`)"
- "Process states (D/R blocked tasks)"
- "IO wait percentage"
triage:
- summary: "Correlate load to CPU and IO"
linux: "uptime && vmstat 1 5"
- summary: "Identify stuck IO"
linux: "sudo pidstat -d 1 5"
- summary: "Check Proxmox scheduler for resource contention"
linux: "pveperf && qm list"
remediation:
- "Reduce cron concurrency, add CPU, or fix IO bottleneck causing runnable queue growth."
llm_prompt: >
High Load alert on {{ host }}. Based on vmstat/pidstat output, explain whether CPU saturation, IO wait, or runnable pile-up is at fault and propose actions.
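  # Minimal Flux sketch for the load check, assuming load5 from the telegraf system plugin;
  # compare the result against `nproc` on the host to judge severity.
  #   from(bucket: "telegraf")
  #     |> range(start: -5m)
  #     |> filter(fn: (r) => r._measurement == "system" and r._field == "load5")
  #     |> group(columns: ["host"])
  #     |> last()
  #     |> filter(fn: (r) => r._value > 6.0)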
- name: "High Network Traffic (Download)"
rule_uid: "cdkpct82a7g8wd"
description: "Derivative of bytes_recv > 50 MB/s on any interface over last hour."
signal:
metric: "net.bytes_recv"
condition: "mean download throughput > 50 MB/s"
impact: "Link saturation, potential DDoS or backup window."
evidence_to_collect:
- "Interface counters (Linux: `ip -s link show {{ iface }}`; Windows: `Get-NetAdapterStatistics`)"
- "Top talkers (Linux: `sudo nethogs {{ iface }}` or `iftop -i {{ iface }}`)"
- "Firewall/IDS logs"
triage:
- summary: "Confirm interface experiencing spike"
linux: "sar -n DEV 1 5 | grep {{ iface }}"
windows: "Get-Counter -Counter '\\Network Interface({{ iface }})\\Bytes Received/sec' -Continuous -SampleInterval 1 -MaxSamples 5"
- summary: "Identify process or remote peer"
linux: "sudo ss -ntu state established | sort -k4"
windows: "Get-NetTCPConnection | Sort-Object -Property LocalPort"
remediation:
- "Throttle offending transfers, move backup replication, verify no compromised service."
llm_prompt: >
High download throughput on {{ host }} interface {{ iface }}. Review interface counters and connection list to determine if traffic is expected and advise throttling or blocking steps.
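  # Hedged Flux sketch of the download-rate condition: non-negative derivative of bytes_recv
  # averaged per host/interface and converted to decimal MB/s. The same shape applies to the
  # upload alert below with bytes_sent and a 30 MB/s threshold.
  #   from(bucket: "telegraf")
  #     |> range(start: -1h)
  #     |> filter(fn: (r) => r._measurement == "net" and r._field == "bytes_recv")
  #     |> derivative(unit: 1s, nonNegative: true)
  #     |> group(columns: ["host", "interface"])
  #     |> mean()
  #     |> map(fn: (r) => ({ r with _value: r._value / 1000000.0 }))
  #     |> filter(fn: (r) => r._value > 50.0)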
- name: "High Network Traffic (Upload)"
rule_uid: "aec650pbtvzswa"
description: "Derivative of bytes_sent > 30 MB/s for an interface."
signal:
metric: "net.bytes_sent"
condition: "mean upload throughput > 30 MB/s"
impact: "Excess upstream usage; may saturate ISP uplink."
evidence_to_collect:
- "Interface statistics"
- "NetFlow sample if available (`/var/log/telegraf/netflow.log`)"
- "List of active transfers"
triage:
- summary: "Measure upload curve"
linux: "bmon -p {{ iface }} -o ascii"
windows: "Get-Counter '\\Network Interface({{ iface }})\\Bytes Sent/sec' -Continuous -SampleInterval 1 -MaxSamples 5"
- summary: "Find process generating traffic"
linux: "sudo iftop -i {{ iface }} -t -s 30"
windows: "Get-NetAdapterStatistics -Name {{ iface }}"
remediation:
- "Pause replication jobs, confirm backups not stuck, search for data exfiltration."
llm_prompt: >
High upload alert for {{ host }} interface {{ iface }}. Using captured traffic samples, determine whether replication/backup explains the pattern or if anomalous traffic needs blocking.
- name: "High Disk Usage"
rule_uid: "cdma6i5k2gem8d"
description: "Disk used_percent >= 95% for Linux devices (filters out unwanted devices)."
signal:
metric: "disk.used_percent"
condition: "last value > 95%"
impact: "Filesystem full -> service crashes or write failures."
evidence_to_collect:
- "`df -h` or `Get-Volume` output for device"
- "Largest directories snapshot (Linux: `sudo du -xhd1 /path`; Windows: `Get-ChildItem | Sort Length`)"
- "Recent deploy or backup expansion logs"
triage:
- summary: "Validate usage"
linux: "df -h {{ mountpoint }}"
windows: "Get-Volume -FileSystemLabel {{ volume }}"
- summary: "Identify growth trend"
linux: "sudo journalctl -u telegraf -g 'disk usage' -n 20"
- summary: "Check for stale docker volumes"
linux: "docker system df && docker volume ls"
remediation:
- "Prune temp artifacts, expand disk/VM, move logs to remote storage."
llm_prompt: >
High Disk Usage alert on {{ host }} device {{ device }}. Summarize what consumed the space and recommend reclaim or expansion actions with priority.
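  # Flux sketch for the disk-usage check (assumed grouping tags; the rule's device exclusion
  # filter is omitted here).
  #   from(bucket: "telegraf")
  #     |> range(start: -5m)
  #     |> filter(fn: (r) => r._measurement == "disk" and r._field == "used_percent")
  #     |> group(columns: ["host", "device", "path"])
  #     |> last()
  #     |> filter(fn: (r) => r._value > 95.0)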
- name: "CPU Heartbeat"
rule_uid: "eec62gqn3oetcf"
description: "Counts cpu.usage_system samples per host; fires if <1 sample arrives within window."
signal:
metric: "cpu.usage_system"
condition: "sample count within 10m < 1"
impact: "Indicates host stopped reporting metrics entirely (telemetry silent)."
evidence_to_collect:
- "Influx query for recent cpu samples"
- "Telegraf service and logs"
- "Network reachability from host to casper"
triage:
- summary: "Check host alive and reachable"
linux: "ping -c 3 {{ host }} && ssh {{ host }} uptime"
windows: "Test-Connection {{ host }} -Count 3"
- summary: "Inspect Telegraf state"
linux: "sudo systemctl status telegraf && sudo tail -n 100 /var/log/telegraf/telegraf.log"
windows: "Get-Service telegraf; Get-EventLog -LogName Application -Newest 50 | ? { $_.Source -match 'Telegraf' }"
- summary: "Validate API key / Influx auth"
linux: "sudo grep -n 'outputs.influxdb' -n /etc/telegraf/telegraf.conf"
remediation:
- "Re-issue Telegraf credentials, run `ansible-playbook telemetry.yml -l {{ host }}`."
- "If host intentionally offline, silence alert via Grafana maintenance window."
llm_prompt: >
CPU Heartbeat for {{ host }} indicates telemetry silent. Use connectivity tests and Telegraf logs to determine if host is down or just metrics disabled; propose fixes.