# docker-stacks/stacks/mllogwatcher/alert_runbook.yaml
# Grafana alert triage playbook for the HomeLab telemetry stack.
# Each entry contains the alert metadata, what the signal means,
# the evidence to capture automatically, and the manual / scripted steps.
metadata:
  generated: "2025-09-22T00:00:00Z"
  grafana_url: "http://casper:3000"
  datasource: "InfluxDB telegraf (uid=P951FEA4DE68E13C5)"
  llm_provider: "OpenRouter"
alerts:
- name: "Data Stale"
rule_uid: "fdk9orif6fytcf"
description: "No CPU usage_user metrics have arrived for non-unit hosts within 5 minutes."
signal:
metric: "cpu.usage_user"
condition: "count(host samples over 5m) < 1"
impact: "Host is no longer reporting to Telegraf/Influx -> monitoring blind spot."
evidence_to_collect:
- "Influx: `from(bucket:\"telegraf\") |> range(start:-10m) |> filter(fn:(r)=>r._measurement==\"cpu\" and r.host==\"{{ host }}\") |> count()`"
- "Telegraf log tail"
- "System journal for network/auth errors"
triage:
- summary: "Verify Telegraf agent health"
linux: "sudo systemctl status telegraf && sudo journalctl -u telegraf -n 100"
windows: "Get-Service telegraf; Get-Content 'C:\\Program Files\\telegraf\\telegraf.log' -Tail 100"
- summary: "Check connectivity from host to Influx (`casper:8086`)"
linux: "curl -sSf http://casper:8086/ping"
windows: "Invoke-WebRequest -UseBasicParsing http://casper:8086/ping"
- summary: "Confirm host clock drift <5s (important for Influx line protocol timestamps)"
linux: "chronyc tracking"
windows: "w32tm /query /status"
remediation:
- "Restart Telegraf after config validation: `sudo telegraf --test --config /etc/telegraf/telegraf.conf` then `sudo systemctl restart telegraf`."
- "Re-apply Ansible telemetry playbook if multiple hosts fail."
llm_prompt: >
Alert {{ alertname }} fired for {{ host }}. Telegraf stopped sending cpu.usage_user metrics. Given the collected logs and command output, identify root causes (agent down, auth failures, firewall, time skew) and list the next action.
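  # A minimal Flux sketch of the staleness check described above, assuming the same
  # "telegraf" bucket and cpu measurement; the actual Grafana rule query may differ.
  #   from(bucket: "telegraf")
  #     |> range(start: -5m)
  #     |> filter(fn: (r) => r._measurement == "cpu" and r._field == "usage_user")
  #     |> group(columns: ["host"])
  #     |> count()
  #   // hosts absent from the result (count < 1) are the ones that went silent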
- name: "High CPU"
rule_uid: "fdkms407ubmdcc"
description: "Mean CPU usage_system over the last 10 minutes exceeds 85%."
signal:
metric: "cpu.usage_system"
condition: "mean over 10m > 85%"
impact: "Host is near saturation; scheduler latency and queueing likely."
evidence_to_collect:
- "Top CPU processes snapshot (Linux: `ps -eo pid,cmd,%cpu --sort=-%cpu | head -n 15`; Windows: `Get-Process | Sort-Object CPU -Descending | Select -First 15`)"
- "Load vs CPU core count"
- "Recent deploys / cron jobs metadata"
triage:
- summary: "Confirm sustained CPU pressure"
linux: "uptime && mpstat 1 5"
windows: "typeperf \"\\Processor(_Total)\\% Processor Time\" -sc 15"
- summary: "Check offending processes/services"
linux: "sudo ps -eo pid,user,comm,%cpu,%mem --sort=-%cpu | head"
windows: "Get-Process | Sort-Object CPU -Descending | Select -First 10 Name,CPU"
- summary: "Inspect cgroup / VM constraints if on Proxmox"
linux: "sudo pct status {{ vmid }} && sudo pct config {{ vmid }}"
remediation:
- "Throttle or restart runaway service; scale workload or tune limits."
- "Consider moving noisy neighbors off shared hypervisor."
llm_prompt: >
High CPU alert for {{ host }}. Review process table, recent deploys, and virtualization context; determine why cpu.usage_system stayed above 85% and recommend mitigation.
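  # A rough Flux equivalent of the 10-minute mean check above (assumed, not copied from the
  # Grafana rule); it averages the aggregate cpu-total series per host against the 85% threshold.
  #   from(bucket: "telegraf")
  #     |> range(start: -10m)
  #     |> filter(fn: (r) => r._measurement == "cpu" and r._field == "usage_system" and r.cpu == "cpu-total")
  #     |> group(columns: ["host"])
  #     |> mean()
  #     |> filter(fn: (r) => r._value > 85.0)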
- name: "High Mem."
rule_uid: "edkmsdmlay2o0c"
description: "Mean memory used_percent over 10 minutes > 95% (excluding hosts jhci/nerv*/magi*)."
signal:
metric: "mem.used_percent"
condition: "mean over 10m > 95%"
impact: "OOM risk and swap thrash."
evidence_to_collect:
- "Free/available memory snapshot"
- "Top consumers (Linux: `sudo smem -rt rss | head`; Windows: `Get-Process | Sort-Object WorkingSet -Descending`)"
- "Swap in/out metrics"
triage:
- summary: "Validate actual memory pressure"
linux: "free -m && vmstat -SM 5 5"
windows: "Get-Counter '\\Memory\\Available MBytes'"
- summary: "Identify leaking services"
linux: "sudo ps -eo pid,user,comm,%mem,rss --sort=-%mem | head"
windows: "Get-Process | Sort-Object WS -Descending | Select -First 10 ProcessName,WS"
- summary: "Check recent kernel/OOM logs"
linux: "sudo dmesg | tail -n 50"
windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Message -match 'memory' }"
remediation:
- "Restart or reconfigure offender; add swap as stop-gap; increase VM memory allocation."
llm_prompt: >
High Mem alert for {{ host }}. After reviewing free memory, swap activity, and top processes, explain the likely cause and propose remediation steps with priority.
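  # Hedged Flux sketch of the memory condition, assuming the host exclusions from the
  # description are expressed as a prefix regex; adjust to match the real rule's filters.
  #   from(bucket: "telegraf")
  #     |> range(start: -10m)
  #     |> filter(fn: (r) => r._measurement == "mem" and r._field == "used_percent")
  #     |> filter(fn: (r) => r.host !~ /^(jhci|nerv|magi)/)
  #     |> group(columns: ["host"])
  #     |> mean()
  #     |> filter(fn: (r) => r._value > 95.0)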
- name: "High Disk IO"
rule_uid: "bdkmtaru7ru2od"
description: "Mean merged_reads/writes per second converted to GB/s exceeds 10."
signal:
metric: "diskio.merged_reads + merged_writes"
condition: "mean over 10m > 10 GB/s"
impact: "Storage controller saturated; latency spikes, possible backlog."
evidence_to_collect:
- "iostat extended output"
- "Process level IO (pidstat/nethogs equivalent)"
- "ZFS/MDADM status for relevant pools"
triage:
- summary: "Inspect device queues"
linux: "iostat -xzd 5 3"
windows: "Get-WmiObject -Class Win32_PerfFormattedData_PerfDisk_LogicalDisk | Format-Table Name,DiskWritesPersec,DiskReadsPersec,AvgDisksecPerTransfer"
- summary: "Correlate to filesystem / VM"
linux: "sudo lsof +D /mnt/critical -u {{ user }}"
- summary: "Check backup or replication windows"
linux: "journalctl -u pvebackup -n 50"
remediation:
- "Pause heavy jobs, move backups off-peak, evaluate faster storage tiers."
llm_prompt: >
High Disk IO on {{ host }}. With iostat/pidstat output provided, decide whether activity is expected (backup, scrub) or abnormal and list mitigations.
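  # A literal Flux reading of the condition above, assuming the rule takes the per-second
  # derivative of the merged_reads/merged_writes counters and divides by 1e9 for "GB/s";
  # verify against the actual rule before relying on this sketch.
  #   from(bucket: "telegraf")
  #     |> range(start: -10m)
  #     |> filter(fn: (r) => r._measurement == "diskio" and
  #                          (r._field == "merged_reads" or r._field == "merged_writes"))
  #     |> derivative(unit: 1s, nonNegative: true)
  #     |> group(columns: ["host"])
  #     |> mean()
  #     |> map(fn: (r) => ({ r with _value: r._value / 1000000000.0 }))
  #     |> filter(fn: (r) => r._value > 10.0)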
- name: "Low Uptime"
rule_uid: "ddkmuadxvkm4ge"
description: "System uptime converted to minutes is below 10 -> host rebooted recently."
signal:
metric: "system.uptime"
condition: "last uptime_minutes < 10"
impact: "Unexpected reboot or crash; may need RCA."
evidence_to_collect:
- "Boot reason logs"
- "Last patch/maintenance window from Ansible inventory"
- "Smart log excerpt for power events"
triage:
- summary: "Confirm uptime and reason"
linux: "uptime && last -x | head"
windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Id -in 41,6006,6008 }"
- summary: "Check kernel panic or watchdog traces"
linux: "sudo journalctl -k -b -1 | tail -n 200"
- summary: "Validate patch automation logs"
linux: "sudo tail -n 100 /var/log/ansible-pull.log"
remediation:
- "Schedule deeper diagnostics if crash; reschedule workloads once stable."
llm_prompt: >
Low Uptime alert: host restarted within 10 minutes. Inspect boot reason logs and recommend whether this is maintenance or a fault needing follow-up.
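  # Sketch of the seconds -> minutes conversion the rule implies (last uptime value per host);
  # assumes the field names exposed by the telegraf system plugin.
  #   from(bucket: "telegraf")
  #     |> range(start: -15m)
  #     |> filter(fn: (r) => r._measurement == "system" and r._field == "uptime")
  #     |> group(columns: ["host"])
  #     |> last()
  #     |> map(fn: (r) => ({ r with _value: float(v: r._value) / 60.0 }))
  #     |> filter(fn: (r) => r._value < 10.0)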
- name: "High Load"
rule_uid: "ddkmul9x8gcn4d"
description: "system.load5 > 6 for 5 minutes."
signal:
metric: "system.load5"
condition: "last value > 6"
impact: "Runnable queue more than CPU threads -> latency growth."
evidence_to_collect:
- "Load vs CPU count (`nproc`)"
- "Process states (D/R blocked tasks)"
- "IO wait percentage"
triage:
- summary: "Correlate load to CPU and IO"
linux: "uptime && vmstat 1 5"
- summary: "Identify stuck IO"
linux: "sudo pidstat -d 1 5"
- summary: "Check Proxmox scheduler for resource contention"
linux: "pveperf && qm list"
remediation:
- "Reduce cron concurrency, add CPU, or fix IO bottleneck causing runnable queue growth."
llm_prompt: >
High Load alert on {{ host }}. Based on vmstat/pidstat output, explain whether CPU saturation, IO wait, or runnable pile-up is at fault and propose actions.
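  # Minimal Flux sketch for the load check, assuming load5 from the telegraf system plugin;
  # compare the result against `nproc` on the host to judge severity.
  #   from(bucket: "telegraf")
  #     |> range(start: -5m)
  #     |> filter(fn: (r) => r._measurement == "system" and r._field == "load5")
  #     |> group(columns: ["host"])
  #     |> last()
  #     |> filter(fn: (r) => r._value > 6.0)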
- name: "High Network Traffic (Download)"
rule_uid: "cdkpct82a7g8wd"
description: "Derivative of bytes_recv > 50 MB/s on any interface over last hour."
signal:
metric: "net.bytes_recv"
condition: "mean download throughput > 50 MB/s"
impact: "Link saturation, potential DDoS or backup window."
evidence_to_collect:
- "Interface counters (Linux: `ip -s link show {{ iface }}`; Windows: `Get-NetAdapterStatistics`)"
- "Top talkers (Linux: `sudo nethogs {{ iface }}` or `iftop -i {{ iface }}`)"
- "Firewall/IDS logs"
triage:
- summary: "Confirm interface experiencing spike"
linux: "sar -n DEV 1 5 | grep {{ iface }}"
windows: "Get-Counter -Counter '\\Network Interface({{ iface }})\\Bytes Received/sec' -Continuous -SampleInterval 1 -MaxSamples 5"
- summary: "Identify process or remote peer"
linux: "sudo ss -ntu state established | sort -k4"
windows: "Get-NetTCPConnection | Sort-Object -Property LocalPort"
remediation:
- "Throttle offending transfers, move backup replication, verify no compromised service."
llm_prompt: >
High download throughput on {{ host }} interface {{ iface }}. Review interface counters and connection list to determine if traffic is expected and advise throttling or blocking steps.
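  # Hedged Flux sketch of the download-rate condition: non-negative derivative of bytes_recv
  # averaged per host/interface and converted to decimal MB/s. The same shape applies to the
  # upload alert below with bytes_sent and a 30 MB/s threshold.
  #   from(bucket: "telegraf")
  #     |> range(start: -1h)
  #     |> filter(fn: (r) => r._measurement == "net" and r._field == "bytes_recv")
  #     |> derivative(unit: 1s, nonNegative: true)
  #     |> group(columns: ["host", "interface"])
  #     |> mean()
  #     |> map(fn: (r) => ({ r with _value: r._value / 1000000.0 }))
  #     |> filter(fn: (r) => r._value > 50.0)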
- name: "High Network Traffic (Upload)"
rule_uid: "aec650pbtvzswa"
description: "Derivative of bytes_sent > 30 MB/s for an interface."
signal:
metric: "net.bytes_sent"
condition: "mean upload throughput > 30 MB/s"
impact: "Excess upstream usage; may saturate ISP uplink."
evidence_to_collect:
- "Interface statistics"
- "NetFlow sample if available (`/var/log/telegraf/netflow.log`)"
- "List of active transfers"
triage:
- summary: "Measure upload curve"
linux: "bmon -p {{ iface }} -o ascii"
windows: "Get-Counter '\\Network Interface({{ iface }})\\Bytes Sent/sec' -Continuous -SampleInterval 1 -MaxSamples 5"
- summary: "Find process generating traffic"
linux: "sudo iftop -i {{ iface }} -t -s 30"
windows: "Get-NetAdapterStatistics -Name {{ iface }}"
remediation:
- "Pause replication jobs, confirm backups not stuck, search for data exfiltration."
llm_prompt: >
High upload alert for {{ host }} interface {{ iface }}. Using captured traffic samples, determine whether replication/backup explains the pattern or if anomalous traffic needs blocking.
- name: "High Disk Usage"
rule_uid: "cdma6i5k2gem8d"
description: "Disk used_percent >= 95% for Linux devices (filters out unwanted devices)."
signal:
metric: "disk.used_percent"
condition: "last value > 95%"
impact: "Filesystem full -> service crashes or write failures."
evidence_to_collect:
- "`df -h` or `Get-Volume` output for device"
- "Largest directories snapshot (Linux: `sudo du -xhd1 /path`; Windows: `Get-ChildItem | Sort Length`)"
- "Recent deploy or backup expansion logs"
triage:
- summary: "Validate usage"
linux: "df -h {{ mountpoint }}"
windows: "Get-Volume -FileSystemLabel {{ volume }}"
- summary: "Identify growth trend"
linux: "sudo journalctl -u telegraf -g 'disk usage' -n 20"
- summary: "Check for stale docker volumes"
linux: "docker system df && docker volume ls"
remediation:
- "Prune temp artifacts, expand disk/VM, move logs to remote storage."
llm_prompt: >
High Disk Usage alert on {{ host }} device {{ device }}. Summarize what consumed the space and recommend reclaim or expansion actions with priority.
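  # Flux sketch for the disk-usage check (assumed grouping tags; the rule's device exclusion
  # filter is omitted here).
  #   from(bucket: "telegraf")
  #     |> range(start: -5m)
  #     |> filter(fn: (r) => r._measurement == "disk" and r._field == "used_percent")
  #     |> group(columns: ["host", "device", "path"])
  #     |> last()
  #     |> filter(fn: (r) => r._value > 95.0)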
- name: "CPU Heartbeat"
rule_uid: "eec62gqn3oetcf"
description: "Counts cpu.usage_system samples per host; fires if <1 sample arrives within window."
signal:
metric: "cpu.usage_system"
condition: "sample count within 10m < 1"
impact: "Indicates host stopped reporting metrics entirely (telemetry silent)."
evidence_to_collect:
- "Influx query for recent cpu samples"
- "Telegraf service and logs"
- "Network reachability from host to casper"
triage:
- summary: "Check host alive and reachable"
linux: "ping -c 3 {{ host }} && ssh {{ host }} uptime"
windows: "Test-Connection {{ host }} -Count 3"
- summary: "Inspect Telegraf state"
linux: "sudo systemctl status telegraf && sudo tail -n 100 /var/log/telegraf/telegraf.log"
windows: "Get-Service telegraf; Get-EventLog -LogName Application -Newest 50 | ? { $_.Source -match 'Telegraf' }"
- summary: "Validate API key / Influx auth"
linux: "sudo grep -n 'outputs.influxdb' -n /etc/telegraf/telegraf.conf"
remediation:
- "Re-issue Telegraf credentials, run `ansible-playbook telemetry.yml -l {{ host }}`."
- "If host intentionally offline, silence alert via Grafana maintenance window."
llm_prompt: >
CPU Heartbeat for {{ host }} indicates telemetry silent. Use connectivity tests and Telegraf logs to determine if host is down or just metrics disabled; propose fixes.