# Grafana alert triage playbook for the HomeLab telemetry stack.
# Each entry contains the alert metadata, what the signal means,
# the evidence to capture automatically, and the manual / scripted steps.
metadata:
  generated: "2025-09-22T00:00:00Z"
  grafana_url: "http://casper:3000"
  datasource: "InfluxDB telegraf (uid=P951FEA4DE68E13C5)"
  llm_provider: "OpenRouter"

alerts:
  - name: "Data Stale"
    rule_uid: "fdk9orif6fytcf"
    description: "No CPU usage_user metrics have arrived for non-unit hosts within 5 minutes."
    signal:
      metric: "cpu.usage_user"
      condition: "count(host samples over 5m) < 1"
    impact: "Host is no longer reporting to Telegraf/Influx -> monitoring blind spot."
    evidence_to_collect:
      - "Influx: `from(bucket:\"telegraf\") |> range(start:-10m) |> filter(fn:(r)=>r._measurement==\"cpu\" and r.host==\"{{ host }}\") |> count()`"
      - "Telegraf log tail"
      - "System journal for network/auth errors"
    triage:
      - summary: "Verify Telegraf agent health"
        linux: "sudo systemctl status telegraf && sudo journalctl -u telegraf -n 100"
        windows: "Get-Service telegraf; Get-Content 'C:\\Program Files\\telegraf\\telegraf.log' -Tail 100"
      - summary: "Check connectivity from host to Influx (`casper:8086`)"
        linux: "curl -sSf http://casper:8086/ping"
        windows: "Invoke-WebRequest -UseBasicParsing http://casper:8086/ping"
      - summary: "Confirm host clock drift <5s (important for Influx line protocol timestamps)"
        linux: "chronyc tracking"
        windows: "w32tm /query /status"
    remediation:
      - "Restart Telegraf after config validation: `sudo telegraf --test --config /etc/telegraf/telegraf.conf` then `sudo systemctl restart telegraf`."
      - "Re-apply Ansible telemetry playbook if multiple hosts fail."
    llm_prompt: >
      Alert {{ alertname }} fired for {{ host }}. Telegraf stopped sending cpu.usage_user
      metrics. Given the collected logs and command output, identify root causes
      (agent down, auth failures, firewall, time skew) and list the next action.
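# A minimal sketch of running the stale-data evidence query automatically (not part of
# the deployed tooling): it assumes the InfluxDB on casper:8086 exposes the Flux
# /api/v2/query endpoint, and the INFLUX_ORG / INFLUX_TOKEN environment variables are
# assumed names that are not defined elsewhere in this playbook. Grouping by host
# (instead of filtering one host) yields the same per-host sample count that the
# "Data Stale" and "CPU Heartbeat" rules key on.
#
#   import os
#   import requests
#
#   FLUX = '''
#   from(bucket: "telegraf")
#     |> range(start: -10m)
#     |> filter(fn: (r) => r._measurement == "cpu" and r._field == "usage_user")
#     |> group(columns: ["host"])
#     |> count()
#   '''
#
#   resp = requests.post(
#       "http://casper:8086/api/v2/query",
#       params={"org": os.environ["INFLUX_ORG"]},
#       headers={
#           "Authorization": f"Token {os.environ['INFLUX_TOKEN']}",
#           "Content-Type": "application/vnd.flux",
#           "Accept": "application/csv",
#       },
#       data=FLUX,
#       timeout=10,
#   )
#   resp.raise_for_status()
#   print(resp.text)  # annotated CSV: one count row per host that is still reporting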
  - name: "High CPU"
    rule_uid: "fdkms407ubmdcc"
    description: "Mean CPU usage_system over the last 10 minutes exceeds 85%."
    signal:
      metric: "cpu.usage_system"
      condition: "mean over 10m > 85%"
    impact: "Host is near saturation; scheduler latency and queueing likely."
    evidence_to_collect:
      - "Top CPU processes snapshot (Linux: `ps -eo pid,cmd,%cpu --sort=-%cpu | head -n 15`; Windows: `Get-Process | Sort-Object CPU -Descending | Select -First 15`)"
      - "Load vs CPU core count"
      - "Recent deploys / cron jobs metadata"
    triage:
      - summary: "Confirm sustained CPU pressure"
        linux: "uptime && mpstat 1 5"
        windows: "typeperf \"\\Processor(_Total)\\% Processor Time\" -sc 15"
      - summary: "Check offending processes/services"
        linux: "sudo ps -eo pid,user,comm,%cpu,%mem --sort=-%cpu | head"
        windows: "Get-Process | Sort-Object CPU -Descending | Select -First 10 Name,CPU"
      - summary: "Inspect cgroup / VM constraints if on Proxmox"
        linux: "sudo pct status {{ vmid }} && sudo pct config {{ vmid }}"
    remediation:
      - "Throttle or restart runaway service; scale workload or tune limits."
      - "Consider moving noisy neighbors off shared hypervisor."
    llm_prompt: >
      High CPU alert for {{ host }}. Review process table, recent deploys, and
      virtualization context; determine why cpu.usage_system stayed above 85% and
      recommend mitigation.

  - name: "High Mem."
    rule_uid: "edkmsdmlay2o0c"
    description: "Mean memory used_percent over 10 minutes > 95% (excluding hosts jhci/nerv*/magi*)."
    signal:
      metric: "mem.used_percent"
      condition: "mean over 10m > 95%"
    impact: "OOM risk and swap thrash."
    evidence_to_collect:
      - "Free/available memory snapshot"
      - "Top consumers (Linux: `sudo smem -r -s rss | head`; Windows: `Get-Process | Sort-Object WorkingSet -Descending`)"
      - "Swap in/out metrics"
    triage:
      - summary: "Validate actual memory pressure"
        linux: "free -m && vmstat -SM 5 5"
        windows: "Get-Counter '\\Memory\\Available MBytes'"
      - summary: "Identify leaking services"
        linux: "sudo ps -eo pid,user,comm,%mem,rss --sort=-%mem | head"
        windows: "Get-Process | Sort-Object WS -Descending | Select -First 10 ProcessName,WS"
      - summary: "Check recent kernel/OOM logs"
        linux: "sudo dmesg | tail -n 50"
        windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Message -match 'memory' }"
    remediation:
      - "Restart or reconfigure offender; add swap as stop-gap; increase VM memory allocation."
    llm_prompt: >
      High Mem alert for {{ host }}. After reviewing free memory, swap activity, and top
      processes, explain the likely cause and propose remediation steps with priority.
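# A quick on-host cross-check of the "High Mem." signal (a sketch; the Influx data is
# authoritative): it approximates used_percent from /proc/meminfo as
# (MemTotal - MemAvailable) / MemTotal. Telegraf's mem plugin computes the field
# somewhat differently, so treat this only as a sanity check before digging into
# per-process consumers.
#
#   def meminfo_kb() -> dict:
#       vals = {}
#       with open("/proc/meminfo") as f:
#           for line in f:
#               key, rest = line.split(":", 1)
#               vals[key] = int(rest.strip().split()[0])  # values are reported in kB
#       return vals
#
#   m = meminfo_kb()
#   used_pct = 100.0 * (m["MemTotal"] - m["MemAvailable"]) / m["MemTotal"]
#   swap_used_kb = m["SwapTotal"] - m["SwapFree"]
#   print(f"approx used_percent={used_pct:.1f}%  swap_used={swap_used_kb} kB")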
  - name: "High Disk IO"
    rule_uid: "bdkmtaru7ru2od"
    description: "Mean merged_reads/writes per second, converted to GB/s, exceeds 10."
    signal:
      metric: "diskio.merged_reads + merged_writes"
      condition: "mean over 10m > 10 GB/s"
    impact: "Storage controller saturated; latency spikes, possible backlog."
    evidence_to_collect:
      - "iostat extended output"
      - "Process-level IO (`pidstat -d`, the per-process disk equivalent of nethogs)"
      - "ZFS/MDADM status for relevant pools"
    triage:
      - summary: "Inspect device queues"
        linux: "iostat -xzd 5 3"
        windows: "Get-WmiObject -Class Win32_PerfFormattedData_PerfDisk_LogicalDisk | Format-Table Name,DiskWritesPersec,DiskReadsPersec,AvgDiskSecPerTransfer"
      - summary: "Correlate to filesystem / VM"
        linux: "sudo lsof +D /mnt/critical -u {{ user }}"
      - summary: "Check backup or replication windows"
        linux: "journalctl -u pvebackup -n 50"
    remediation:
      - "Pause heavy jobs, move backups off-peak, evaluate faster storage tiers."
    llm_prompt: >
      High Disk IO on {{ host }}. With iostat/pidstat output provided, decide whether
      activity is expected (backup, scrub) or abnormal and list mitigations.

  - name: "Low Uptime"
    rule_uid: "ddkmuadxvkm4ge"
    description: "System uptime converted to minutes is below 10 -> host rebooted recently."
    signal:
      metric: "system.uptime"
      condition: "last uptime_minutes < 10"
    impact: "Unexpected reboot or crash; may need RCA."
    evidence_to_collect:
      - "Boot reason logs"
      - "Last patch/maintenance window from Ansible inventory"
      - "SMART log excerpt for power events"
    triage:
      - summary: "Confirm uptime and reason"
        linux: "uptime && last -x | head"
        windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Id -in 41,6006,6008 }"
      - summary: "Check kernel panic or watchdog traces"
        linux: "sudo journalctl -k -b -1 | tail -n 200"
      - summary: "Validate patch automation logs"
        linux: "sudo tail -n 100 /var/log/ansible-pull.log"
    remediation:
      - "Schedule deeper diagnostics if this was a crash; reschedule workloads once stable."
    llm_prompt: >
      Low Uptime alert: host restarted within 10 minutes. Inspect boot reason logs and
      recommend whether this is maintenance or a fault needing follow-up.

  - name: "High Load"
    rule_uid: "ddkmul9x8gcn4d"
    description: "system.load5 > 6 for 5 minutes."
    signal:
      metric: "system.load5"
      condition: "last value > 6"
    impact: "Runnable queue longer than available CPU threads -> latency growth."
    evidence_to_collect:
      - "Load vs CPU count (`nproc`)"
      - "Process states (D/R blocked tasks)"
      - "IO wait percentage"
    triage:
      - summary: "Correlate load to CPU and IO"
        linux: "uptime && vmstat 1 5"
      - summary: "Identify stuck IO"
        linux: "sudo pidstat -d 1 5"
      - summary: "Check Proxmox scheduler for resource contention"
        linux: "pveperf && qm list"
    remediation:
      - "Reduce cron concurrency, add CPU, or fix IO bottleneck causing runnable queue growth."
    llm_prompt: >
      High Load alert on {{ host }}. Based on vmstat/pidstat output, explain whether CPU
      saturation, IO wait, or runnable pile-up is at fault and propose actions.

  - name: "High Network Traffic (Download)"
    rule_uid: "cdkpct82a7g8wd"
    description: "Derivative of bytes_recv > 50 MB/s on any interface over the last hour."
    signal:
      metric: "net.bytes_recv"
      condition: "mean download throughput > 50 MB/s"
    impact: "Link saturation, potential DDoS or backup window."
    evidence_to_collect:
      - "Interface counters (Linux: `ip -s link show {{ iface }}`; Windows: `Get-NetAdapterStatistics`)"
      - "Top talkers (Linux: `sudo nethogs {{ iface }}` or `iftop -i {{ iface }}`)"
      - "Firewall/IDS logs"
    triage:
      - summary: "Confirm interface experiencing spike"
        linux: "sar -n DEV 1 5 | grep {{ iface }}"
        windows: "Get-Counter -Counter '\\Network Interface({{ iface }})\\Bytes Received/sec' -SampleInterval 1 -MaxSamples 5"
      - summary: "Identify process or remote peer"
        linux: "sudo ss -ntu state established | sort -k4"
        windows: "Get-NetTCPConnection | Sort-Object -Property LocalPort"
    remediation:
      - "Throttle offending transfers, move backup replication, verify no compromised service."
    llm_prompt: >
      High download throughput on {{ host }} interface {{ iface }}. Review interface
      counters and connection list to determine if traffic is expected and advise
      throttling or blocking steps.

  - name: "High Network Traffic (Upload)"
    rule_uid: "aec650pbtvzswa"
    description: "Derivative of bytes_sent > 30 MB/s for an interface."
    signal:
      metric: "net.bytes_sent"
      condition: "mean upload throughput > 30 MB/s"
    impact: "Excess upstream usage; may saturate ISP uplink."
    evidence_to_collect:
      - "Interface statistics"
      - "NetFlow sample if available (`/var/log/telegraf/netflow.log`)"
      - "List of active transfers"
    triage:
      - summary: "Measure upload curve"
        linux: "bmon -p {{ iface }} -o ascii"
        windows: "Get-Counter '\\Network Interface({{ iface }})\\Bytes Sent/sec' -SampleInterval 1 -MaxSamples 5"
      - summary: "Find process generating traffic"
        linux: "sudo iftop -i {{ iface }} -t -s 30"
        windows: "Get-NetAdapterStatistics -Name {{ iface }}"
    remediation:
      - "Pause replication jobs, confirm backups not stuck, search for data exfiltration."
    llm_prompt: >
      High upload alert for {{ host }} interface {{ iface }}. Using captured traffic
      samples, determine whether replication/backup explains the pattern or if anomalous
      traffic needs blocking.
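# A small on-host throughput check (a sketch; the 5 s window is an arbitrary choice and
# MB/s here means 10^6 bytes/s): it samples /proc/net/dev twice and prints per-interface
# receive/transmit rates, which can be compared directly against the 50 MB/s download
# and 30 MB/s upload thresholds above.
#
#   import time
#
#   def read_counters():
#       counters = {}
#       with open("/proc/net/dev") as f:
#           for line in f.readlines()[2:]:            # skip the two header lines
#               iface, data = line.split(":", 1)
#               fields = data.split()
#               counters[iface.strip()] = (int(fields[0]), int(fields[8]))  # rx, tx bytes
#       return counters
#
#   before = read_counters()
#   time.sleep(5)
#   after = read_counters()
#   for iface in sorted(after):
#       rx = (after[iface][0] - before[iface][0]) / 5 / 1e6
#       tx = (after[iface][1] - before[iface][1]) / 5 / 1e6
#       print(f"{iface:12s} rx={rx:8.2f} MB/s  tx={tx:8.2f} MB/s")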
  - name: "High Disk Usage"
    rule_uid: "cdma6i5k2gem8d"
    description: "Disk used_percent >= 95% for Linux devices (filters out unwanted devices)."
    signal:
      metric: "disk.used_percent"
      condition: "last value > 95%"
    impact: "Filesystem full -> service crashes or write failures."
    evidence_to_collect:
      - "`df -h` or `Get-Volume` output for device"
      - "Largest directories snapshot (Linux: `sudo du -xhd1 /path`; Windows: `Get-ChildItem | Sort Length`)"
      - "Recent deploy or backup expansion logs"
    triage:
      - summary: "Validate usage"
        linux: "df -h {{ mountpoint }}"
        windows: "Get-Volume -FileSystemLabel {{ volume }}"
      - summary: "Identify growth trend"
        linux: "sudo journalctl -u telegraf -g 'disk usage' -n 20"
      - summary: "Check for stale docker volumes"
        linux: "docker system df && docker volume ls"
    remediation:
      - "Prune temp artifacts, expand disk/VM, move logs to remote storage."
    llm_prompt: >
      High Disk Usage alert on {{ host }} device {{ device }}. Summarize what consumed
      the space and recommend reclaim or expansion actions with priority.

  - name: "CPU Heartbeat"
    rule_uid: "eec62gqn3oetcf"
    description: "Counts cpu.usage_system samples per host; fires if <1 sample arrives within window."
    signal:
      metric: "cpu.usage_system"
      condition: "sample count within 10m < 1"
    impact: "Indicates host stopped reporting metrics entirely (telemetry silent)."
    evidence_to_collect:
      - "Influx query for recent cpu samples"
      - "Telegraf service and logs"
      - "Network reachability from host to casper"
    triage:
      - summary: "Check host alive and reachable"
        linux: "ping -c 3 {{ host }} && ssh {{ host }} uptime"
        windows: "Test-Connection {{ host }} -Count 3"
      - summary: "Inspect Telegraf state"
        linux: "sudo systemctl status telegraf && sudo tail -n 100 /var/log/telegraf/telegraf.log"
        windows: "Get-Service telegraf; Get-EventLog -LogName Application -Newest 50 | ? { $_.Source -match 'Telegraf' }"
      - summary: "Validate API key / Influx auth"
        linux: "sudo grep -n 'outputs.influxdb' /etc/telegraf/telegraf.conf"
    remediation:
      - "Re-issue Telegraf credentials, run `ansible-playbook telemetry.yml -l {{ host }}`."
      - "If host intentionally offline, silence alert via Grafana maintenance window."
    llm_prompt: >
      CPU Heartbeat for {{ host }} indicates telemetry silent. Use connectivity tests and
      Telegraf logs to determine if host is down or just metrics disabled; propose fixes.
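# How the llm_prompt fields are consumed is not specified in this playbook; below is a
# minimal sketch, assuming the prompts are rendered with simple string substitution and
# sent to OpenRouter's OpenAI-compatible chat completions endpoint. The file name
# (alert-triage.yml), the model id, the example host, and the OPENROUTER_API_KEY
# variable are all assumptions, and PyYAML plus requests must be installed.
#
#   import os
#   import requests
#   import yaml
#
#   doc = yaml.safe_load(open("alert-triage.yml"))            # hypothetical path to this file
#   entry = next(a for a in doc["alerts"] if a["name"] == "High CPU")
#
#   evidence = "<paste collected command output here>"
#   prompt = (entry["llm_prompt"]
#             .replace("{{ alertname }}", entry["name"])
#             .replace("{{ host }}", "examplehost"))          # hypothetical host name
#
#   resp = requests.post(
#       "https://openrouter.ai/api/v1/chat/completions",
#       headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
#       json={
#           "model": "openai/gpt-4o-mini",                    # model choice is an assumption
#           "messages": [{"role": "user", "content": prompt + "\n\n" + evidence}],
#       },
#       timeout=60,
#   )
#   resp.raise_for_status()
#   print(resp.json()["choices"][0]["message"]["content"])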