# docker-stacks/stacks/mllogwatcher/scripts/grafana_alert_webhook.py
# Snapshot metadata: 2025-12-31 20:11:44 -05:00 — 989 lines, 39 KiB, executable Python file.
#!/usr/bin/env python3
"""
Minimal FastAPI web server that accepts Grafana alert webhooks, looks up the
matching runbook entry, builds an LLM prompt, and calls OpenRouter to return a
triage summary.
Run with:
uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081
Environment variables:
RUNBOOK_PATH Path to alert_runbook.yaml (default: ./alert_runbook.yaml)
OPENROUTER_API_KEY Required; API token for https://openrouter.ai
OPENROUTER_MODEL Optional; default openai/gpt-4o-mini
OPENROUTER_REFERER Optional referer header
OPENROUTER_TITLE Optional title header (default: Grafana Alert Webhook)
"""
from __future__ import annotations
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import json
import shlex
import subprocess
from textwrap import indent
import smtplib
from email.message import EmailMessage
import requests
import yaml
from fastapi import FastAPI, HTTPException, Request
from langchain.llms.base import LLM
LOGGER = logging.getLogger("grafana_webhook")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Input files: the runbook YAML keyed by Grafana rule UID, and an optional
# INI-style Ansible inventory used to resolve SSH connection details per host.
RUNBOOK_PATH = Path(os.environ.get("RUNBOOK_PATH", "alert_runbook.yaml"))
ANSIBLE_HOSTS_PATH = Path(os.environ.get("ANSIBLE_HOSTS_PATH", "/etc/ansible/hosts"))
# OpenRouter credentials and optional attribution headers for the LLM calls.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini")
OPENROUTER_REFERER = os.environ.get("OPENROUTER_REFERER")
OPENROUTER_TITLE = os.environ.get("OPENROUTER_TITLE", "Grafana Alert Webhook")
# Global kill switch: triage/follow-up commands run only when this is truthy.
TRIAGE_ENABLE_COMMANDS = os.environ.get("TRIAGE_ENABLE_COMMANDS", "0").lower() in {"1", "true", "yes", "on"}
# Default command runner ("ssh" or "local") plus SSH connection defaults.
TRIAGE_COMMAND_RUNNER = os.environ.get("TRIAGE_COMMAND_RUNNER", "ssh").lower()
TRIAGE_SSH_USER = os.environ.get("TRIAGE_SSH_USER", "root")
TRIAGE_SSH_OPTIONS = shlex.split(
    os.environ.get("TRIAGE_SSH_OPTIONS", "-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5")
)
TRIAGE_COMMAND_TIMEOUT = int(os.environ.get("TRIAGE_COMMAND_TIMEOUT", "60"))  # seconds per command
TRIAGE_DEFAULT_OS = os.environ.get("TRIAGE_DEFAULT_OS", "linux").lower()  # fallback when OS can't be inferred
TRIAGE_MAX_COMMANDS = int(os.environ.get("TRIAGE_MAX_COMMANDS", "3"))  # cap on initial triage commands
TRIAGE_OUTPUT_LIMIT = int(os.environ.get("TRIAGE_OUTPUT_LIMIT", "1200"))  # max chars kept per command output
# LangChain-driven investigation loop
TRIAGE_MAX_ITERATIONS = int(os.environ.get("TRIAGE_MAX_ITERATIONS", "3"))  # LLM round-trips per alert
TRIAGE_FOLLOWUP_MAX_COMMANDS = int(os.environ.get("TRIAGE_FOLLOWUP_MAX_COMMANDS", "4"))  # follow-up command budget
TRIAGE_SYSTEM_PROMPT = os.environ.get(
    "TRIAGE_SYSTEM_PROMPT",
    (
        "You are assisting with on-call investigations. Always reply with JSON containing:\n"
        "analysis: your findings and next steps.\n"
        "followup_commands: list of command specs (summary, command, optional runner/os) to gather more data.\n"
        "complete: true when sufficient information is gathered.\n"
        "Request commands only when more evidence is required."
    ),
)
TRIAGE_VERBOSE_LOGS = os.environ.get("TRIAGE_VERBOSE_LOGS", "0").lower() in {"1", "true", "yes", "on"}
# Optional SMTP notification settings; validated by email_notifications_configured().
TRIAGE_EMAIL_ENABLED = os.environ.get("TRIAGE_EMAIL_ENABLED", "0").lower() in {"1", "true", "yes", "on"}
TRIAGE_EMAIL_FROM = os.environ.get("TRIAGE_EMAIL_FROM")
TRIAGE_EMAIL_TO = [addr.strip() for addr in os.environ.get("TRIAGE_EMAIL_TO", "").split(",") if addr.strip()]
TRIAGE_SMTP_HOST = os.environ.get("TRIAGE_SMTP_HOST")
TRIAGE_SMTP_PORT = int(os.environ.get("TRIAGE_SMTP_PORT", "587"))
TRIAGE_SMTP_USER = os.environ.get("TRIAGE_SMTP_USER")
TRIAGE_SMTP_PASSWORD = os.environ.get("TRIAGE_SMTP_PASSWORD")
TRIAGE_SMTP_STARTTLS = os.environ.get("TRIAGE_SMTP_STARTTLS", "1").lower() in {"1", "true", "yes", "on"}
TRIAGE_SMTP_SSL = os.environ.get("TRIAGE_SMTP_SSL", "0").lower() in {"1", "true", "yes", "on"}
TRIAGE_SMTP_TIMEOUT = int(os.environ.get("TRIAGE_SMTP_TIMEOUT", "20"))  # seconds for SMTP connect/IO
def log_verbose(title: str, content: Any) -> None:
    """Emit structured verbose logs when TRIAGE_VERBOSE_LOGS is enabled."""
    if not TRIAGE_VERBOSE_LOGS:
        return
    # Pretty-print structured payloads; everything else is stringified as-is.
    rendered = (
        json.dumps(content, indent=2, sort_keys=True)
        if isinstance(content, (dict, list))
        else str(content)
    )
    LOGGER.info("%s:\n%s", title, rendered)
def email_notifications_configured() -> bool:
    """Return True only when email alerts are enabled and SMTP settings are complete."""
    if not TRIAGE_EMAIL_ENABLED:
        return False
    if TRIAGE_SMTP_HOST and TRIAGE_EMAIL_FROM and TRIAGE_EMAIL_TO:
        return True
    # Enabled but misconfigured: warn so the operator can fix the environment.
    LOGGER.warning(
        "Email notifications requested but TRIAGE_SMTP_HOST/TRIAGE_EMAIL_FROM/TRIAGE_EMAIL_TO are incomplete."
    )
    return False
def format_command_results_for_email(results: List[Dict[str, Any]]) -> str:
    """Render executed command results as plain text for the summary email."""
    if not results:
        return "No automation commands were executed."
    rendered: List[str] = []
    for item in results:
        rendered.append(f"- {item.get('summary')} [{item.get('status')}] {item.get('command')}")
        out = item.get("stdout")
        err = item.get("stderr")
        failure = item.get("error")
        if out:
            rendered.append(indent(truncate_text(out, 800), " stdout: "))
        if err:
            rendered.append(indent(truncate_text(err, 800), " stderr: "))
        if failure and item.get("status") != "ok":
            rendered.append(f" error: {failure}")
    return "\n".join(rendered)
def build_email_body(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> str:
    """Compose the plain-text body of the triage summary email."""
    value = alert.get("value") or alert.get("annotations", {}).get("value")
    header = [
        f"Alert: {result.get('alertname')} ({result.get('rule_uid')})",
        f"Host: {result.get('host') or context.get('host')}",
        f"Status: {alert.get('status')}",
        f"Value: {value}",
        f"Grafana Rule: {context.get('rule_url')}",
    ]
    summary_section = ["", "LLM Summary:", result.get("llm_summary") or "(no summary returned)"]
    commands_section = [
        "",
        "Command Results:",
        format_command_results_for_email(result.get("command_results") or []),
    ]
    return "\n".join(header + summary_section + commands_section)
def send_summary_email(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> None:
    """Email the triage result for one alert; no-op when email is unconfigured.

    Any SMTP failure is logged and swallowed so a broken mail relay cannot fail
    the webhook request that triggered the notification.
    """
    if not email_notifications_configured():
        return
    subject_host = result.get("host") or context.get("host") or "(unknown host)"
    subject = f"[Grafana] {result.get('alertname')} - {subject_host}"
    body = build_email_body(alert, result, context)
    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = TRIAGE_EMAIL_FROM
    message["To"] = ", ".join(TRIAGE_EMAIL_TO)
    message.set_content(body)
    try:
        # SMTP_SSL for implicit TLS; otherwise plain SMTP, optionally upgraded
        # via STARTTLS (skipped when the connection is already SSL).
        smtp_class = smtplib.SMTP_SSL if TRIAGE_SMTP_SSL else smtplib.SMTP
        with smtp_class(TRIAGE_SMTP_HOST, TRIAGE_SMTP_PORT, timeout=TRIAGE_SMTP_TIMEOUT) as client:
            if TRIAGE_SMTP_STARTTLS and not TRIAGE_SMTP_SSL:
                client.starttls()
            if TRIAGE_SMTP_USER:
                client.login(TRIAGE_SMTP_USER, TRIAGE_SMTP_PASSWORD or "")
            client.send_message(message)
        LOGGER.info("Sent summary email to %s for host %s", ", ".join(TRIAGE_EMAIL_TO), subject_host)
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.exception("Failed to send summary email: %s", exc)
# FastAPI application instance served by uvicorn (see module docstring).
app = FastAPI(title="Grafana Alert Webhook", version="1.0.0")
# In-memory caches populated at startup and refreshed via POST /reload-runbook.
_RUNBOOK_INDEX: Dict[str, Dict[str, Any]] = {}  # rule_uid -> runbook entry
_INVENTORY_INDEX: Dict[str, Dict[str, Any]] = {}  # normalized hostname -> inventory record
_INVENTORY_GROUP_VARS: Dict[str, Dict[str, str]] = {}  # group name -> [group:vars] values
# Matches {{ var }} placeholders consumed by render_template.
_TEMPLATE_PATTERN = re.compile(r"{{\s*([a-zA-Z0-9_]+)\s*}}")
# System message sent with every OpenRouter chat completion.
DEFAULT_SYSTEM_PROMPT = TRIAGE_SYSTEM_PROMPT
class OpenRouterLLM(LLM):
    """LangChain-compatible LLM that calls OpenRouter chat completions.

    Sends DEFAULT_SYSTEM_PROMPT as the system message plus the caller's prompt,
    and returns the first choice's content. Raises RuntimeError on HTTP errors
    or when the response contains no choices.
    """

    # Declared as class-level fields so the LangChain/pydantic base class
    # accepts them as model attributes.
    api_key: str
    model_name: str

    def __init__(self, api_key: str, model_name: str, **kwargs: Any) -> None:
        super().__init__(api_key=api_key, model_name=model_name, **kwargs)

    @property
    def _llm_type(self) -> str:
        return "openrouter"

    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # Invoke _call directly, bypassing the base class's callback plumbing.
        return self._call(prompt, stop=stop)

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Perform one chat-completion request and return the reply text."""
        payload = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        }
        log_verbose("OpenRouter request payload", payload)
        if stop:
            payload["stop"] = stop
        LOGGER.info("Posting to OpenRouter model=%s via LangChain", self.model_name)
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        # Optional attribution headers recognized by OpenRouter.
        if OPENROUTER_REFERER:
            headers["HTTP-Referer"] = OPENROUTER_REFERER
        if OPENROUTER_TITLE:
            headers["X-Title"] = OPENROUTER_TITLE
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers, timeout=90)
        if response.status_code >= 400:
            # Prefer the structured error body; fall back to raw text.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            raise RuntimeError(f"OpenRouter error {response.status_code}: {detail}")
        data = response.json()
        log_verbose("OpenRouter raw response", data)
        choices = data.get("choices")
        if not choices:
            raise RuntimeError("OpenRouter returned no choices")
        return choices[0]["message"]["content"].strip()
def load_runbook() -> Dict[str, Dict[str, Any]]:
    """Load runbook YAML into a dict keyed by rule_uid."""
    if not RUNBOOK_PATH.exists():
        raise FileNotFoundError(f"Runbook file not found: {RUNBOOK_PATH}")
    with RUNBOOK_PATH.open("r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle) or {}
    index: Dict[str, Dict[str, Any]] = {}
    # Entries without a rule_uid cannot be matched to alerts and are skipped.
    for entry in document.get("alerts", []):
        uid = entry.get("rule_uid")
        if uid:
            index[str(uid)] = entry
    LOGGER.info("Loaded %d runbook entries from %s", len(index), RUNBOOK_PATH)
    return index
def _normalize_host_key(host: str) -> str:
    """Canonicalize a hostname for inventory lookups: trimmed and lowercased."""
    return host.lower().strip()
def _parse_key_value_tokens(tokens: List[str]) -> Dict[str, str]:
    """Turn `key=value` tokens into a dict, ignoring tokens without '='."""
    # Split on the first '=' only, so values may themselves contain '='.
    pairs = (token.split("=", 1) for token in tokens if "=" in token)
    return {key: value for key, value in pairs}
def load_ansible_inventory() -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, str]]]:
    """Parse a simple INI-style Ansible hosts file into host/group maps.

    Returns (hosts, group_vars): hosts maps a normalized hostname to
    {"name", "definitions", "groups"}; group_vars maps a group name to the
    key/value pairs from its `[group:vars]` section. Only plain INI syntax is
    handled — `[group]`, `[group:<suffix>]`, comments, and
    `host key=value ...` lines. Returns empty maps when the file is missing.
    """
    if not ANSIBLE_HOSTS_PATH.exists():
        LOGGER.warning("Ansible inventory not found at %s", ANSIBLE_HOSTS_PATH)
        return {}, {}
    hosts: Dict[str, Dict[str, Any]] = {}
    group_vars: Dict[str, Dict[str, str]] = {}
    current_group: Optional[str] = None
    current_section: str = "hosts"  # "hosts" or the suffix of [group:<suffix>]
    with ANSIBLE_HOSTS_PATH.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("[") and line.endswith("]"):
                # Section header: either [group] or [group:vars]/[group:children].
                header = line[1:-1].strip()
                if ":" in header:
                    group_name, suffix = header.split(":", 1)
                    current_group = group_name
                    current_section = suffix
                else:
                    current_group = header
                    current_section = "hosts"
                group_vars.setdefault(current_group, {})
                continue
            # Drop trailing inline comments before tokenizing.
            cleaned = line.split("#", 1)[0].strip()
            if not cleaned:
                continue
            tokens = shlex.split(cleaned)
            if not tokens:
                continue
            if current_section == "vars":
                # Vars outside any group header accumulate under "all".
                vars_dict = _parse_key_value_tokens(tokens)
                group_vars.setdefault(current_group or "all", {}).update(vars_dict)
                continue
            # Host line: first token is the hostname, remaining tokens are vars.
            host_token = tokens[0]
            host_key = _normalize_host_key(host_token)
            entry = hosts.setdefault(host_key, {"name": host_token, "definitions": [], "groups": set()})
            vars_dict = _parse_key_value_tokens(tokens[1:])
            # A host may appear in several groups; keep each definition separately.
            entry["definitions"].append({"group": current_group, "vars": vars_dict})
            if current_group:
                entry["groups"].add(current_group)
    LOGGER.info("Loaded %d Ansible inventory hosts from %s", len(hosts), ANSIBLE_HOSTS_PATH)
    return hosts, group_vars
def _lookup_inventory(host: Optional[str]) -> Optional[Dict[str, Any]]:
    """Find the inventory record for *host*, falling back to its short name."""
    if not host:
        return None
    key = _normalize_host_key(host)
    found = _INVENTORY_INDEX.get(key)
    if found:
        return found
    # FQDN miss: retry with the domain suffix stripped.
    short = key.split(".", 1)[0]
    return _INVENTORY_INDEX.get(short) if short != key else None
def _merge_group_vars(groups: List[str], host_os: Optional[str]) -> Dict[str, str]:
    """Layer group vars over the global "all" vars for this host's groups.

    WinRM-connected groups are skipped for linux hosts so Windows connection
    settings don't bleed into SSH-based execution.
    """
    merged: Dict[str, str] = dict(_INVENTORY_GROUP_VARS.get("all") or {})
    os_name = (host_os or "").lower()
    for group in groups:
        group_vars = _INVENTORY_GROUP_VARS.get(group)
        if not group_vars:
            continue
        is_winrm = (group_vars.get("ansible_connection") or "").lower() == "winrm"
        if is_winrm and os_name == "linux":
            continue
        merged.update(group_vars)
    return merged
def _should_include_definition(group: Optional[str], vars_dict: Dict[str, str], host_os: Optional[str]) -> bool:
    """Decide whether one host definition's vars apply for the inferred OS."""
    if not vars_dict:
        return False
    os_name = (host_os or "").lower()
    connection = (vars_dict.get("ansible_connection") or "").lower()
    if connection == "winrm":
        # WinRM definitions are only usable on Windows hosts.
        return os_name == "windows"
    if connection == "local":
        return True
    # No explicit connection: exclude "windows"-named groups for linux hosts.
    if not connection and group and "windows" in group.lower() and os_name == "linux":
        return False
    return True
def apply_inventory_context(context: Dict[str, Any]) -> None:
    """Augment the alert context with SSH metadata from the Ansible inventory.

    Mutates *context* in place: fills ssh_host/ssh_user/ssh_port/
    ssh_common_args/ssh_identity_file, records the host's inventory groups, and
    hints the preferred runner ("local" vs "ssh"). No-op when the host is not
    present in the inventory.

    Fixes vs. the previous revision: connection details are now extracted
    AFTER the WinRM-var scrub (previously a WinRM ansible_port such as 5986
    had already leaked into ssh_port before being popped), and the duplicated
    inventory_groups setdefault was removed.
    """
    host = context.get("host")
    entry = _lookup_inventory(host)
    if not entry:
        return
    merged_vars = _merge_group_vars(list(entry.get("groups", [])), context.get("host_os"))
    for definition in entry.get("definitions", []):
        group_name = definition.get("group")
        vars_dict = definition.get("vars", {})
        # Host-level vars override group vars, but only when they make sense
        # for the inferred OS (e.g. skip WinRM definitions for linux hosts).
        if _should_include_definition(group_name, vars_dict, context.get("host_os")):
            merged_vars.update(vars_dict)
    connection = (merged_vars.get("ansible_connection") or "").lower()
    host_os = (context.get("host_os") or "").lower()
    if connection == "winrm" and host_os != "windows":
        # Scrub WinRM-specific settings that leaked in for a non-Windows host
        # so they cannot be mistaken for SSH settings below.
        for key in (
            "ansible_connection",
            "ansible_port",
            "ansible_password",
            "ansible_winrm_server_cert_validation",
            "ansible_winrm_scheme",
        ):
            merged_vars.pop(key, None)
        connection = ""
    # Extract connection details only after the scrub above.
    ansible_host = merged_vars.get("ansible_host") or entry.get("name")
    ansible_user = merged_vars.get("ansible_user")
    ansible_port = merged_vars.get("ansible_port")
    ssh_common_args = merged_vars.get("ansible_ssh_common_args")
    ssh_key = merged_vars.get("ansible_ssh_private_key_file")
    context.setdefault("ssh_host", ansible_host or host)
    if ansible_user:
        context["ssh_user"] = ansible_user
    if ansible_port:
        context["ssh_port"] = ansible_port
    if ssh_common_args:
        context["ssh_common_args"] = ssh_common_args
    if ssh_key:
        context["ssh_identity_file"] = ssh_key
    context.setdefault("inventory_groups", list(entry.get("groups", [])))
    if connection == "local":
        context.setdefault("preferred_runner", "local")
    elif connection in {"", "ssh", "smart"}:
        context.setdefault("preferred_runner", "ssh")
def render_template(template: str, context: Dict[str, Any]) -> str:
    """Very small mustache-style renderer for {{ var }} placeholders.

    Unknown placeholders are left untouched; known values are stringified.
    """
    def substitute(match: re.Match[str]) -> str:
        name = match.group(1)
        if name in context:
            return str(context[name])
        return match.group(0)
    return re.sub(r"{{\s*([a-zA-Z0-9_]+)\s*}}", substitute, template)
def extract_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Optional[str]:
    """Grafana webhooks may include rule UID in different fields."""
    labels = alert.get("labels", {})
    # Check direct fields first, then labels, then the parent payload.
    for candidate in (
        alert.get("ruleUid"),
        alert.get("rule_uid"),
        alert.get("ruleId"),
        alert.get("uid"),
        labels.get("rule_uid"),
        labels.get("ruleUid"),
        parent_payload.get("ruleUid"),
        parent_payload.get("rule_uid"),
        parent_payload.get("ruleId"),
    ):
        if candidate:
            return str(candidate)
    # Fall back to Grafana URL parsing if present
    url = (
        alert.get("ruleUrl")
        or parent_payload.get("ruleUrl")
        or alert.get("generatorURL")
        or parent_payload.get("generatorURL")
    )
    if url and "/alerting/" in url:
        return url.rstrip("/").split("/")[-2]
    return None
def derive_fallback_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> str:
    """Construct a deterministic identifier when Grafana omits rule UIDs."""
    labels = alert.get("labels", {})
    fallbacks = (
        alert.get("fingerprint"),
        labels.get("alertname"),
        labels.get("host"),
        labels.get("instance"),
        parent_payload.get("groupKey"),
        parent_payload.get("title"),
    )
    # First truthy candidate wins; a constant sentinel when nothing matches.
    return next((str(value) for value in fallbacks if value), "unknown-alert")
def build_fallback_runbook_entry(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Return a generic runbook entry so every alert can be processed."""
    labels = alert.get("labels", {})
    alertname = labels.get("alertname") or parent_payload.get("title") or "Grafana Alert"
    host = labels.get("host") or labels.get("instance") or "(unknown host)"
    generic_prompt = (
        "Grafana alert {{ alertname }} fired for {{ host }}.\n"
        "No dedicated runbook entry exists. Use the payload details, command outputs, "
        "and your own reasoning to propose likely causes, evidence to gather, and remediation steps."
    )
    return {
        "name": f"{alertname} (auto)",
        "llm_prompt": generic_prompt,
        "triage": [],
        "evidence_to_collect": [],
        "remediation": [],
        "metadata": {"host": host},
    }
def summarize_dict(prefix: str, data: Optional[Dict[str, Any]]) -> str:
    """Render a dict as 'prefix: k=v, ...' (key-sorted), or 'prefix: (none)'."""
    if not data:
        return f"{prefix}: (none)"
    rendered = ", ".join(f"{key}={value}" for key, value in sorted(data.items()))
    return f"{prefix}: {rendered}"
def determine_host_os(alert: Dict[str, Any]) -> str:
    """Infer host operating system from labels, hostname, inventory, or default."""
    labels = alert.get("labels", {})
    for candidate in (labels.get("os"), labels.get("platform"), labels.get("system"), alert.get("os")):
        if not candidate:
            continue
        lowered = str(candidate).lower()
        if "win" in lowered:
            return "windows"
        if "linux" in lowered or "unix" in lowered or "darwin" in lowered:
            return "linux"
    host = (labels.get("host") or labels.get("instance") or "").lower()
    # Parentheses make the original `or`/`and` precedence explicit.
    if host.startswith("win") or (host.endswith(".localdomain") and "win" in host):
        return "windows"
    inventory_os = infer_os_from_inventory(labels.get("host") or labels.get("instance"))
    return inventory_os or TRIAGE_DEFAULT_OS
def infer_os_from_inventory(host: Optional[str]) -> Optional[str]:
    """Guess 'windows' from inventory winrm connections or group names; else None."""
    if not host:
        return None
    entry = _lookup_inventory(host)
    if not entry:
        return None
    definitions = entry.get("definitions", [])
    uses_winrm = any(
        ((definition.get("vars", {}) or {}).get("ansible_connection") or "").lower() == "winrm"
        for definition in definitions
    )
    if uses_winrm:
        return "windows"
    if any("windows" in (group or "").lower() for group in entry.get("groups", [])):
        return "windows"
    # No positive Windows signal: leave the OS undetermined.
    return None
def truncate_text(text: str, limit: int = TRIAGE_OUTPUT_LIMIT) -> str:
    """Trim long outputs to keep prompts manageable."""
    if not text:
        return ""
    trimmed = text.strip()
    # Append an explicit marker so readers know output was cut.
    return trimmed if len(trimmed) <= limit else trimmed[:limit] + "... [truncated]"
def gather_command_specs(entry: Dict[str, Any], host_os: str) -> List[Dict[str, Any]]:
    """Collect command specs from triage steps and optional automation sections."""
    specs: List[Dict[str, Any]] = []
    # Triage steps carry per-OS commands keyed by "linux"/"windows".
    for step in entry.get("triage", []):
        shell_cmd = step.get(host_os)
        if shell_cmd:
            specs.append(
                {
                    "summary": step.get("summary") or entry.get("name") or "triage",
                    "shell": shell_cmd,
                    "runner": step.get("runner"),
                    "os": host_os,
                }
            )
    # Automation commands are included unless tagged for a different OS.
    for item in entry.get("automation_commands", []):
        target_os = item.get("os", host_os)
        if not target_os or target_os.lower() == host_os:
            specs.append(item)
    return specs[:TRIAGE_MAX_COMMANDS] if TRIAGE_MAX_COMMANDS > 0 else specs
def build_runner_command(
    rendered_command: str,
    runner: str,
    context: Dict[str, Any],
    spec: Dict[str, Any],
) -> Tuple[Any, str, bool, str]:
    """Return the subprocess args, display string, shell flag, and runner label.

    The "ssh" runner produces an argv list (shell=False); any other runner
    value falls through to local shell execution of the rendered string
    (shell=True). Spec-level connection settings take precedence over the
    inventory-derived context values. Raises RuntimeError when the ssh runner
    has no target host.
    """
    runner = runner or TRIAGE_COMMAND_RUNNER
    runner = runner.lower()
    if runner == "ssh":
        host = spec.get("host") or context.get("ssh_host") or context.get("host")
        if not host:
            raise RuntimeError("Host not provided for ssh runner.")
        ssh_user = spec.get("ssh_user") or context.get("ssh_user") or TRIAGE_SSH_USER
        ssh_target = spec.get("ssh_target") or f"{ssh_user}@{host}"
        # Start from the global SSH option defaults, then layer on
        # per-spec/per-host extras, port, and identity file.
        ssh_options = list(TRIAGE_SSH_OPTIONS)
        common_args = spec.get("ssh_common_args") or context.get("ssh_common_args")
        if common_args:
            ssh_options.extend(shlex.split(common_args))
        ssh_port = spec.get("ssh_port") or context.get("ssh_port")
        if ssh_port:
            ssh_options.extend(["-p", str(ssh_port)])
        identity_file = spec.get("ssh_identity_file") or context.get("ssh_identity_file")
        if identity_file:
            ssh_options.extend(["-i", identity_file])
        command_list = ["ssh", *ssh_options, ssh_target, rendered_command]
        # Quoted display string is for logs/reports only, not execution.
        display = " ".join(shlex.quote(part) for part in command_list)
        return command_list, display, False, "ssh"
    # default to local shell execution
    display = rendered_command
    return rendered_command, display, True, "local"
def run_subprocess_command(
    command: Any,
    display: str,
    summary: str,
    use_shell: bool,
    runner_label: str,
) -> Dict[str, Any]:
    """Execute subprocess command and capture results.

    Returns a result dict (summary/command/runner/exit_code/stdout/stderr)
    with status "ok", "failed", "timeout", or "error". Never raises: timeouts
    and unexpected execution errors are folded into the result dict so the
    triage loop can keep going.
    """
    LOGGER.info("Executing command (%s) via %s: %s", summary, runner_label, display)
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=TRIAGE_COMMAND_TIMEOUT,
            shell=use_shell,
            check=False,
        )
        result = {
            "summary": summary,
            "command": display,
            "runner": runner_label,
            "exit_code": completed.returncode,
            "stdout": (completed.stdout or "").strip(),
            "stderr": (completed.stderr or "").strip(),
            "status": "ok" if completed.returncode == 0 else "failed",
        }
        log_verbose(f"Command result ({summary})", result)
        return result
    except subprocess.TimeoutExpired as exc:
        # Keep whatever partial output was captured before the timeout.
        result = {
            "summary": summary,
            "command": display,
            "runner": runner_label,
            "exit_code": None,
            "stdout": truncate_text((exc.stdout or "").strip()),
            "stderr": truncate_text((exc.stderr or "").strip()),
            "status": "timeout",
            "error": f"Command timed out after {TRIAGE_COMMAND_TIMEOUT}s",
        }
        log_verbose(f"Command timeout ({summary})", result)
        return result
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.exception("Command execution failed (%s): %s", summary, exc)
        result = {
            "summary": summary,
            "command": display,
            "runner": runner_label,
            "exit_code": None,
            "stdout": "",
            "stderr": "",
            "status": "error",
            "error": str(exc),
        }
        log_verbose(f"Command error ({summary})", result)
        return result
def run_command_spec(spec: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
    """Render one command spec against *context* and execute it via its runner."""
    summary = spec.get("summary") or spec.get("name") or "command"
    shell_cmd = spec.get("shell")
    if not shell_cmd:
        return {"summary": summary, "status": "skipped", "error": "No shell command provided."}
    rendered = render_template(shell_cmd, context)
    # Spec runner beats the inventory-derived preference, which beats the default.
    runner_choice = (spec.get("runner") or context.get("preferred_runner") or TRIAGE_COMMAND_RUNNER).lower()
    try:
        command, display, use_shell, runner_label = build_runner_command(rendered, runner_choice, context, spec)
    except RuntimeError as exc:
        LOGGER.warning("Skipping command '%s': %s", summary, exc)
        return {"summary": summary, "status": "skipped", "error": str(exc), "command": rendered}
    return run_subprocess_command(command, display, summary, use_shell, runner_label)
def execute_triage_commands(entry: Dict[str, Any], alert: Dict[str, Any], context: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Run the runbook's initial triage commands for this alert.

    Honors the TRIAGE_ENABLE_COMMANDS kill switch and records the inferred
    host_os back into *context* for later stages.
    """
    host_os = context.get("host_os") or determine_host_os(alert)
    context["host_os"] = host_os
    specs = gather_command_specs(entry, host_os)
    if not specs:
        LOGGER.info("No triage commands defined for host_os=%s", host_os)
        return []
    if not TRIAGE_ENABLE_COMMANDS:
        LOGGER.info("Command execution disabled; %d commands queued but skipped.", len(specs))
        return []
    LOGGER.info("Executing up to %d triage commands for host_os=%s", len(specs), host_os)
    return [run_command_spec(spec, context) for spec in specs]
def format_command_results_for_llm(results: List[Dict[str, Any]]) -> str:
    """Render executed command results as numbered text for LLM prompts."""
    rendered: List[str] = []
    for position, item in enumerate(results, start=1):
        rendered.append(f"{position}. {item.get('summary')} [{item.get('status')}] {item.get('command')}")
        out = item.get("stdout")
        err = item.get("stderr")
        failure = item.get("error")
        if out:
            rendered.append(" stdout:")
            rendered.append(indent(truncate_text(out), " "))
        if err:
            rendered.append(" stderr:")
            rendered.append(indent(truncate_text(err), " "))
        if failure and item.get("status") != "ok":
            rendered.append(f" error: {failure}")
    return "\n".join(rendered) if rendered else "No command results were available."
def parse_structured_response(text: str) -> Optional[Dict[str, Any]]:
    """Parse an LLM reply as JSON, tolerating prose around a single object."""
    cleaned = text.strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass
    # Retry on the outermost {...} span in case the model wrapped the JSON.
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(cleaned[start : end + 1])
    except json.JSONDecodeError:
        return None
def normalize_followup_command(item: Dict[str, Any]) -> Dict[str, Any]:
    """Map an LLM-proposed command dict onto the internal command-spec shape."""
    os_hint = (item.get("os") or item.get("platform") or "").lower()
    return {
        "summary": item.get("summary") or item.get("name") or "Follow-up command",
        "shell": item.get("command") or item.get("shell"),
        "runner": item.get("runner"),
        "host": item.get("host") or item.get("target"),
        "ssh_user": item.get("ssh_user"),
        # Empty string collapses to None so OS filtering treats it as "any".
        "os": os_hint or None,
    }
def investigate_with_langchain(
    entry: Dict[str, Any],
    alert: Dict[str, Any],
    parent_payload: Dict[str, Any],
    context: Dict[str, Any],
    initial_outputs: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
    """Run the iterative LLM investigation loop for one alert.

    Builds the initial prompt from the runbook entry plus any triage command
    output, then alternates LLM calls with execution of the follow-up commands
    the model requests via its JSON protocol (analysis / followup_commands /
    complete), bounded by TRIAGE_MAX_ITERATIONS rounds and an overall
    TRIAGE_FOLLOWUP_MAX_COMMANDS command budget. Returns (final summary text,
    all command results including follow-ups).
    """
    command_outputs = list(initial_outputs)
    prompt = build_prompt(entry, alert, parent_payload, context, command_outputs)
    log_verbose("Initial investigation prompt", prompt)
    if not OPENROUTER_API_KEY:
        return "OPENROUTER_API_KEY is not configured; unable to analyze alert.", command_outputs
    llm = OpenRouterLLM(api_key=OPENROUTER_API_KEY, model_name=OPENROUTER_MODEL)
    # The whole conversation is accumulated into one growing prompt string.
    dialogue = (
        prompt
        + "\n\nRespond with JSON containing fields analysis, followup_commands, and complete. "
        "Request commands only when more evidence is required."
    )
    total_followup = 0  # follow-up commands executed across all iterations
    final_summary = ""
    for iteration in range(TRIAGE_MAX_ITERATIONS):
        log_verbose(f"LLM dialogue iteration {iteration + 1}", dialogue)
        llm_text = llm(dialogue)
        log_verbose(f"LLM iteration {iteration + 1} output", llm_text)
        dialogue += f"\nAssistant:\n{llm_text}\n"
        parsed = parse_structured_response(llm_text)
        if parsed:
            log_verbose(f"LLM iteration {iteration + 1} parsed response", parsed)
        if not parsed:
            # Unparseable reply: keep the raw text as the summary and stop.
            final_summary = llm_text
            break
        analysis = parsed.get("analysis") or ""
        followups = parsed.get("followup_commands") or parsed.get("commands") or []
        final_summary = analysis
        complete_flag = bool(parsed.get("complete"))
        if complete_flag or not followups:
            break
        log_verbose(f"LLM iteration {iteration + 1} requested follow-ups", followups)
        allowed = max(0, TRIAGE_FOLLOWUP_MAX_COMMANDS - total_followup)
        if not TRIAGE_ENABLE_COMMANDS or allowed <= 0:
            # Can't run anything: tell the model to wrap up instead.
            dialogue += (
                "\nUser:\nCommand execution is disabled or budget exhausted. Provide final analysis with JSON format.\n"
            )
            continue
        # Validate and normalize the model's requested commands.
        normalized_cmds: List[Dict[str, Any]] = []
        for raw in followups:
            if not isinstance(raw, dict):
                continue
            normalized = normalize_followup_command(raw)
            if not normalized.get("shell"):
                continue
            cmd_os = normalized.get("os")
            # Drop commands targeting a different OS than the alerting host.
            if cmd_os and cmd_os != context.get("host_os"):
                continue
            normalized_cmds.append(normalized)
        log_verbose(f"Normalized follow-up commands (iteration {iteration + 1})", normalized_cmds)
        if not normalized_cmds:
            dialogue += "\nUser:\nNo valid commands to run. Finalize analysis in JSON format.\n"
            continue
        normalized_cmds = normalized_cmds[:allowed]
        executed_batch: List[Dict[str, Any]] = []
        for spec in normalized_cmds:
            executed = run_command_spec(spec, context)
            command_outputs.append(executed)
            executed_batch.append(executed)
            total_followup += 1
        result_text = "Follow-up command results:\n" + format_command_results_for_llm(executed_batch)
        dialogue += (
            "\nUser:\n"
            + result_text
            + "\nUpdate your analysis and respond with JSON (analysis, followup_commands, complete).\n"
        )
        log_verbose("Executed follow-up commands", result_text)
    else:
        # for/else: the loop exhausted all iterations without a break.
        final_summary = final_summary or "Reached maximum iterations without a conclusive response."
    if not final_summary:
        final_summary = "LLM did not return a valid analysis."
    log_verbose("Final LLM summary", final_summary)
    return final_summary, command_outputs
def build_context(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Extract template variables (host, value, URLs, ...) from the alert payload."""
    labels = alert.get("labels", {})
    annotations = alert.get("annotations", {})
    context: Dict[str, Any] = {}
    context["alertname"] = (
        labels.get("alertname")
        or alert.get("title")
        or parent_payload.get("title")
        or parent_payload.get("ruleName")
    )
    context["host"] = labels.get("host") or labels.get("instance")
    context["iface"] = labels.get("interface")
    context["device"] = labels.get("device")
    context["vmid"] = labels.get("vmid")
    context["status"] = alert.get("status") or parent_payload.get("status")
    context["value"] = alert.get("value") or annotations.get("value")
    context["rule_url"] = alert.get("ruleUrl") or parent_payload.get("ruleUrl")
    # Default SSH user; apply_inventory_context may override it later.
    context.setdefault("ssh_user", TRIAGE_SSH_USER)
    return context
def build_prompt(
    entry: Dict[str, Any],
    alert: Dict[str, Any],
    parent_payload: Dict[str, Any],
    context: Dict[str, Any],
    command_outputs: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """Assemble the user prompt for the LLM from the runbook entry and alert.

    Sections, in order: the rendered runbook llm_prompt, a payload summary,
    then (when present) evidence-to-collect, suggested manual checks,
    remediation ideas, and executed command results.
    """
    template = entry.get("llm_prompt", "Alert {{ alertname }} fired for {{ host }}.")
    # None values render as empty strings rather than the literal "None".
    rendered_template = render_template(template, {k: v or "" for k, v in context.items()})
    evidence = entry.get("evidence_to_collect", [])
    triage_steps = entry.get("triage", [])
    remediation = entry.get("remediation", [])
    lines = [
        rendered_template.strip(),
        "",
        "Alert payload summary:",
        f"- Status: {context.get('status') or alert.get('status')}",
        f"- Host: {context.get('host')}",
        f"- Value: {context.get('value')}",
        f"- StartsAt: {alert.get('startsAt')}",
        f"- EndsAt: {alert.get('endsAt')}",
        f"- RuleURL: {context.get('rule_url')}",
        f"- Host OS (inferred): {context.get('host_os')}",
        "- Note: All timestamps are UTC/RFC3339 as provided by Grafana.",
        summarize_dict("- Labels", alert.get("labels")),
        summarize_dict("- Annotations", alert.get("annotations")),
    ]
    if evidence:
        lines.append("")
        lines.append("Evidence to gather (for automation reference):")
        for item in evidence:
            lines.append(f"- {item}")
    if triage_steps:
        lines.append("")
        lines.append("Suggested manual checks:")
        for step in triage_steps:
            summary = step.get("summary")
            linux = step.get("linux")
            windows = step.get("windows")
            lines.append(f"- {summary}")
            if linux:
                lines.append(f" Linux: {linux}")
            if windows:
                lines.append(f" Windows: {windows}")
    if remediation:
        lines.append("")
        lines.append("Remediation ideas:")
        for item in remediation:
            lines.append(f"- {item}")
    if command_outputs:
        lines.append("")
        lines.append("Command execution results:")
        for result in command_outputs:
            status = result.get("status", "unknown")
            cmd_display = result.get("command", "")
            lines.append(f"- {result.get('summary')} [{status}] {cmd_display}")
            stdout = result.get("stdout")
            stderr = result.get("stderr")
            error = result.get("error")
            if stdout:
                lines.append(" stdout:")
                lines.append(indent(truncate_text(stdout), " "))
            if stderr:
                lines.append(" stderr:")
                lines.append(indent(truncate_text(stderr), " "))
            # Only surface the error field when the command did not succeed.
            if error and status != "ok":
                lines.append(f" error: {error}")
    return "\n".join(lines).strip()
def get_alerts(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the payload's non-empty `alerts` list, or the payload as one alert."""
    alerts = payload.get("alerts")
    return alerts if isinstance(alerts, list) and alerts else [payload]
@app.on_event("startup")
def startup_event() -> None:
    """Load the runbook and Ansible inventory once when the server boots."""
    global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
    _RUNBOOK_INDEX = load_runbook()
    _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory()
    LOGGER.info(
        "Alert webhook server ready with %d runbook entries and %d inventory hosts.",
        len(_RUNBOOK_INDEX),
        len(_INVENTORY_INDEX),
    )
@app.post("/alerts")
async def handle_alert(request: Request) -> Dict[str, Any]:
    """Process a Grafana webhook: triage each firing alert and summarize via LLM.

    For every alert in the payload: skip non-firing alerts, resolve (or derive)
    a rule UID, look up (or synthesize) a runbook entry, run triage commands,
    drive the LLM investigation loop, and optionally email the summary.
    Returns processed results plus bookkeeping for unmatched/skipped alerts.
    Raises HTTP 502 when the LLM investigation itself fails.
    """
    payload = await request.json()
    LOGGER.info("Received Grafana payload: %s", json.dumps(payload, indent=2, sort_keys=True))
    results = []
    unmatched = []
    for alert in get_alerts(payload):
        LOGGER.info("Processing alert: %s", json.dumps(alert, indent=2, sort_keys=True))
        unmatched_reason: Optional[str] = None
        alert_status = str(alert.get("status") or payload.get("status") or "").lower()
        # Only "firing" alerts are triaged; resolved/pending ones are recorded
        # in `unmatched` and skipped.
        if alert_status and alert_status != "firing":
            details = {"reason": "non_firing_status", "status": alert_status, "alert": alert}
            unmatched.append(details)
            LOGGER.info("Skipping alert with status=%s (only 'firing' alerts are processed).", alert_status)
            continue
        rule_uid = extract_rule_uid(alert, payload)
        if not rule_uid:
            # No UID anywhere in the payload: derive a deterministic stand-in
            # so the alert can still be processed (and note it as unmatched).
            unmatched_reason = "missing_rule_uid"
            derived_uid = derive_fallback_rule_uid(alert, payload)
            details = {"reason": unmatched_reason, "derived_rule_uid": derived_uid, "alert": alert}
            unmatched.append(details)
            LOGGER.warning("Alert missing rule UID, using fallback identifier %s", derived_uid)
            rule_uid = derived_uid
        entry = _RUNBOOK_INDEX.get(rule_uid)
        runbook_matched = entry is not None
        if not entry:
            # No dedicated runbook entry: fall back to a generic one so the
            # LLM can still reason from the raw payload.
            unmatched_reason = unmatched_reason or "no_runbook_entry"
            details = {"reason": unmatched_reason, "rule_uid": rule_uid, "alert": alert}
            unmatched.append(details)
            LOGGER.warning("No runbook entry for rule_uid=%s, using generic fallback.", rule_uid)
            entry = build_fallback_runbook_entry(alert, payload)
        context = build_context(alert, payload)
        context["host_os"] = determine_host_os(alert)
        context["rule_uid"] = rule_uid
        apply_inventory_context(context)
        initial_outputs = execute_triage_commands(entry, alert, context)
        try:
            llm_text, command_outputs = investigate_with_langchain(entry, alert, payload, context, initial_outputs)
        except Exception as exc:  # pylint: disable=broad-except
            LOGGER.exception("Investigation failed for rule_uid=%s: %s", rule_uid, exc)
            raise HTTPException(status_code=502, detail=f"LLM investigation error: {exc}") from exc
        result = {
            "rule_uid": rule_uid,
            "alertname": entry.get("name"),
            "host": alert.get("labels", {}).get("host"),
            "llm_summary": llm_text,
            "command_results": command_outputs,
            "runbook_matched": runbook_matched,
        }
        if not runbook_matched and unmatched_reason:
            result["fallback_reason"] = unmatched_reason
        results.append(result)
        # Best-effort notification; send_summary_email never raises.
        send_summary_email(alert, result, context)
    return {"processed": len(results), "results": results, "unmatched": unmatched}
@app.post("/reload-runbook")
def reload_runbook() -> Dict[str, Any]:
    """Re-read the runbook YAML and Ansible inventory without a restart."""
    global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
    _RUNBOOK_INDEX = load_runbook()
    _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory()
    return {
        "entries": len(_RUNBOOK_INDEX),
        "inventory_hosts": len(_INVENTORY_INDEX),
    }