989 lines
39 KiB
Python
Executable File
989 lines
39 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Minimal FastAPI web server that accepts Grafana alert webhooks, looks up the
matching runbook entry, builds an LLM prompt, and calls OpenRouter to return a
triage summary.

Run with:
    uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081

Environment variables:
    RUNBOOK_PATH        Path to alert_runbook.yaml (default: ./alert_runbook.yaml)
    OPENROUTER_API_KEY  Required; API token for https://openrouter.ai
    OPENROUTER_MODEL    Optional; default openai/gpt-4o-mini
    OPENROUTER_REFERER  Optional referer header
    OPENROUTER_TITLE    Optional title header (default: Grafana Alert Webhook)

Further optional settings (ANSIBLE_HOSTS_PATH and the TRIAGE_* family) are read
from the environment by the module-level configuration constants.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
import json
|
|
import shlex
|
|
import subprocess
|
|
from textwrap import indent
|
|
import smtplib
|
|
from email.message import EmailMessage
|
|
|
|
import requests
|
|
import yaml
|
|
from fastapi import FastAPI, HTTPException, Request
|
|
from langchain.llms.base import LLM
|
|
|
|
# Tokens (compared case-insensitively) that switch a boolean setting on.
_TRUTHY_ENV_VALUES = {"1", "true", "yes", "on"}


def _env_flag(name: str, default: str) -> bool:
    """Return True when environment variable *name* (or *default*) is a truthy token."""
    return os.getenv(name, default).lower() in _TRUTHY_ENV_VALUES


LOGGER = logging.getLogger("grafana_webhook")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# --- File locations and OpenRouter settings -------------------------------
RUNBOOK_PATH = Path(os.getenv("RUNBOOK_PATH", "alert_runbook.yaml"))
ANSIBLE_HOSTS_PATH = Path(os.getenv("ANSIBLE_HOSTS_PATH", "/etc/ansible/hosts"))
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o-mini")
OPENROUTER_REFERER = os.getenv("OPENROUTER_REFERER")
OPENROUTER_TITLE = os.getenv("OPENROUTER_TITLE", "Grafana Alert Webhook")

# --- Triage command execution ---------------------------------------------
TRIAGE_ENABLE_COMMANDS = _env_flag("TRIAGE_ENABLE_COMMANDS", "0")
TRIAGE_COMMAND_RUNNER = os.getenv("TRIAGE_COMMAND_RUNNER", "ssh").lower()
TRIAGE_SSH_USER = os.getenv("TRIAGE_SSH_USER", "root")
TRIAGE_SSH_OPTIONS = shlex.split(
    os.getenv("TRIAGE_SSH_OPTIONS", "-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5")
)
TRIAGE_COMMAND_TIMEOUT = int(os.getenv("TRIAGE_COMMAND_TIMEOUT", "60"))
TRIAGE_DEFAULT_OS = os.getenv("TRIAGE_DEFAULT_OS", "linux").lower()
TRIAGE_MAX_COMMANDS = int(os.getenv("TRIAGE_MAX_COMMANDS", "3"))
TRIAGE_OUTPUT_LIMIT = int(os.getenv("TRIAGE_OUTPUT_LIMIT", "1200"))

# --- LangChain-driven investigation loop ----------------------------------
TRIAGE_MAX_ITERATIONS = int(os.getenv("TRIAGE_MAX_ITERATIONS", "3"))
TRIAGE_FOLLOWUP_MAX_COMMANDS = int(os.getenv("TRIAGE_FOLLOWUP_MAX_COMMANDS", "4"))
_BUILTIN_SYSTEM_PROMPT = (
    "You are assisting with on-call investigations. Always reply with JSON containing:\n"
    "analysis: your findings and next steps.\n"
    "followup_commands: list of command specs (summary, command, optional runner/os) to gather more data.\n"
    "complete: true when sufficient information is gathered.\n"
    "Request commands only when more evidence is required."
)
TRIAGE_SYSTEM_PROMPT = os.getenv("TRIAGE_SYSTEM_PROMPT", _BUILTIN_SYSTEM_PROMPT)
TRIAGE_VERBOSE_LOGS = _env_flag("TRIAGE_VERBOSE_LOGS", "0")

# --- Summary e-mail delivery ----------------------------------------------
TRIAGE_EMAIL_ENABLED = _env_flag("TRIAGE_EMAIL_ENABLED", "0")
TRIAGE_EMAIL_FROM = os.getenv("TRIAGE_EMAIL_FROM")
# Comma-separated recipient list; blank entries are dropped.
TRIAGE_EMAIL_TO = [
    address
    for address in (part.strip() for part in os.getenv("TRIAGE_EMAIL_TO", "").split(","))
    if address
]
TRIAGE_SMTP_HOST = os.getenv("TRIAGE_SMTP_HOST")
TRIAGE_SMTP_PORT = int(os.getenv("TRIAGE_SMTP_PORT", "587"))
TRIAGE_SMTP_USER = os.getenv("TRIAGE_SMTP_USER")
TRIAGE_SMTP_PASSWORD = os.getenv("TRIAGE_SMTP_PASSWORD")
TRIAGE_SMTP_STARTTLS = _env_flag("TRIAGE_SMTP_STARTTLS", "1")
TRIAGE_SMTP_SSL = _env_flag("TRIAGE_SMTP_SSL", "0")
TRIAGE_SMTP_TIMEOUT = int(os.getenv("TRIAGE_SMTP_TIMEOUT", "20"))
|
|
|
|
|
|
def log_verbose(title: str, content: Any) -> None:
    """Log *content* under *title* at INFO level when verbose logging is on.

    Dicts and lists are rendered as indented, key-sorted JSON; every other
    value is rendered with ``str()``. Nothing is emitted unless
    TRIAGE_VERBOSE_LOGS is enabled.
    """
    if TRIAGE_VERBOSE_LOGS:
        rendered = (
            json.dumps(content, indent=2, sort_keys=True)
            if isinstance(content, (dict, list))
            else str(content)
        )
        LOGGER.info("%s:\n%s", title, rendered)
|
|
|
|
|
|
def email_notifications_configured() -> bool:
    """Report whether summary e-mails are enabled and fully configured.

    Returns False silently when e-mail is disabled. When it is enabled but
    the SMTP host, sender, or recipient list is missing, a warning is logged
    and False is returned.
    """
    if TRIAGE_EMAIL_ENABLED:
        required_settings = (TRIAGE_SMTP_HOST, TRIAGE_EMAIL_FROM, TRIAGE_EMAIL_TO)
        if all(required_settings):
            return True
        LOGGER.warning(
            "Email notifications requested but TRIAGE_SMTP_HOST/TRIAGE_EMAIL_FROM/TRIAGE_EMAIL_TO are incomplete."
        )
    return False
|
|
|
|
|
|
def format_command_results_for_email(results: List[Dict[str, Any]]) -> str:
    """Render command results as a plain-text bullet list for the e-mail body.

    Each result contributes a header line, then its stdout and stderr (each
    truncated to 800 characters and prefixed per line), then its error text
    when the status is not "ok". An empty *results* list yields a fixed
    placeholder sentence.
    """
    if not results:
        return "No automation commands were executed."

    report: List[str] = []
    for item in results:
        report.append(f"- {item.get('summary')} [{item.get('status')}] {item.get('command')}")
        for stream in ("stdout", "stderr"):
            captured = item.get(stream)
            if captured:
                report.append(indent(truncate_text(captured, 800), f" {stream}: "))
        failure = item.get("error")
        if failure and item.get("status") != "ok":
            report.append(f" error: {failure}")
    return "\n".join(report)
|
|
|
|
|
|
def build_email_body(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> str:
    """Compose the plain-text body of the summary e-mail.

    The body lists the alert name and rule UID, host, status, value and rule
    URL, followed by the LLM summary and the formatted command results.
    """
    host = result.get("host") or context.get("host")
    value = alert.get("value") or alert.get("annotations", {}).get("value")
    summary_text = result.get("llm_summary") or "(no summary returned)"
    commands_text = format_command_results_for_email(result.get("command_results") or [])

    sections = (
        f"Alert: {result.get('alertname')} ({result.get('rule_uid')})",
        f"Host: {host}",
        f"Status: {alert.get('status')}",
        f"Value: {value}",
        f"Grafana Rule: {context.get('rule_url')}",
        "",
        "LLM Summary:",
        summary_text,
        "",
        "Command Results:",
        commands_text,
    )
    return "\n".join(sections)
|
|
|
|
|
|
def send_summary_email(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> None:
    """Send the triage summary by e-mail; do nothing when e-mail is not configured.

    Delivery is best effort: any exception raised while connecting or
    sending is logged with its traceback and swallowed.
    """
    if not email_notifications_configured():
        return

    subject_host = result.get("host") or context.get("host") or "(unknown host)"
    recipients = ", ".join(TRIAGE_EMAIL_TO)

    message = EmailMessage()
    message["Subject"] = f"[Grafana] {result.get('alertname')} - {subject_host}"
    message["From"] = TRIAGE_EMAIL_FROM
    message["To"] = recipients
    message.set_content(build_email_body(alert, result, context))

    # Implicit TLS (SMTP_SSL) and STARTTLS are mutually exclusive.
    connect = smtplib.SMTP_SSL if TRIAGE_SMTP_SSL else smtplib.SMTP
    upgrade_to_tls = TRIAGE_SMTP_STARTTLS and not TRIAGE_SMTP_SSL
    try:
        with connect(TRIAGE_SMTP_HOST, TRIAGE_SMTP_PORT, timeout=TRIAGE_SMTP_TIMEOUT) as client:
            if upgrade_to_tls:
                client.starttls()
            if TRIAGE_SMTP_USER:
                client.login(TRIAGE_SMTP_USER, TRIAGE_SMTP_PASSWORD or "")
            client.send_message(message)
        LOGGER.info("Sent summary email to %s for host %s", recipients, subject_host)
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.exception("Failed to send summary email: %s", exc)
|
|
|
|
# ASGI application object (the module docstring runs it with uvicorn).
app = FastAPI(
    title="Grafana Alert Webhook",
    version="1.0.0",
)

# Module-level caches. They start empty here.
# NOTE(review): presumably filled from load_runbook() / load_ansible_inventory()
# elsewhere in the file - confirm against the startup code.
_RUNBOOK_INDEX: Dict[str, Dict[str, Any]] = dict()
_INVENTORY_INDEX: Dict[str, Dict[str, Any]] = dict()  # looked up by normalized host name
_INVENTORY_GROUP_VARS: Dict[str, Dict[str, str]] = dict()  # looked up by group name

# Matches "{{ name }}" placeholders (letters, digits, underscores).
_TEMPLATE_PATTERN = re.compile(r"{{\s*([a-zA-Z0-9_]+)\s*}}")

# System prompt sent with every OpenRouter chat request.
DEFAULT_SYSTEM_PROMPT = TRIAGE_SYSTEM_PROMPT
|
|
|
|
|
|
class OpenRouterLLM(LLM):
    """LangChain-compatible LLM that calls OpenRouter chat completions."""

    # Bearer token sent in the Authorization header.
    api_key: str
    # Model identifier placed in the request payload.
    model_name: str

    def __init__(self, api_key: str, model_name: str, **kwargs: Any) -> None:
        """Store the API key and model name on the LangChain base model."""
        super().__init__(api_key=api_key, model_name=model_name, **kwargs)

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM implementation."""
        return "openrouter"

    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send *prompt* straight to ``_call`` and return the reply text."""
        return self._call(prompt, stop=stop)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> str:
        """POST a chat-completion request to OpenRouter and return the reply text.

        Args:
            prompt: User message; DEFAULT_SYSTEM_PROMPT is sent as the system message.
            stop: Optional stop sequences forwarded in the payload.
            run_manager: Accepted and ignored so LangChain's own generate path,
                which passes this keyword, can call the method.
            **kwargs: Accepted and ignored for the same reason.

        Returns:
            The stripped content of the first choice, or "" when the content is null.

        Raises:
            RuntimeError: On an HTTP status >= 400 or when no choices are returned.
        """
        payload: Dict[str, Any] = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        }
        if stop:
            payload["stop"] = stop
        # Log only once the payload is complete, so the log shows what is sent.
        log_verbose("OpenRouter request payload", payload)
        LOGGER.info("Posting to OpenRouter model=%s via LangChain", self.model_name)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        if OPENROUTER_REFERER:
            headers["HTTP-Referer"] = OPENROUTER_REFERER
        if OPENROUTER_TITLE:
            headers["X-Title"] = OPENROUTER_TITLE

        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=90,
        )
        if response.status_code >= 400:
            try:
                detail = response.json()
            except ValueError:
                # Error body was not JSON; fall back to the raw text.
                detail = response.text
            raise RuntimeError(f"OpenRouter error {response.status_code}: {detail}")

        data = response.json()
        log_verbose("OpenRouter raw response", data)
        choices = data.get("choices")
        if not choices:
            raise RuntimeError("OpenRouter returned no choices")
        # A null content field would otherwise crash on .strip().
        content = choices[0]["message"]["content"]
        return (content or "").strip()
|
|
|
|
|
|
def load_runbook() -> Dict[str, Dict[str, Any]]:
    """Load runbook YAML into a dict keyed by rule_uid.

    Reads RUNBOOK_PATH and indexes every mapping under the top-level
    ``alerts`` list that has a truthy ``rule_uid`` (the key is ``str(uid)``).
    A missing or empty ``alerts`` key yields an empty index. A document or
    entry of an unexpected shape is skipped with a warning instead of
    crashing.

    Raises:
        FileNotFoundError: If RUNBOOK_PATH does not exist.
    """
    if not RUNBOOK_PATH.exists():
        raise FileNotFoundError(f"Runbook file not found: {RUNBOOK_PATH}")
    with RUNBOOK_PATH.open("r", encoding="utf-8") as handle:
        data = yaml.safe_load(handle) or {}

    alerts: Any = data.get("alerts") if isinstance(data, dict) else None
    if not isinstance(alerts, list):
        # "alerts:" with no value parses to None; that is simply an empty runbook.
        if alerts is not None or not isinstance(data, dict):
            LOGGER.warning("Runbook %s does not contain an 'alerts' list; ignoring its contents", RUNBOOK_PATH)
        alerts = []

    index: Dict[str, Dict[str, Any]] = {}
    for entry in alerts:
        if not isinstance(entry, dict):
            LOGGER.warning("Skipping non-mapping runbook entry in %s: %r", RUNBOOK_PATH, entry)
            continue
        uid = entry.get("rule_uid")
        if uid:
            index[str(uid)] = entry
    LOGGER.info("Loaded %d runbook entries from %s", len(index), RUNBOOK_PATH)
    return index
|
|
|
|
|
|
def _normalize_host_key(host: str) -> str:
    """Return *host* trimmed of surrounding whitespace and lowercased."""
    trimmed = host.strip()
    return trimmed.lower()
|
|
|
|
|
|
def _parse_key_value_tokens(tokens: List[str]) -> Dict[str, str]:
    """Turn ``key=value`` tokens into a dict, ignoring tokens without ``=``.

    Only the first ``=`` separates key from value; when a key repeats, the
    last occurrence wins.
    """
    pairs = (token.partition("=") for token in tokens if "=" in token)
    return {name: value for name, _, value in pairs}
|
|
|
|
|
|
def load_ansible_inventory() -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, str]]]:
    """Parse a simple INI-style Ansible hosts file into host/group maps.

    Returns:
        ``(hosts, group_vars)``. ``hosts`` maps a normalized host name to
        ``{"name", "definitions", "groups"}``, where ``definitions`` is a list
        of ``{"group", "vars"}`` dicts (one per appearance of the host) and
        ``groups`` is a set of group names. ``group_vars`` maps a group name to
        the variables from its ``[group:vars]`` section. Both are empty when
        ANSIBLE_HOSTS_PATH does not exist.

    Only ``[group]`` and ``[group:vars]`` sections contribute data. Lines in
    any other section (for example ``[group:children]``, which lists group
    names) are skipped so they are not mistaken for hosts.
    """
    if not ANSIBLE_HOSTS_PATH.exists():
        LOGGER.warning("Ansible inventory not found at %s", ANSIBLE_HOSTS_PATH)
        return {}, {}

    hosts: Dict[str, Dict[str, Any]] = {}
    group_vars: Dict[str, Dict[str, str]] = {}
    current_group: Optional[str] = None
    current_section: str = "hosts"

    with ANSIBLE_HOSTS_PATH.open("r", encoding="utf-8") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            # Ansible's INI format treats both '#' and ';' as comment markers.
            if not line or line.startswith(("#", ";")):
                continue

            if line.startswith("[") and line.endswith("]"):
                header = line[1:-1].strip()
                if ":" in header:
                    current_group, current_section = header.split(":", 1)
                else:
                    current_group = header
                    current_section = "hosts"
                group_vars.setdefault(current_group, {})
                continue

            # Drop a trailing inline comment before tokenizing.
            cleaned = line.split("#", 1)[0].strip()
            if not cleaned:
                continue
            try:
                tokens = shlex.split(cleaned)
            except ValueError as exc:
                # e.g. an unbalanced quote; one bad line must not abort the load.
                LOGGER.warning(
                    "Skipping unparsable inventory line %d in %s: %s",
                    line_number,
                    ANSIBLE_HOSTS_PATH,
                    exc,
                )
                continue
            if not tokens:
                continue

            if current_section == "vars":
                group_vars.setdefault(current_group or "all", {}).update(_parse_key_value_tokens(tokens))
                continue
            if current_section != "hosts":
                # [group:children] entries are group names, not hosts.
                continue

            host_token = tokens[0]
            entry = hosts.setdefault(
                _normalize_host_key(host_token),
                {"name": host_token, "definitions": [], "groups": set()},
            )
            entry["definitions"].append(
                {"group": current_group, "vars": _parse_key_value_tokens(tokens[1:])}
            )
            if current_group:
                entry["groups"].add(current_group)

    LOGGER.info("Loaded %d Ansible inventory hosts from %s", len(hosts), ANSIBLE_HOSTS_PATH)
    return hosts, group_vars
|
|
|
|
|
|
def _lookup_inventory(host: Optional[str]) -> Optional[Dict[str, Any]]:
    """Find the inventory entry for *host*, or None.

    The normalized full name is tried first. If that misses and the name
    contains a dot, the part before the first dot is tried.
    """
    if not host:
        return None

    full_key = _normalize_host_key(host)
    match = _INVENTORY_INDEX.get(full_key)
    if match:
        return match

    # try stripping domain suffix
    short_key, _, _ = full_key.partition(".")
    return _INVENTORY_INDEX.get(short_key) if short_key != full_key else None
|
|
|
|
|
|
def _merge_group_vars(groups: List[str], host_os: Optional[str]) -> Dict[str, str]:
    """Merge inventory group variables for *groups* on top of the "all" group.

    Groups are applied in the order given, later ones overriding earlier
    ones. A group whose ``ansible_connection`` is "winrm" is skipped when
    *host_os* is "linux".
    """
    merged: Dict[str, str] = dict(_INVENTORY_GROUP_VARS.get("all") or {})
    host_is_linux = (host_os or "").lower() == "linux"

    for name in groups:
        candidate = _INVENTORY_GROUP_VARS.get(name)
        if not candidate:
            continue
        uses_winrm = (candidate.get("ansible_connection") or "").lower() == "winrm"
        if not (uses_winrm and host_is_linux):
            merged.update(candidate)
    return merged
|
|
|
|
|
|
def _should_include_definition(group: Optional[str], vars_dict: Dict[str, str], host_os: Optional[str]) -> bool:
    """Decide whether a host definition's variables apply to *host_os*.

    Rules, in order: empty variables never apply; a "winrm" connection
    applies only to Windows hosts; any other explicit connection always
    applies; with no connection set, a definition from a group whose name
    contains "windows" does not apply to Linux hosts.
    """
    if not vars_dict:
        return False

    os_name = (host_os or "").lower()
    transport = (vars_dict.get("ansible_connection") or "").lower()

    if transport == "winrm":
        return os_name == "windows"
    if transport:
        # "local" and every other explicit connection type is accepted.
        return True

    from_windows_group = bool(group) and "windows" in group.lower()
    return not (from_windows_group and os_name == "linux")
|
|
|
|
|
|
def apply_inventory_context(context: Dict[str, Any]) -> None:
    """Augment the alert context with SSH metadata from the Ansible inventory.

    Looks up ``context["host"]`` in the inventory and, when found, merges the
    group and host variables and writes to *context* in place:

    * ``ssh_host`` and ``inventory_groups`` only if not already present;
    * ``ssh_user``, ``ssh_port``, ``ssh_common_args`` and ``ssh_identity_file``
      whenever the inventory provides them;
    * ``preferred_runner`` ("local" or "ssh", only if not already present)
      according to ``ansible_connection``.

    WinRM settings are discarded for hosts whose ``host_os`` is not
    "windows", so a WinRM port is never used as the SSH port.
    """
    host = context.get("host")
    entry = _lookup_inventory(host)
    if not entry:
        return

    host_os = (context.get("host_os") or "").lower()
    # Sort so variable precedence between groups is the same on every run.
    groups = sorted(entry.get("groups", []))

    merged_vars = _merge_group_vars(groups, context.get("host_os"))
    for definition in entry.get("definitions", []):
        vars_dict = definition.get("vars", {})
        if _should_include_definition(definition.get("group"), vars_dict, context.get("host_os")):
            merged_vars.update(vars_dict)

    connection = (merged_vars.get("ansible_connection") or "").lower()
    if connection == "winrm" and host_os != "windows":
        # Drop WinRM settings BEFORE reading the values below.
        for key in (
            "ansible_connection",
            "ansible_port",
            "ansible_password",
            "ansible_winrm_server_cert_validation",
            "ansible_winrm_scheme",
        ):
            merged_vars.pop(key, None)
        connection = ""

    ansible_host = merged_vars.get("ansible_host") or entry.get("name")
    ansible_user = merged_vars.get("ansible_user")
    ansible_port = merged_vars.get("ansible_port")
    ssh_common_args = merged_vars.get("ansible_ssh_common_args")
    ssh_key = merged_vars.get("ansible_ssh_private_key_file")

    context.setdefault("ssh_host", ansible_host or host)
    if ansible_user:
        context["ssh_user"] = ansible_user
    if ansible_port:
        context["ssh_port"] = ansible_port
    if ssh_common_args:
        context["ssh_common_args"] = ssh_common_args
    if ssh_key:
        context["ssh_identity_file"] = ssh_key
    context.setdefault("inventory_groups", groups)

    if connection == "local":
        context.setdefault("preferred_runner", "local")
    elif connection in {"", "ssh", "smart"}:
        context.setdefault("preferred_runner", "ssh")
|
|
|
|
|
|
def render_template(template: str, context: Dict[str, Any]) -> str:
    """Very small mustache-style renderer for {{ var }} placeholders.

    A placeholder whose name is a key of *context* is replaced by ``str()``
    of the stored value; a placeholder with an unknown name is left as
    written.
    """

    def substitute(found: "re.Match[str]") -> str:
        name = found.group(1)
        if name in context:
            return str(context[name])
        return found.group(0)

    return _TEMPLATE_PATTERN.sub(substitute, template)
|
|
|
|
|
|
def extract_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Optional[str]:
    """Grafana webhooks may include rule UID in different fields.

    Returns the first truthy value among the known UID fields of the alert,
    its labels and the parent payload, converted to ``str``. Otherwise, if a
    rule or generator URL containing "/alerting/" is present, returns its
    second-to-last path segment. Returns None when nothing matches.
    """
    labels = alert.get("labels", {})
    direct_fields = (
        alert.get("ruleUid"),
        alert.get("rule_uid"),
        alert.get("ruleId"),
        alert.get("uid"),
        labels.get("rule_uid"),
        labels.get("ruleUid"),
        parent_payload.get("ruleUid"),
        parent_payload.get("rule_uid"),
        parent_payload.get("ruleId"),
    )
    uid = next((value for value in direct_fields if value), None)
    if uid is not None:
        return str(uid)

    # Fall back to Grafana URL parsing if present
    url_fields = (
        alert.get("ruleUrl"),
        parent_payload.get("ruleUrl"),
        alert.get("generatorURL"),
        parent_payload.get("generatorURL"),
    )
    url = next((value for value in url_fields if value), None)
    if not url or "/alerting/" not in url:
        return None
    segments = url.rstrip("/").split("/")
    return segments[-2]
|
|
|
|
|
|
def derive_fallback_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> str:
    """Construct a deterministic identifier when Grafana omits rule UIDs.

    Preference order: alert fingerprint, then the alertname, host and
    instance labels, then the payload's groupKey and title. Falls back to
    "unknown-alert" when none is set.
    """
    labels = alert.get("labels", {})
    ordered_sources = (
        alert.get("fingerprint"),
        labels.get("alertname"),
        labels.get("host"),
        labels.get("instance"),
        parent_payload.get("groupKey"),
        parent_payload.get("title"),
    )
    chosen = next((value for value in ordered_sources if value), None)
    return "unknown-alert" if chosen is None else str(chosen)
|
|
|
|
|
|
def build_fallback_runbook_entry(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Return a generic runbook entry so every alert can be processed.

    The entry has no triage steps, evidence list or remediation hints. Its
    prompt template tells the LLM that no dedicated runbook exists.
    """
    labels = alert.get("labels", {})
    alertname = labels.get("alertname") or parent_payload.get("title") or "Grafana Alert"
    host = labels.get("host") or labels.get("instance") or "(unknown host)"

    prompt_template = "".join(
        (
            "Grafana alert {{ alertname }} fired for {{ host }}.\n",
            "No dedicated runbook entry exists. Use the payload details, command outputs, ",
            "and your own reasoning to propose likely causes, evidence to gather, and remediation steps.",
        )
    )
    fallback: Dict[str, Any] = {"name": f"{alertname} (auto)", "llm_prompt": prompt_template}
    for section in ("triage", "evidence_to_collect", "remediation"):
        fallback[section] = []
    fallback["metadata"] = {"host": host}
    return fallback
|
|
|
|
|
|
def summarize_dict(prefix: str, data: Optional[Dict[str, Any]]) -> str:
    """Render *data* as ``"<prefix>: k1=v1, k2=v2"`` with items sorted.

    A missing or empty dict renders as ``"<prefix>: (none)"``.
    """
    if data:
        rendered_pairs = [f"{key}={value}" for key, value in sorted(data.items())]
        body = ", ".join(rendered_pairs)
    else:
        body = "(none)"
    return f"{prefix}: {body}"
|
|
|
|
|
|
def determine_host_os(alert: Dict[str, Any]) -> str:
    """Infer host operating system from labels or defaults.

    Resolution order:

    1. The ``os``, ``platform`` and ``system`` labels, then the alert's own
       ``os`` field. The first value naming a known family decides.
    2. The host name (``host`` or ``instance`` label): names starting with
       "win", or ending in ".localdomain" and containing "win", are Windows.
    3. The Ansible inventory entry for the host.
    4. TRIAGE_DEFAULT_OS.

    Returns:
        "windows", "linux" (also used for other unix-like systems), or the
        configured default.
    """
    labels = alert.get("labels", {})
    candidates = [
        labels.get("os"),
        labels.get("platform"),
        labels.get("system"),
        alert.get("os"),
    ]
    unix_like_tokens = ("linux", "unix", "darwin")
    for candidate in candidates:
        if not candidate:
            continue
        value = str(candidate).lower()
        # Unix-like names must be tested first: "darwin" contains "win".
        if any(token in value for token in unix_like_tokens):
            return "linux"
        if "win" in value:
            return "windows"

    host = (labels.get("host") or labels.get("instance") or "").lower()
    if host.startswith("win") or (host.endswith(".localdomain") and "win" in host):
        return "windows"

    inventory_os = infer_os_from_inventory(labels.get("host") or labels.get("instance"))
    if inventory_os:
        return inventory_os
    return TRIAGE_DEFAULT_OS
|
|
|
|
|
|
def infer_os_from_inventory(host: Optional[str]) -> Optional[str]:
    """Return "windows" when the inventory marks *host* as a Windows machine.

    A host counts as Windows when any of its definitions sets
    ``ansible_connection`` to "winrm", or when any of its groups has
    "windows" in the name. Returns None in every other case, including an
    unknown or missing host.
    """
    entry = _lookup_inventory(host) if host else None
    if not entry:
        return None

    def uses_winrm(definition: Dict[str, Any]) -> bool:
        host_vars = definition.get("vars", {}) or {}
        return (host_vars.get("ansible_connection") or "").lower() == "winrm"

    if any(uses_winrm(definition) for definition in entry.get("definitions", [])) or any(
        "windows" in (group or "").lower() for group in entry.get("groups", [])
    ):
        return "windows"
    return None
|
|
|
|
|
|
def truncate_text(text: str, limit: int = TRIAGE_OUTPUT_LIMIT) -> str:
    """Trim long outputs to keep prompts manageable.

    The text is stripped first. If more than *limit* characters remain, the
    first *limit* are kept and "... [truncated]" is appended. Falsy input
    yields "".
    """
    if not text:
        return ""
    body = text.strip()
    return body if len(body) <= limit else f"{body[:limit]}... [truncated]"
|
|
|
|
|
|
def gather_command_specs(entry: Dict[str, Any], host_os: str) -> List[Dict[str, Any]]:
    """Collect command specs from triage steps and optional automation sections.

    Triage steps contribute a spec when they define a command under the
    *host_os* key. ``automation_commands`` items are passed through unless
    their ``os`` names a different platform. When TRIAGE_MAX_COMMANDS is
    positive, the combined list is cut to that length.
    """
    steps_with_commands = ((step, step.get(host_os)) for step in entry.get("triage", []))
    specs: List[Dict[str, Any]] = [
        {
            "summary": step.get("summary") or entry.get("name") or "triage",
            "shell": command,
            "runner": step.get("runner"),
            "os": host_os,
        }
        for step, command in steps_with_commands
        if command
    ]

    for item in entry.get("automation_commands", []):
        target_os = item.get("os", host_os)
        if not target_os or target_os.lower() == host_os:
            specs.append(item)

    return specs[:TRIAGE_MAX_COMMANDS] if TRIAGE_MAX_COMMANDS > 0 else specs
|
|
|
|
|
|
def build_runner_command(
    rendered_command: str,
    runner: str,
    context: Dict[str, Any],
    spec: Dict[str, Any],
) -> Tuple[Any, str, bool, str]:
    """Return the subprocess args, display string, shell flag, and runner label.

    Args:
        rendered_command: Command text with placeholders already substituted.
        runner: "ssh" for remote execution; any other value runs the command
            in a local shell. Falls back to TRIAGE_COMMAND_RUNNER when empty.
        context: Alert context supplying default ssh host/user/port/options.
        spec: Command spec; its ssh_* keys and ``host`` override *context*.

    Returns:
        ``(command, display, use_shell, runner_label)``. For ssh, ``command``
        is an argv list and ``use_shell`` is False. For local execution,
        ``command`` is the raw string and ``use_shell`` is True.

    Raises:
        RuntimeError: If the ssh runner has no host, or the ssh target starts
            with "-" and ssh would parse it as a command-line option.
    """
    runner = (runner or TRIAGE_COMMAND_RUNNER).lower()
    if runner != "ssh":
        # default to local shell execution
        return rendered_command, rendered_command, True, "local"

    host = spec.get("host") or context.get("ssh_host") or context.get("host")
    if not host:
        raise RuntimeError("Host not provided for ssh runner.")
    ssh_user = spec.get("ssh_user") or context.get("ssh_user") or TRIAGE_SSH_USER
    ssh_target = spec.get("ssh_target") or f"{ssh_user}@{host}"
    # Specs can come from LLM output. A target such as "-oProxyCommand=..."
    # would run an arbitrary local command through ssh.
    if str(ssh_target).startswith("-"):
        raise RuntimeError(f"Refusing ssh target that looks like an option: {ssh_target!r}")

    ssh_options = list(TRIAGE_SSH_OPTIONS)
    common_args = spec.get("ssh_common_args") or context.get("ssh_common_args")
    if common_args:
        ssh_options.extend(shlex.split(common_args))
    ssh_port = spec.get("ssh_port") or context.get("ssh_port")
    if ssh_port:
        ssh_options.extend(["-p", str(ssh_port)])
    identity_file = spec.get("ssh_identity_file") or context.get("ssh_identity_file")
    if identity_file:
        ssh_options.extend(["-i", identity_file])

    command_list = ["ssh", *ssh_options, ssh_target, rendered_command]
    display = " ".join(shlex.quote(part) for part in command_list)
    return command_list, display, False, "ssh"
|
|
|
|
|
|
def _decode_process_output(value: Any) -> str:
    """Return captured process output as stripped text, decoding bytes if needed."""
    if value is None:
        return ""
    if isinstance(value, bytes):
        value = value.decode("utf-8", errors="replace")
    return str(value).strip()


def _command_result(
    summary: str,
    display: str,
    runner_label: str,
    status: str,
    exit_code: Optional[int] = None,
    stdout: str = "",
    stderr: str = "",
    error: Optional[str] = None,
) -> Dict[str, Any]:
    """Build the result dict shared by every outcome of a command run."""
    result: Dict[str, Any] = {
        "summary": summary,
        "command": display,
        "runner": runner_label,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
        "status": status,
    }
    if error is not None:
        result["error"] = error
    return result


def run_subprocess_command(
    command: Any,
    display: str,
    summary: str,
    use_shell: bool,
    runner_label: str,
) -> Dict[str, Any]:
    """Execute subprocess command and capture results.

    Args:
        command: argv list, or a shell string when *use_shell* is True.
        display: Human-readable form of the command for logs and results.
        summary: Short label describing the command.
        use_shell: Passed to ``subprocess.run`` as ``shell``.
        runner_label: Runner name recorded in the result.

    Returns:
        A dict with ``summary``, ``command``, ``runner``, ``exit_code``,
        ``stdout``, ``stderr`` and ``status`` ("ok", "failed", "timeout" or
        "error"), plus ``error`` for timeouts and unexpected failures. The
        function never raises; failures are reported through the dict.
    """
    LOGGER.info("Executing command (%s) via %s: %s", summary, runner_label, display)
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=TRIAGE_COMMAND_TIMEOUT,
            shell=use_shell,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        # TimeoutExpired carries partial output as bytes even with text=True.
        result = _command_result(
            summary,
            display,
            runner_label,
            status="timeout",
            stdout=truncate_text(_decode_process_output(exc.stdout)),
            stderr=truncate_text(_decode_process_output(exc.stderr)),
            error=f"Command timed out after {TRIAGE_COMMAND_TIMEOUT}s",
        )
        log_verbose(f"Command timeout ({summary})", result)
        return result
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.exception("Command execution failed (%s): %s", summary, exc)
        result = _command_result(summary, display, runner_label, status="error", error=str(exc))
        log_verbose(f"Command error ({summary})", result)
        return result

    result = _command_result(
        summary,
        display,
        runner_label,
        status="ok" if completed.returncode == 0 else "failed",
        exit_code=completed.returncode,
        stdout=_decode_process_output(completed.stdout),
        stderr=_decode_process_output(completed.stderr),
    )
    log_verbose(f"Command result ({summary})", result)
    return result
|
|
|
|
|
|
def run_command_spec(spec: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
    """Render and execute one command spec, returning its result dict.

    A spec without a ``shell`` command, or one whose runner command cannot be
    built, produces a "skipped" result instead of running anything. The
    runner is the spec's own, else the context's ``preferred_runner``, else
    TRIAGE_COMMAND_RUNNER.
    """
    summary = spec.get("summary") or spec.get("name") or "command"
    shell_cmd = spec.get("shell")
    if not shell_cmd:
        return {"summary": summary, "status": "skipped", "error": "No shell command provided."}

    rendered = render_template(shell_cmd, context)
    requested_runner = spec.get("runner") or context.get("preferred_runner") or TRIAGE_COMMAND_RUNNER
    try:
        built = build_runner_command(rendered, requested_runner.lower(), context, spec)
    except RuntimeError as exc:
        LOGGER.warning("Skipping command '%s': %s", summary, exc)
        return {"summary": summary, "status": "skipped", "error": str(exc), "command": rendered}

    command, display, use_shell, runner_label = built
    return run_subprocess_command(command, display, summary, use_shell, runner_label)
|
|
|
|
|
|
def execute_triage_commands(entry: Dict[str, Any], alert: Dict[str, Any], context: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Run the runbook's triage commands for the alert's host and return their results.

    The host OS is taken from *context* or inferred from the alert, and is
    written back to ``context["host_os"]``. Returns an empty list when the
    runbook defines no commands for that OS or when TRIAGE_ENABLE_COMMANDS
    is off.
    """
    host_os = context.get("host_os") or determine_host_os(alert)
    context["host_os"] = host_os

    specs = gather_command_specs(entry, host_os)
    if not specs:
        LOGGER.info("No triage commands defined for host_os=%s", host_os)
        return []
    if not TRIAGE_ENABLE_COMMANDS:
        LOGGER.info("Command execution disabled; %d commands queued but skipped.", len(specs))
        return []

    LOGGER.info("Executing up to %d triage commands for host_os=%s", len(specs), host_os)
    return [run_command_spec(spec, context) for spec in specs]
|
|
|
|
|
|
def format_command_results_for_llm(results: List[Dict[str, Any]]) -> str:
    """Render command results as a numbered list for inclusion in an LLM prompt.

    Each result gets a numbered header line, then labelled and truncated
    stdout/stderr sections when present, and an error line when the status
    is not "ok". An empty list yields a fixed placeholder sentence.
    """
    report: List[str] = []
    for number, item in enumerate(results, start=1):
        report.append(f"{number}. {item.get('summary')} [{item.get('status')}] {item.get('command')}")
        for stream in ("stdout", "stderr"):
            captured = item.get(stream)
            if captured:
                report.append(f" {stream}:")
                report.append(indent(truncate_text(captured), " "))
        failure = item.get("error")
        if failure and item.get("status") != "ok":
            report.append(f" error: {failure}")

    return "\n".join(report) if report else "No command results were available."
|
|
|
|
|
|
def parse_structured_response(text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON object from an LLM reply, or return None.

    The whole reply is tried first. If that is not a JSON object, the span
    from the first "{" to the last "}" is tried, which covers replies
    wrapped in prose or Markdown code fences. Valid JSON that is not an
    object (a list, string, number, ...) is rejected, so callers can rely on
    getting a dict.
    """
    cleaned = (text or "").strip()
    candidates = [cleaned]
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start != -1 and end > start:
        snippet = cleaned[start : end + 1]
        if snippet != cleaned:
            candidates.append(snippet)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None
|
|
|
|
|
|
def normalize_followup_command(item: Dict[str, Any]) -> Dict[str, Any]:
    """Map an LLM-proposed follow-up command onto the internal command-spec keys.

    Accepts the aliases ``name`` (for summary), ``command`` (for shell),
    ``target`` (for host) and ``platform`` (for os). The OS is lowercased;
    a missing or empty OS becomes None.
    """
    summary = item.get("summary") or item.get("name") or "Follow-up command"
    platform = (item.get("os") or item.get("platform") or "").lower()

    spec: Dict[str, Any] = {}
    spec["summary"] = summary
    spec["shell"] = item.get("command") or item.get("shell")
    spec["runner"] = item.get("runner")
    spec["host"] = item.get("host") or item.get("target")
    spec["ssh_user"] = item.get("ssh_user")
    spec["os"] = platform or None
    return spec
|
|
|
|
|
|
def investigate_with_langchain(
    entry: Dict[str, Any],
    alert: Dict[str, Any],
    parent_payload: Dict[str, Any],
    context: Dict[str, Any],
    initial_outputs: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
    """Run the iterative LLM investigation and return ``(summary, command_outputs)``.

    The model is asked for JSON with ``analysis``, ``followup_commands`` and
    ``complete``. For up to TRIAGE_MAX_ITERATIONS rounds, the requested
    follow-up commands are executed and their results are appended to the
    dialogue. At most TRIAGE_FOLLOWUP_MAX_COMMANDS commands run in total, and
    only when TRIAGE_ENABLE_COMMANDS is on. The loop stops early when the
    model marks the analysis complete, asks for no commands, or returns text
    that is not a JSON object; that text then becomes the summary.

    Args:
        entry: Runbook entry for the alert.
        alert: Single alert from the webhook payload.
        parent_payload: Enclosing webhook payload.
        context: Alert context used for templating and command execution.
        initial_outputs: Results of commands that were already run; copied,
            not mutated.

    Returns:
        The final summary text and every command result, initial and follow-up.
    """
    command_outputs = list(initial_outputs)
    prompt = build_prompt(entry, alert, parent_payload, context, command_outputs)
    log_verbose("Initial investigation prompt", prompt)
    if not OPENROUTER_API_KEY:
        return "OPENROUTER_API_KEY is not configured; unable to analyze alert.", command_outputs

    llm = OpenRouterLLM(api_key=OPENROUTER_API_KEY, model_name=OPENROUTER_MODEL)
    dialogue = (
        prompt
        + "\n\nRespond with JSON containing fields analysis, followup_commands, and complete. "
        "Request commands only when more evidence is required."
    )
    total_followup = 0
    final_summary = ""
    for iteration in range(TRIAGE_MAX_ITERATIONS):
        step = iteration + 1
        log_verbose(f"LLM dialogue iteration {step}", dialogue)
        llm_text = llm(dialogue)
        log_verbose(f"LLM iteration {step} output", llm_text)
        dialogue += f"\nAssistant:\n{llm_text}\n"

        parsed = parse_structured_response(llm_text)
        if not parsed or not isinstance(parsed, dict):
            # Free-form (or non-object) reply: use it verbatim as the summary.
            final_summary = llm_text
            break
        log_verbose(f"LLM iteration {step} parsed response", parsed)

        analysis = parsed.get("analysis") or ""
        if not isinstance(analysis, str):
            # Some models return a nested object; keep the summary a string.
            analysis = json.dumps(analysis, indent=2, default=str)
        followups = parsed.get("followup_commands") or parsed.get("commands") or []
        final_summary = analysis
        complete_flag = bool(parsed.get("complete"))

        if complete_flag or not followups:
            break

        log_verbose(f"LLM iteration {step} requested follow-ups", followups)
        allowed = max(0, TRIAGE_FOLLOWUP_MAX_COMMANDS - total_followup)
        if not TRIAGE_ENABLE_COMMANDS or allowed <= 0:
            dialogue += (
                "\nUser:\nCommand execution is disabled or budget exhausted. Provide final analysis with JSON format.\n"
            )
            continue

        normalized_cmds: List[Dict[str, Any]] = []
        for raw in followups:
            if not isinstance(raw, dict):
                continue
            normalized = normalize_followup_command(raw)
            if not normalized.get("shell"):
                continue
            cmd_os = normalized.get("os")
            # Skip commands written for a different platform than the host's.
            if cmd_os and cmd_os != context.get("host_os"):
                continue
            normalized_cmds.append(normalized)

        log_verbose(f"Normalized follow-up commands (iteration {step})", normalized_cmds)
        if not normalized_cmds:
            dialogue += "\nUser:\nNo valid commands to run. Finalize analysis in JSON format.\n"
            continue

        normalized_cmds = normalized_cmds[:allowed]
        executed_batch: List[Dict[str, Any]] = []
        for spec in normalized_cmds:
            executed = run_command_spec(spec, context)
            command_outputs.append(executed)
            executed_batch.append(executed)
            total_followup += 1

        result_text = "Follow-up command results:\n" + format_command_results_for_llm(executed_batch)
        dialogue += (
            "\nUser:\n"
            + result_text
            + "\nUpdate your analysis and respond with JSON (analysis, followup_commands, complete).\n"
        )
        log_verbose("Executed follow-up commands", result_text)
    else:
        # Loop ended without a break: the iteration budget ran out.
        final_summary = final_summary or "Reached maximum iterations without a conclusive response."

    if not final_summary:
        final_summary = "LLM did not return a valid analysis."

    log_verbose("Final LLM summary", final_summary)
    return final_summary, command_outputs
|
|
|
|
|
|
def build_context(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Build the templating/execution context for one alert.

    Values come from the alert's labels and annotations, falling back to the
    parent payload for the alert name, status and rule URL. ``ssh_user``
    starts as TRIAGE_SSH_USER.
    """
    labels = alert.get("labels", {})
    annotations = alert.get("annotations", {})

    alertname = (
        labels.get("alertname")
        or alert.get("title")
        or parent_payload.get("title")
        or parent_payload.get("ruleName")
    )
    context: Dict[str, Any] = dict(
        alertname=alertname,
        host=labels.get("host") or labels.get("instance"),
        iface=labels.get("interface"),
        device=labels.get("device"),
        vmid=labels.get("vmid"),
        status=alert.get("status") or parent_payload.get("status"),
        value=alert.get("value") or annotations.get("value"),
        rule_url=alert.get("ruleUrl") or parent_payload.get("ruleUrl"),
        ssh_user=TRIAGE_SSH_USER,
    )
    return context
|
|
|
|
|
|
def build_prompt(
    entry: Dict[str, Any],
    alert: Dict[str, Any],
    parent_payload: Dict[str, Any],
    context: Dict[str, Any],
    command_outputs: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """Assemble the plain-text LLM prompt for a single alert.

    The prompt opens with the entry's ``llm_prompt`` template rendered against
    ``context`` (falsy context values are passed as empty strings), followed by
    an alert summary and, when present, the entry's evidence list, triage
    steps, remediation ideas and the supplied command results.

    Args:
        entry: Runbook entry; ``llm_prompt``, ``evidence_to_collect``,
            ``triage`` and ``remediation`` are read from it.
        alert: Alert object; status, timestamps, labels and annotations are
            read from it.
        parent_payload: Enclosing webhook payload; not read by this function.
        context: Context values used for template rendering and the summary.
        command_outputs: Optional list of command result dicts to append.

    Returns:
        All prompt lines joined with newlines, stripped of surrounding
        whitespace.
    """
    prompt_template = entry.get("llm_prompt", "Alert {{ alertname }} fired for {{ host }}.")
    template_values = {key: value or "" for key, value in context.items()}
    header = render_template(prompt_template, template_values)

    evidence_items = entry.get("evidence_to_collect", [])
    manual_checks = entry.get("triage", [])
    remediation_items = entry.get("remediation", [])

    sections = [
        header.strip(),
        "",
        "Alert payload summary:",
        f"- Status: {context.get('status') or alert.get('status')}",
        f"- Host: {context.get('host')}",
        f"- Value: {context.get('value')}",
        f"- StartsAt: {alert.get('startsAt')}",
        f"- EndsAt: {alert.get('endsAt')}",
        f"- RuleURL: {context.get('rule_url')}",
        f"- Host OS (inferred): {context.get('host_os')}",
        "- Note: All timestamps are UTC/RFC3339 as provided by Grafana.",
        summarize_dict("- Labels", alert.get("labels")),
        summarize_dict("- Annotations", alert.get("annotations")),
    ]

    if evidence_items:
        sections.extend(("", "Evidence to gather (for automation reference):"))
        sections.extend(f"- {evidence_item}" for evidence_item in evidence_items)

    if manual_checks:
        sections.extend(("", "Suggested manual checks:"))
        for check in manual_checks:
            sections.append(f"- {check.get('summary')}")
            # Per-platform hints are optional; only emit the ones provided.
            for platform_label, platform_key in (("Linux", "linux"), ("Windows", "windows")):
                hint = check.get(platform_key)
                if hint:
                    sections.append(f" {platform_label}: {hint}")

    if remediation_items:
        sections.extend(("", "Remediation ideas:"))
        sections.extend(f"- {idea}" for idea in remediation_items)

    if command_outputs:
        sections.extend(("", "Command execution results:"))
        for outcome in command_outputs:
            state = outcome.get("status", "unknown")
            shown_command = outcome.get("command", "")
            sections.append(f"- {outcome.get('summary')} [{state}] {shown_command}")
            for stream_name in ("stdout", "stderr"):
                captured = outcome.get(stream_name)
                if captured:
                    sections.append(f" {stream_name}:")
                    sections.append(indent(truncate_text(captured), " "))
            failure = outcome.get("error")
            # The error text is only shown for results that did not succeed.
            if failure and state != "ok":
                sections.append(f" error: {failure}")

    return "\n".join(sections).strip()
|
|
|
|
|
|
def get_alerts(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the alert objects to process for a webhook payload.

    When ``payload["alerts"]`` is a non-empty list it is returned as is;
    otherwise the payload itself is treated as the single alert.
    """
    candidates = payload.get("alerts")
    if not isinstance(candidates, list) or not candidates:
        return [payload]
    return candidates
|
|
|
|
|
|
@app.on_event("startup")
def startup_event() -> None:
    """Fill the runbook and inventory caches when the application starts.

    Loads the runbook index, then the Ansible inventory (hosts and group
    vars), stores them in the module-level globals and logs their sizes.
    """
    global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
    _RUNBOOK_INDEX = load_runbook()
    inventory_hosts, inventory_group_vars = load_ansible_inventory()
    _INVENTORY_INDEX = inventory_hosts
    _INVENTORY_GROUP_VARS = inventory_group_vars
    runbook_count = len(_RUNBOOK_INDEX)
    host_count = len(_INVENTORY_INDEX)
    LOGGER.info(
        "Alert webhook server ready with %d runbook entries and %d inventory hosts.",
        runbook_count,
        host_count,
    )
|
|
|
|
|
|
@app.post("/alerts")
async def handle_alert(request: Request) -> Dict[str, Any]:
    """Triage the alerts contained in a Grafana webhook request.

    Every alert whose status is ``firing`` (or that carries no status) is
    matched to a runbook entry by rule UID, falling back to a derived
    identifier and a generic entry when needed. For each such alert the triage
    commands are executed, the LLM investigation is run and a summary email is
    sent.

    Args:
        request: Incoming request whose body is the webhook JSON document.

    Returns:
        ``{"processed": <count>, "results": [...], "unmatched": [...]}``.
        ``unmatched`` lists skipped alerts and alerts that needed a fallback,
        with at most one record per alert.

    Raises:
        HTTPException: 400 when the body is not a JSON object; 502 when the
            investigation of an alert raises.
    """
    # A malformed body is a client error; report it as 400 rather than letting
    # the decode error (or a later ``.get`` on a non-dict) surface as a 500.
    try:
        payload = await request.json()
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        LOGGER.warning("Rejecting webhook request with invalid JSON body: %s", exc)
        raise HTTPException(status_code=400, detail="Request body must be valid JSON.") from exc
    if not isinstance(payload, dict):
        LOGGER.warning("Rejecting webhook request: body is %s, expected object.", type(payload).__name__)
        raise HTTPException(status_code=400, detail="Request body must be a JSON object.")

    LOGGER.info("Received Grafana payload: %s", json.dumps(payload, indent=2, sort_keys=True))
    results: List[Dict[str, Any]] = []
    unmatched: List[Dict[str, Any]] = []

    # NOTE(review): the triage, investigation and email helpers below are
    # called synchronously from this coroutine. If they block on subprocess,
    # HTTP or SMTP I/O they will stall the event loop -- confirm and consider
    # off-loading them to a worker thread.
    for alert in get_alerts(payload):
        LOGGER.info("Processing alert: %s", json.dumps(alert, indent=2, sort_keys=True))
        if not isinstance(alert, dict):
            # Items of ``alerts`` must be objects; skip anything else instead
            # of failing the whole request.
            unmatched.append({"reason": "invalid_alert", "alert": alert})
            LOGGER.warning("Skipping alert that is not a JSON object.")
            continue

        unmatched_reason: Optional[str] = None
        alert_status = str(alert.get("status") or payload.get("status") or "").lower()
        if alert_status and alert_status != "firing":
            unmatched.append({"reason": "non_firing_status", "status": alert_status, "alert": alert})
            LOGGER.info("Skipping alert with status=%s (only 'firing' alerts are processed).", alert_status)
            continue

        rule_uid = extract_rule_uid(alert, payload)
        if not rule_uid:
            unmatched_reason = "missing_rule_uid"
            derived_uid = derive_fallback_rule_uid(alert, payload)
            unmatched.append({"reason": unmatched_reason, "derived_rule_uid": derived_uid, "alert": alert})
            LOGGER.warning("Alert missing rule UID, using fallback identifier %s", derived_uid)
            rule_uid = derived_uid

        entry = _RUNBOOK_INDEX.get(rule_uid)
        # Use the same truthiness test as the fallback branch so the reported
        # flag always agrees with whether the generic entry was used.
        runbook_matched = bool(entry)
        if not entry:
            if unmatched_reason is None:
                # Alerts already recorded for a missing rule UID are not added
                # a second time with the same reason.
                unmatched_reason = "no_runbook_entry"
                unmatched.append({"reason": unmatched_reason, "rule_uid": rule_uid, "alert": alert})
            LOGGER.warning("No runbook entry for rule_uid=%s, using generic fallback.", rule_uid)
            entry = build_fallback_runbook_entry(alert, payload)

        context = build_context(alert, payload)
        context["host_os"] = determine_host_os(alert)
        context["rule_uid"] = rule_uid
        apply_inventory_context(context)
        initial_outputs = execute_triage_commands(entry, alert, context)
        try:
            llm_text, command_outputs = investigate_with_langchain(entry, alert, payload, context, initial_outputs)
        except Exception as exc:  # pylint: disable=broad-except
            LOGGER.exception("Investigation failed for rule_uid=%s: %s", rule_uid, exc)
            raise HTTPException(status_code=502, detail=f"LLM investigation error: {exc}") from exc

        # Report the explicit host label when present, otherwise the host
        # resolved into the context (which also considers ``instance``).
        alert_labels = alert.get("labels")
        label_host = alert_labels.get("host") if isinstance(alert_labels, dict) else None
        result = {
            "rule_uid": rule_uid,
            "alertname": entry.get("name"),
            "host": label_host or context.get("host"),
            "llm_summary": llm_text,
            "command_results": command_outputs,
            "runbook_matched": runbook_matched,
        }
        if not runbook_matched and unmatched_reason:
            result["fallback_reason"] = unmatched_reason
        results.append(result)
        send_summary_email(alert, result, context)

    return {"processed": len(results), "results": results, "unmatched": unmatched}
|
|
|
|
|
|
@app.post("/reload-runbook")
def reload_runbook() -> Dict[str, Any]:
    """Reload the runbook and Ansible inventory caches on demand.

    Returns:
        The number of runbook entries (``entries``) and inventory hosts
        (``inventory_hosts``) now held in the module-level caches.
    """
    global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
    _RUNBOOK_INDEX = load_runbook()
    inventory_hosts, inventory_group_vars = load_ansible_inventory()
    _INVENTORY_INDEX = inventory_hosts
    _INVENTORY_GROUP_VARS = inventory_group_vars
    counts = {
        "entries": len(_RUNBOOK_INDEX),
        "inventory_hosts": len(_INVENTORY_INDEX),
    }
    return counts
|