#!/usr/bin/env python3 """ Minimal FastAPI web server that accepts Grafana alert webhooks, looks up the matching runbook entry, builds an LLM prompt, and calls OpenRouter to return a triage summary. Run with: uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081 Environment variables: RUNBOOK_PATH Path to alert_runbook.yaml (default: ./alert_runbook.yaml) OPENROUTER_API_KEY Required; API token for https://openrouter.ai OPENROUTER_MODEL Optional; default openai/gpt-4o-mini OPENROUTER_REFERER Optional referer header OPENROUTER_TITLE Optional title header (default: Grafana Alert Webhook) """ from __future__ import annotations import logging import os import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import json import shlex import subprocess from textwrap import indent import smtplib from email.message import EmailMessage import requests import yaml from fastapi import FastAPI, HTTPException, Request from langchain.llms.base import LLM LOGGER = logging.getLogger("grafana_webhook") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") RUNBOOK_PATH = Path(os.environ.get("RUNBOOK_PATH", "alert_runbook.yaml")) ANSIBLE_HOSTS_PATH = Path(os.environ.get("ANSIBLE_HOSTS_PATH", "/etc/ansible/hosts")) OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini") OPENROUTER_REFERER = os.environ.get("OPENROUTER_REFERER") OPENROUTER_TITLE = os.environ.get("OPENROUTER_TITLE", "Grafana Alert Webhook") TRIAGE_ENABLE_COMMANDS = os.environ.get("TRIAGE_ENABLE_COMMANDS", "0").lower() in {"1", "true", "yes", "on"} TRIAGE_COMMAND_RUNNER = os.environ.get("TRIAGE_COMMAND_RUNNER", "ssh").lower() TRIAGE_SSH_USER = os.environ.get("TRIAGE_SSH_USER", "root") TRIAGE_SSH_OPTIONS = shlex.split( os.environ.get("TRIAGE_SSH_OPTIONS", "-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5") ) TRIAGE_COMMAND_TIMEOUT = int(os.environ.get("TRIAGE_COMMAND_TIMEOUT", "60")) TRIAGE_DEFAULT_OS = os.environ.get("TRIAGE_DEFAULT_OS", "linux").lower() TRIAGE_MAX_COMMANDS = int(os.environ.get("TRIAGE_MAX_COMMANDS", "3")) TRIAGE_OUTPUT_LIMIT = int(os.environ.get("TRIAGE_OUTPUT_LIMIT", "1200")) # LangChain-driven investigation loop TRIAGE_MAX_ITERATIONS = int(os.environ.get("TRIAGE_MAX_ITERATIONS", "3")) TRIAGE_FOLLOWUP_MAX_COMMANDS = int(os.environ.get("TRIAGE_FOLLOWUP_MAX_COMMANDS", "4")) TRIAGE_SYSTEM_PROMPT = os.environ.get( "TRIAGE_SYSTEM_PROMPT", ( "You are assisting with on-call investigations. Always reply with JSON containing:\n" "analysis: your findings and next steps.\n" "followup_commands: list of command specs (summary, command, optional runner/os) to gather more data.\n" "complete: true when sufficient information is gathered.\n" "Request commands only when more evidence is required." ), ) TRIAGE_VERBOSE_LOGS = os.environ.get("TRIAGE_VERBOSE_LOGS", "0").lower() in {"1", "true", "yes", "on"} TRIAGE_EMAIL_ENABLED = os.environ.get("TRIAGE_EMAIL_ENABLED", "0").lower() in {"1", "true", "yes", "on"} TRIAGE_EMAIL_FROM = os.environ.get("TRIAGE_EMAIL_FROM") TRIAGE_EMAIL_TO = [addr.strip() for addr in os.environ.get("TRIAGE_EMAIL_TO", "").split(",") if addr.strip()] TRIAGE_SMTP_HOST = os.environ.get("TRIAGE_SMTP_HOST") TRIAGE_SMTP_PORT = int(os.environ.get("TRIAGE_SMTP_PORT", "587")) TRIAGE_SMTP_USER = os.environ.get("TRIAGE_SMTP_USER") TRIAGE_SMTP_PASSWORD = os.environ.get("TRIAGE_SMTP_PASSWORD") TRIAGE_SMTP_STARTTLS = os.environ.get("TRIAGE_SMTP_STARTTLS", "1").lower() in {"1", "true", "yes", "on"} TRIAGE_SMTP_SSL = os.environ.get("TRIAGE_SMTP_SSL", "0").lower() in {"1", "true", "yes", "on"} TRIAGE_SMTP_TIMEOUT = int(os.environ.get("TRIAGE_SMTP_TIMEOUT", "20")) def log_verbose(title: str, content: Any) -> None: """Emit structured verbose logs when TRIAGE_VERBOSE_LOGS is enabled.""" if not TRIAGE_VERBOSE_LOGS: return if isinstance(content, (dict, list)): text = json.dumps(content, indent=2, sort_keys=True) else: text = str(content) LOGGER.info("%s:\n%s", title, text) def email_notifications_configured() -> bool: if not TRIAGE_EMAIL_ENABLED: return False if not (TRIAGE_SMTP_HOST and TRIAGE_EMAIL_FROM and TRIAGE_EMAIL_TO): LOGGER.warning( "Email notifications requested but TRIAGE_SMTP_HOST/TRIAGE_EMAIL_FROM/TRIAGE_EMAIL_TO are incomplete." ) return False return True def format_command_results_for_email(results: List[Dict[str, Any]]) -> str: if not results: return "No automation commands were executed." lines: List[str] = [] for result in results: lines.append(f"- {result.get('summary')} [{result.get('status')}] {result.get('command')}") stdout = result.get("stdout") stderr = result.get("stderr") error = result.get("error") if stdout: lines.append(indent(truncate_text(stdout, 800), " stdout: ")) if stderr: lines.append(indent(truncate_text(stderr, 800), " stderr: ")) if error and result.get("status") != "ok": lines.append(f" error: {error}") return "\n".join(lines) def build_email_body(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> str: lines = [ f"Alert: {result.get('alertname')} ({result.get('rule_uid')})", f"Host: {result.get('host') or context.get('host')}", f"Status: {alert.get('status')}", f"Value: {alert.get('value') or alert.get('annotations', {}).get('value')}", f"Grafana Rule: {context.get('rule_url')}", "", "LLM Summary:", result.get("llm_summary") or "(no summary returned)", "", "Command Results:", format_command_results_for_email(result.get("command_results") or []), ] return "\n".join(lines) def send_summary_email(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> None: if not email_notifications_configured(): return subject_host = result.get("host") or context.get("host") or "(unknown host)" subject = f"[Grafana] {result.get('alertname')} - {subject_host}" body = build_email_body(alert, result, context) message = EmailMessage() message["Subject"] = subject message["From"] = TRIAGE_EMAIL_FROM message["To"] = ", ".join(TRIAGE_EMAIL_TO) message.set_content(body) try: smtp_class = smtplib.SMTP_SSL if TRIAGE_SMTP_SSL else smtplib.SMTP with smtp_class(TRIAGE_SMTP_HOST, TRIAGE_SMTP_PORT, timeout=TRIAGE_SMTP_TIMEOUT) as client: if TRIAGE_SMTP_STARTTLS and not TRIAGE_SMTP_SSL: client.starttls() if TRIAGE_SMTP_USER: client.login(TRIAGE_SMTP_USER, TRIAGE_SMTP_PASSWORD or "") client.send_message(message) LOGGER.info("Sent summary email to %s for host %s", ", ".join(TRIAGE_EMAIL_TO), subject_host) except Exception as exc: # pylint: disable=broad-except LOGGER.exception("Failed to send summary email: %s", exc) app = FastAPI(title="Grafana Alert Webhook", version="1.0.0") _RUNBOOK_INDEX: Dict[str, Dict[str, Any]] = {} _INVENTORY_INDEX: Dict[str, Dict[str, Any]] = {} _INVENTORY_GROUP_VARS: Dict[str, Dict[str, str]] = {} _TEMPLATE_PATTERN = re.compile(r"{{\s*([a-zA-Z0-9_]+)\s*}}") DEFAULT_SYSTEM_PROMPT = TRIAGE_SYSTEM_PROMPT class OpenRouterLLM(LLM): """LangChain-compatible LLM that calls OpenRouter chat completions.""" api_key: str model_name: str def __init__(self, api_key: str, model_name: str, **kwargs: Any) -> None: super().__init__(api_key=api_key, model_name=model_name, **kwargs) @property def _llm_type(self) -> str: return "openrouter" def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str: return self._call(prompt, stop=stop) def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: payload = { "model": self.model_name, "messages": [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ], } log_verbose("OpenRouter request payload", payload) if stop: payload["stop"] = stop LOGGER.info("Posting to OpenRouter model=%s via LangChain", self.model_name) headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", } if OPENROUTER_REFERER: headers["HTTP-Referer"] = OPENROUTER_REFERER if OPENROUTER_TITLE: headers["X-Title"] = OPENROUTER_TITLE response = requests.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers, timeout=90) if response.status_code >= 400: try: detail = response.json() except ValueError: detail = response.text raise RuntimeError(f"OpenRouter error {response.status_code}: {detail}") data = response.json() log_verbose("OpenRouter raw response", data) choices = data.get("choices") if not choices: raise RuntimeError("OpenRouter returned no choices") return choices[0]["message"]["content"].strip() def load_runbook() -> Dict[str, Dict[str, Any]]: """Load runbook YAML into a dict keyed by rule_uid.""" if not RUNBOOK_PATH.exists(): raise FileNotFoundError(f"Runbook file not found: {RUNBOOK_PATH}") with RUNBOOK_PATH.open("r", encoding="utf-8") as handle: data = yaml.safe_load(handle) or {} alerts = data.get("alerts", []) index: Dict[str, Dict[str, Any]] = {} for entry in alerts: uid = entry.get("rule_uid") if uid: index[str(uid)] = entry LOGGER.info("Loaded %d runbook entries from %s", len(index), RUNBOOK_PATH) return index def _normalize_host_key(host: str) -> str: return host.strip().lower() def _parse_key_value_tokens(tokens: List[str]) -> Dict[str, str]: data: Dict[str, str] = {} for token in tokens: if "=" not in token: continue key, value = token.split("=", 1) data[key] = value return data def load_ansible_inventory() -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, str]]]: """Parse a simple INI-style Ansible hosts file into host/group maps.""" if not ANSIBLE_HOSTS_PATH.exists(): LOGGER.warning("Ansible inventory not found at %s", ANSIBLE_HOSTS_PATH) return {}, {} hosts: Dict[str, Dict[str, Any]] = {} group_vars: Dict[str, Dict[str, str]] = {} current_group: Optional[str] = None current_section: str = "hosts" with ANSIBLE_HOSTS_PATH.open("r", encoding="utf-8") as handle: for raw_line in handle: line = raw_line.strip() if not line or line.startswith("#"): continue if line.startswith("[") and line.endswith("]"): header = line[1:-1].strip() if ":" in header: group_name, suffix = header.split(":", 1) current_group = group_name current_section = suffix else: current_group = header current_section = "hosts" group_vars.setdefault(current_group, {}) continue cleaned = line.split("#", 1)[0].strip() if not cleaned: continue tokens = shlex.split(cleaned) if not tokens: continue if current_section == "vars": vars_dict = _parse_key_value_tokens(tokens) group_vars.setdefault(current_group or "all", {}).update(vars_dict) continue host_token = tokens[0] host_key = _normalize_host_key(host_token) entry = hosts.setdefault(host_key, {"name": host_token, "definitions": [], "groups": set()}) vars_dict = _parse_key_value_tokens(tokens[1:]) entry["definitions"].append({"group": current_group, "vars": vars_dict}) if current_group: entry["groups"].add(current_group) LOGGER.info("Loaded %d Ansible inventory hosts from %s", len(hosts), ANSIBLE_HOSTS_PATH) return hosts, group_vars def _lookup_inventory(host: Optional[str]) -> Optional[Dict[str, Any]]: if not host: return None key = _normalize_host_key(host) entry = _INVENTORY_INDEX.get(key) if entry: return entry # try stripping domain suffix short = key.split(".", 1)[0] if short != key: return _INVENTORY_INDEX.get(short) return None def _merge_group_vars(groups: List[str], host_os: Optional[str]) -> Dict[str, str]: merged: Dict[str, str] = {} global_vars = _INVENTORY_GROUP_VARS.get("all") if global_vars: merged.update(global_vars) normalized_os = (host_os or "").lower() for group in groups: vars_dict = _INVENTORY_GROUP_VARS.get(group) if not vars_dict: continue connection = (vars_dict.get("ansible_connection") or "").lower() if connection == "winrm" and normalized_os == "linux": continue merged.update(vars_dict) return merged def _should_include_definition(group: Optional[str], vars_dict: Dict[str, str], host_os: Optional[str]) -> bool: if not vars_dict: return False normalized_os = (host_os or "").lower() connection = (vars_dict.get("ansible_connection") or "").lower() if connection == "winrm" and normalized_os != "windows": return False if connection == "local": return True if group and "windows" in group.lower() and normalized_os == "linux" and not connection: return False return True def apply_inventory_context(context: Dict[str, Any]) -> None: """Augment the alert context with SSH metadata from the Ansible inventory.""" host = context.get("host") entry = _lookup_inventory(host) if not entry: return merged_vars = _merge_group_vars(list(entry.get("groups", [])), context.get("host_os")) for definition in entry.get("definitions", []): group_name = definition.get("group") vars_dict = definition.get("vars", {}) if _should_include_definition(group_name, vars_dict, context.get("host_os")): merged_vars.update(vars_dict) ansible_host = merged_vars.get("ansible_host") or entry.get("name") ansible_user = merged_vars.get("ansible_user") ansible_port = merged_vars.get("ansible_port") ssh_common_args = merged_vars.get("ansible_ssh_common_args") ssh_key = merged_vars.get("ansible_ssh_private_key_file") connection = (merged_vars.get("ansible_connection") or "").lower() host_os = (context.get("host_os") or "").lower() if connection == "winrm" and host_os != "windows": for key in ( "ansible_connection", "ansible_port", "ansible_password", "ansible_winrm_server_cert_validation", "ansible_winrm_scheme", ): merged_vars.pop(key, None) connection = "" context.setdefault("ssh_host", ansible_host or host) if ansible_user: context["ssh_user"] = ansible_user if ansible_port: context["ssh_port"] = ansible_port if ssh_common_args: context["ssh_common_args"] = ssh_common_args if ssh_key: context["ssh_identity_file"] = ssh_key context.setdefault("inventory_groups", list(entry.get("groups", []))) if connection == "local": context.setdefault("preferred_runner", "local") elif connection in {"", "ssh", "smart"}: context.setdefault("preferred_runner", "ssh") context.setdefault("inventory_groups", list(entry.get("groups", []))) def render_template(template: str, context: Dict[str, Any]) -> str: """Very small mustache-style renderer for {{ var }} placeholders.""" def replace(match: re.Match[str]) -> str: key = match.group(1) return str(context.get(key, match.group(0))) return _TEMPLATE_PATTERN.sub(replace, template) def extract_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Optional[str]: """Grafana webhooks may include rule UID in different fields.""" candidates: List[Any] = [ alert.get("ruleUid"), alert.get("rule_uid"), alert.get("ruleId"), alert.get("uid"), alert.get("labels", {}).get("rule_uid"), alert.get("labels", {}).get("ruleUid"), parent_payload.get("ruleUid"), parent_payload.get("rule_uid"), parent_payload.get("ruleId"), ] for candidate in candidates: if candidate: return str(candidate) # Fall back to Grafana URL parsing if present url = ( alert.get("ruleUrl") or parent_payload.get("ruleUrl") or alert.get("generatorURL") or parent_payload.get("generatorURL") ) if url and "/alerting/" in url: return url.rstrip("/").split("/")[-2] return None def derive_fallback_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> str: """Construct a deterministic identifier when Grafana omits rule UIDs.""" labels = alert.get("labels", {}) candidates = [ alert.get("fingerprint"), labels.get("alertname"), labels.get("host"), labels.get("instance"), parent_payload.get("groupKey"), parent_payload.get("title"), ] for candidate in candidates: if candidate: return str(candidate) return "unknown-alert" def build_fallback_runbook_entry(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]: """Return a generic runbook entry so every alert can be processed.""" labels = alert.get("labels", {}) alertname = labels.get("alertname") or parent_payload.get("title") or "Grafana Alert" host = labels.get("host") or labels.get("instance") or "(unknown host)" return { "name": f"{alertname} (auto)", "llm_prompt": ( "Grafana alert {{ alertname }} fired for {{ host }}.\n" "No dedicated runbook entry exists. Use the payload details, command outputs, " "and your own reasoning to propose likely causes, evidence to gather, and remediation steps." ), "triage": [], "evidence_to_collect": [], "remediation": [], "metadata": {"host": host}, } def summarize_dict(prefix: str, data: Optional[Dict[str, Any]]) -> str: if not data: return f"{prefix}: (none)" parts = ", ".join(f"{key}={value}" for key, value in sorted(data.items())) return f"{prefix}: {parts}" def determine_host_os(alert: Dict[str, Any]) -> str: """Infer host operating system from labels or defaults.""" labels = alert.get("labels", {}) candidates = [ labels.get("os"), labels.get("platform"), labels.get("system"), alert.get("os"), ] for candidate in candidates: if candidate: value = str(candidate).lower() if "win" in value: return "windows" if any(token in value for token in ("linux", "unix", "darwin")): return "linux" host = (labels.get("host") or labels.get("instance") or "").lower() if host.startswith("win") or host.endswith(".localdomain") and "win" in host: return "windows" inventory_os = infer_os_from_inventory(labels.get("host") or labels.get("instance")) if inventory_os: return inventory_os return TRIAGE_DEFAULT_OS def infer_os_from_inventory(host: Optional[str]) -> Optional[str]: if not host: return None entry = _lookup_inventory(host) if not entry: return None for definition in entry.get("definitions", []): vars_dict = definition.get("vars", {}) or {} connection = (vars_dict.get("ansible_connection") or "").lower() if connection == "winrm": return "windows" for group in entry.get("groups", []): if "windows" in (group or "").lower(): return "windows" return None def truncate_text(text: str, limit: int = TRIAGE_OUTPUT_LIMIT) -> str: """Trim long outputs to keep prompts manageable.""" if not text: return "" cleaned = text.strip() if len(cleaned) <= limit: return cleaned return cleaned[:limit] + "... [truncated]" def gather_command_specs(entry: Dict[str, Any], host_os: str) -> List[Dict[str, Any]]: """Collect command specs from triage steps and optional automation sections.""" specs: List[Dict[str, Any]] = [] for step in entry.get("triage", []): cmd = step.get(host_os) if not cmd: continue specs.append( { "summary": step.get("summary") or entry.get("name") or "triage", "shell": cmd, "runner": step.get("runner"), "os": host_os, } ) for item in entry.get("automation_commands", []): target_os = item.get("os", host_os) if target_os and target_os.lower() != host_os: continue specs.append(item) if TRIAGE_MAX_COMMANDS > 0: return specs[:TRIAGE_MAX_COMMANDS] return specs def build_runner_command( rendered_command: str, runner: str, context: Dict[str, Any], spec: Dict[str, Any], ) -> Tuple[Any, str, bool, str]: """Return the subprocess args, display string, shell flag, and runner label.""" runner = runner or TRIAGE_COMMAND_RUNNER runner = runner.lower() if runner == "ssh": host = spec.get("host") or context.get("ssh_host") or context.get("host") if not host: raise RuntimeError("Host not provided for ssh runner.") ssh_user = spec.get("ssh_user") or context.get("ssh_user") or TRIAGE_SSH_USER ssh_target = spec.get("ssh_target") or f"{ssh_user}@{host}" ssh_options = list(TRIAGE_SSH_OPTIONS) common_args = spec.get("ssh_common_args") or context.get("ssh_common_args") if common_args: ssh_options.extend(shlex.split(common_args)) ssh_port = spec.get("ssh_port") or context.get("ssh_port") if ssh_port: ssh_options.extend(["-p", str(ssh_port)]) identity_file = spec.get("ssh_identity_file") or context.get("ssh_identity_file") if identity_file: ssh_options.extend(["-i", identity_file]) command_list = ["ssh", *ssh_options, ssh_target, rendered_command] display = " ".join(shlex.quote(part) for part in command_list) return command_list, display, False, "ssh" # default to local shell execution display = rendered_command return rendered_command, display, True, "local" def run_subprocess_command( command: Any, display: str, summary: str, use_shell: bool, runner_label: str, ) -> Dict[str, Any]: """Execute subprocess command and capture results.""" LOGGER.info("Executing command (%s) via %s: %s", summary, runner_label, display) try: completed = subprocess.run( command, capture_output=True, text=True, timeout=TRIAGE_COMMAND_TIMEOUT, shell=use_shell, check=False, ) result = { "summary": summary, "command": display, "runner": runner_label, "exit_code": completed.returncode, "stdout": (completed.stdout or "").strip(), "stderr": (completed.stderr or "").strip(), "status": "ok" if completed.returncode == 0 else "failed", } log_verbose(f"Command result ({summary})", result) return result except subprocess.TimeoutExpired as exc: result = { "summary": summary, "command": display, "runner": runner_label, "exit_code": None, "stdout": truncate_text((exc.stdout or "").strip()), "stderr": truncate_text((exc.stderr or "").strip()), "status": "timeout", "error": f"Command timed out after {TRIAGE_COMMAND_TIMEOUT}s", } log_verbose(f"Command timeout ({summary})", result) return result except Exception as exc: # pylint: disable=broad-except LOGGER.exception("Command execution failed (%s): %s", summary, exc) result = { "summary": summary, "command": display, "runner": runner_label, "exit_code": None, "stdout": "", "stderr": "", "status": "error", "error": str(exc), } log_verbose(f"Command error ({summary})", result) return result def run_command_spec(spec: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: summary = spec.get("summary") or spec.get("name") or "command" shell_cmd = spec.get("shell") if not shell_cmd: return {"summary": summary, "status": "skipped", "error": "No shell command provided."} rendered = render_template(shell_cmd, context) preferred_runner = context.get("preferred_runner") runner_choice = (spec.get("runner") or preferred_runner or TRIAGE_COMMAND_RUNNER).lower() try: command, display, use_shell, runner_label = build_runner_command(rendered, runner_choice, context, spec) except RuntimeError as exc: LOGGER.warning("Skipping command '%s': %s", summary, exc) return {"summary": summary, "status": "skipped", "error": str(exc), "command": rendered} return run_subprocess_command(command, display, summary, use_shell, runner_label) def execute_triage_commands(entry: Dict[str, Any], alert: Dict[str, Any], context: Dict[str, Any]) -> List[Dict[str, Any]]: host_os = context.get("host_os") or determine_host_os(alert) context["host_os"] = host_os specs = gather_command_specs(entry, host_os) if not specs: LOGGER.info("No triage commands defined for host_os=%s", host_os) return [] if not TRIAGE_ENABLE_COMMANDS: LOGGER.info("Command execution disabled; %d commands queued but skipped.", len(specs)) return [] LOGGER.info("Executing up to %d triage commands for host_os=%s", len(specs), host_os) results = [] for spec in specs: results.append(run_command_spec(spec, context)) return results def format_command_results_for_llm(results: List[Dict[str, Any]]) -> str: lines: List[str] = [] for idx, result in enumerate(results, start=1): lines.append(f"{idx}. {result.get('summary')} [{result.get('status')}] {result.get('command')}") stdout = result.get("stdout") stderr = result.get("stderr") error = result.get("error") if stdout: lines.append(" stdout:") lines.append(indent(truncate_text(stdout), " ")) if stderr: lines.append(" stderr:") lines.append(indent(truncate_text(stderr), " ")) if error and result.get("status") != "ok": lines.append(f" error: {error}") if not lines: return "No command results were available." return "\n".join(lines) def parse_structured_response(text: str) -> Optional[Dict[str, Any]]: cleaned = text.strip() try: return json.loads(cleaned) except json.JSONDecodeError: start = cleaned.find("{") end = cleaned.rfind("}") if start != -1 and end != -1 and end > start: snippet = cleaned[start : end + 1] try: return json.loads(snippet) except json.JSONDecodeError: return None return None def normalize_followup_command(item: Dict[str, Any]) -> Dict[str, Any]: return { "summary": item.get("summary") or item.get("name") or "Follow-up command", "shell": item.get("command") or item.get("shell"), "runner": item.get("runner"), "host": item.get("host") or item.get("target"), "ssh_user": item.get("ssh_user"), "os": (item.get("os") or item.get("platform") or "").lower() or None, } def investigate_with_langchain( entry: Dict[str, Any], alert: Dict[str, Any], parent_payload: Dict[str, Any], context: Dict[str, Any], initial_outputs: List[Dict[str, Any]], ) -> Tuple[str, List[Dict[str, Any]]]: command_outputs = list(initial_outputs) prompt = build_prompt(entry, alert, parent_payload, context, command_outputs) log_verbose("Initial investigation prompt", prompt) if not OPENROUTER_API_KEY: return "OPENROUTER_API_KEY is not configured; unable to analyze alert.", command_outputs llm = OpenRouterLLM(api_key=OPENROUTER_API_KEY, model_name=OPENROUTER_MODEL) dialogue = ( prompt + "\n\nRespond with JSON containing fields analysis, followup_commands, and complete. " "Request commands only when more evidence is required." ) total_followup = 0 final_summary = "" for iteration in range(TRIAGE_MAX_ITERATIONS): log_verbose(f"LLM dialogue iteration {iteration + 1}", dialogue) llm_text = llm(dialogue) log_verbose(f"LLM iteration {iteration + 1} output", llm_text) dialogue += f"\nAssistant:\n{llm_text}\n" parsed = parse_structured_response(llm_text) if parsed: log_verbose(f"LLM iteration {iteration + 1} parsed response", parsed) if not parsed: final_summary = llm_text break analysis = parsed.get("analysis") or "" followups = parsed.get("followup_commands") or parsed.get("commands") or [] final_summary = analysis complete_flag = bool(parsed.get("complete")) if complete_flag or not followups: break log_verbose(f"LLM iteration {iteration + 1} requested follow-ups", followups) allowed = max(0, TRIAGE_FOLLOWUP_MAX_COMMANDS - total_followup) if not TRIAGE_ENABLE_COMMANDS or allowed <= 0: dialogue += ( "\nUser:\nCommand execution is disabled or budget exhausted. Provide final analysis with JSON format.\n" ) continue normalized_cmds: List[Dict[str, Any]] = [] for raw in followups: if not isinstance(raw, dict): continue normalized = normalize_followup_command(raw) if not normalized.get("shell"): continue cmd_os = normalized.get("os") if cmd_os and cmd_os != context.get("host_os"): continue normalized_cmds.append(normalized) log_verbose(f"Normalized follow-up commands (iteration {iteration + 1})", normalized_cmds) if not normalized_cmds: dialogue += "\nUser:\nNo valid commands to run. Finalize analysis in JSON format.\n" continue normalized_cmds = normalized_cmds[:allowed] executed_batch: List[Dict[str, Any]] = [] for spec in normalized_cmds: executed = run_command_spec(spec, context) command_outputs.append(executed) executed_batch.append(executed) total_followup += 1 result_text = "Follow-up command results:\n" + format_command_results_for_llm(executed_batch) dialogue += ( "\nUser:\n" + result_text + "\nUpdate your analysis and respond with JSON (analysis, followup_commands, complete).\n" ) log_verbose("Executed follow-up commands", result_text) else: final_summary = final_summary or "Reached maximum iterations without a conclusive response." if not final_summary: final_summary = "LLM did not return a valid analysis." log_verbose("Final LLM summary", final_summary) return final_summary, command_outputs def build_context(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]: labels = alert.get("labels", {}) annotations = alert.get("annotations", {}) context = { "alertname": labels.get("alertname") or alert.get("title") or parent_payload.get("title") or parent_payload.get("ruleName"), "host": labels.get("host") or labels.get("instance"), "iface": labels.get("interface"), "device": labels.get("device"), "vmid": labels.get("vmid"), "status": alert.get("status") or parent_payload.get("status"), "value": alert.get("value") or annotations.get("value"), "rule_url": alert.get("ruleUrl") or parent_payload.get("ruleUrl"), } context.setdefault("ssh_user", TRIAGE_SSH_USER) return context def build_prompt( entry: Dict[str, Any], alert: Dict[str, Any], parent_payload: Dict[str, Any], context: Dict[str, Any], command_outputs: Optional[List[Dict[str, Any]]] = None, ) -> str: template = entry.get("llm_prompt", "Alert {{ alertname }} fired for {{ host }}.") rendered_template = render_template(template, {k: v or "" for k, v in context.items()}) evidence = entry.get("evidence_to_collect", []) triage_steps = entry.get("triage", []) remediation = entry.get("remediation", []) lines = [ rendered_template.strip(), "", "Alert payload summary:", f"- Status: {context.get('status') or alert.get('status')}", f"- Host: {context.get('host')}", f"- Value: {context.get('value')}", f"- StartsAt: {alert.get('startsAt')}", f"- EndsAt: {alert.get('endsAt')}", f"- RuleURL: {context.get('rule_url')}", f"- Host OS (inferred): {context.get('host_os')}", "- Note: All timestamps are UTC/RFC3339 as provided by Grafana.", summarize_dict("- Labels", alert.get("labels")), summarize_dict("- Annotations", alert.get("annotations")), ] if evidence: lines.append("") lines.append("Evidence to gather (for automation reference):") for item in evidence: lines.append(f"- {item}") if triage_steps: lines.append("") lines.append("Suggested manual checks:") for step in triage_steps: summary = step.get("summary") linux = step.get("linux") windows = step.get("windows") lines.append(f"- {summary}") if linux: lines.append(f" Linux: {linux}") if windows: lines.append(f" Windows: {windows}") if remediation: lines.append("") lines.append("Remediation ideas:") for item in remediation: lines.append(f"- {item}") if command_outputs: lines.append("") lines.append("Command execution results:") for result in command_outputs: status = result.get("status", "unknown") cmd_display = result.get("command", "") lines.append(f"- {result.get('summary')} [{status}] {cmd_display}") stdout = result.get("stdout") stderr = result.get("stderr") error = result.get("error") if stdout: lines.append(" stdout:") lines.append(indent(truncate_text(stdout), " ")) if stderr: lines.append(" stderr:") lines.append(indent(truncate_text(stderr), " ")) if error and status != "ok": lines.append(f" error: {error}") return "\n".join(lines).strip() def get_alerts(payload: Dict[str, Any]) -> List[Dict[str, Any]]: alerts = payload.get("alerts") if isinstance(alerts, list) and alerts: return alerts return [payload] @app.on_event("startup") def startup_event() -> None: global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS _RUNBOOK_INDEX = load_runbook() _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory() LOGGER.info( "Alert webhook server ready with %d runbook entries and %d inventory hosts.", len(_RUNBOOK_INDEX), len(_INVENTORY_INDEX), ) @app.post("/alerts") async def handle_alert(request: Request) -> Dict[str, Any]: payload = await request.json() LOGGER.info("Received Grafana payload: %s", json.dumps(payload, indent=2, sort_keys=True)) results = [] unmatched = [] for alert in get_alerts(payload): LOGGER.info("Processing alert: %s", json.dumps(alert, indent=2, sort_keys=True)) unmatched_reason: Optional[str] = None alert_status = str(alert.get("status") or payload.get("status") or "").lower() if alert_status and alert_status != "firing": details = {"reason": "non_firing_status", "status": alert_status, "alert": alert} unmatched.append(details) LOGGER.info("Skipping alert with status=%s (only 'firing' alerts are processed).", alert_status) continue rule_uid = extract_rule_uid(alert, payload) if not rule_uid: unmatched_reason = "missing_rule_uid" derived_uid = derive_fallback_rule_uid(alert, payload) details = {"reason": unmatched_reason, "derived_rule_uid": derived_uid, "alert": alert} unmatched.append(details) LOGGER.warning("Alert missing rule UID, using fallback identifier %s", derived_uid) rule_uid = derived_uid entry = _RUNBOOK_INDEX.get(rule_uid) runbook_matched = entry is not None if not entry: unmatched_reason = unmatched_reason or "no_runbook_entry" details = {"reason": unmatched_reason, "rule_uid": rule_uid, "alert": alert} unmatched.append(details) LOGGER.warning("No runbook entry for rule_uid=%s, using generic fallback.", rule_uid) entry = build_fallback_runbook_entry(alert, payload) context = build_context(alert, payload) context["host_os"] = determine_host_os(alert) context["rule_uid"] = rule_uid apply_inventory_context(context) initial_outputs = execute_triage_commands(entry, alert, context) try: llm_text, command_outputs = investigate_with_langchain(entry, alert, payload, context, initial_outputs) except Exception as exc: # pylint: disable=broad-except LOGGER.exception("Investigation failed for rule_uid=%s: %s", rule_uid, exc) raise HTTPException(status_code=502, detail=f"LLM investigation error: {exc}") from exc result = { "rule_uid": rule_uid, "alertname": entry.get("name"), "host": alert.get("labels", {}).get("host"), "llm_summary": llm_text, "command_results": command_outputs, "runbook_matched": runbook_matched, } if not runbook_matched and unmatched_reason: result["fallback_reason"] = unmatched_reason results.append(result) send_summary_email(alert, result, context) return {"processed": len(results), "results": results, "unmatched": unmatched} @app.post("/reload-runbook") def reload_runbook() -> Dict[str, Any]: global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS _RUNBOOK_INDEX = load_runbook() _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory() return {"entries": len(_RUNBOOK_INDEX), "inventory_hosts": len(_INVENTORY_INDEX)}