# docker-stacks/stacks/mllogwatcher/scripts/grafana_alert_webhook.py
# Snapshot metadata: 2025-12-31 20:11:44 -05:00 — 989 lines, 39 KiB, executable Python file.
#!/usr/bin/env python3
"""
Minimal FastAPI web server that accepts Grafana alert webhooks, looks up the
matching runbook entry, builds an LLM prompt, and calls OpenRouter to return a
triage summary.
Run with:
uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081
Environment variables:
RUNBOOK_PATH Path to alert_runbook.yaml (default: ./alert_runbook.yaml)
OPENROUTER_API_KEY Required; API token for https://openrouter.ai
OPENROUTER_MODEL Optional; default openai/gpt-4o-mini
OPENROUTER_REFERER Optional referer header
OPENROUTER_TITLE Optional title header (default: Grafana Alert Webhook)
"""
from __future__ import annotations
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import json
import shlex
import subprocess
from textwrap import indent
import smtplib
from email.message import EmailMessage
import requests
import yaml
from fastapi import FastAPI, HTTPException, Request
from langchain.llms.base import LLM
LOGGER = logging.getLogger("grafana_webhook")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Input files: the runbook YAML keyed by Grafana rule UID, and an optional
# INI-style Ansible inventory used to resolve SSH connection details per host.
RUNBOOK_PATH = Path(os.environ.get("RUNBOOK_PATH", "alert_runbook.yaml"))
ANSIBLE_HOSTS_PATH = Path(os.environ.get("ANSIBLE_HOSTS_PATH", "/etc/ansible/hosts"))
# OpenRouter credentials and optional attribution headers for the LLM calls.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini")
OPENROUTER_REFERER = os.environ.get("OPENROUTER_REFERER")
OPENROUTER_TITLE = os.environ.get("OPENROUTER_TITLE", "Grafana Alert Webhook")
# Global kill switch: triage/follow-up commands run only when this is truthy.
TRIAGE_ENABLE_COMMANDS = os.environ.get("TRIAGE_ENABLE_COMMANDS", "0").lower() in {"1", "true", "yes", "on"}
# Default command runner ("ssh" or "local") plus SSH connection defaults.
TRIAGE_COMMAND_RUNNER = os.environ.get("TRIAGE_COMMAND_RUNNER", "ssh").lower()
TRIAGE_SSH_USER = os.environ.get("TRIAGE_SSH_USER", "root")
TRIAGE_SSH_OPTIONS = shlex.split(
    os.environ.get("TRIAGE_SSH_OPTIONS", "-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5")
)
TRIAGE_COMMAND_TIMEOUT = int(os.environ.get("TRIAGE_COMMAND_TIMEOUT", "60"))  # seconds per command
TRIAGE_DEFAULT_OS = os.environ.get("TRIAGE_DEFAULT_OS", "linux").lower()  # fallback when OS can't be inferred
TRIAGE_MAX_COMMANDS = int(os.environ.get("TRIAGE_MAX_COMMANDS", "3"))  # cap on initial triage commands
TRIAGE_OUTPUT_LIMIT = int(os.environ.get("TRIAGE_OUTPUT_LIMIT", "1200"))  # max chars kept per command output
# LangChain-driven investigation loop
TRIAGE_MAX_ITERATIONS = int(os.environ.get("TRIAGE_MAX_ITERATIONS", "3"))  # LLM round-trips per alert
TRIAGE_FOLLOWUP_MAX_COMMANDS = int(os.environ.get("TRIAGE_FOLLOWUP_MAX_COMMANDS", "4"))  # follow-up command budget
TRIAGE_SYSTEM_PROMPT = os.environ.get(
    "TRIAGE_SYSTEM_PROMPT",
    (
        "You are assisting with on-call investigations. Always reply with JSON containing:\n"
        "analysis: your findings and next steps.\n"
        "followup_commands: list of command specs (summary, command, optional runner/os) to gather more data.\n"
        "complete: true when sufficient information is gathered.\n"
        "Request commands only when more evidence is required."
    ),
)
TRIAGE_VERBOSE_LOGS = os.environ.get("TRIAGE_VERBOSE_LOGS", "0").lower() in {"1", "true", "yes", "on"}
# Optional SMTP notification settings; validated by email_notifications_configured().
TRIAGE_EMAIL_ENABLED = os.environ.get("TRIAGE_EMAIL_ENABLED", "0").lower() in {"1", "true", "yes", "on"}
TRIAGE_EMAIL_FROM = os.environ.get("TRIAGE_EMAIL_FROM")
TRIAGE_EMAIL_TO = [addr.strip() for addr in os.environ.get("TRIAGE_EMAIL_TO", "").split(",") if addr.strip()]
TRIAGE_SMTP_HOST = os.environ.get("TRIAGE_SMTP_HOST")
TRIAGE_SMTP_PORT = int(os.environ.get("TRIAGE_SMTP_PORT", "587"))
TRIAGE_SMTP_USER = os.environ.get("TRIAGE_SMTP_USER")
TRIAGE_SMTP_PASSWORD = os.environ.get("TRIAGE_SMTP_PASSWORD")
TRIAGE_SMTP_STARTTLS = os.environ.get("TRIAGE_SMTP_STARTTLS", "1").lower() in {"1", "true", "yes", "on"}
TRIAGE_SMTP_SSL = os.environ.get("TRIAGE_SMTP_SSL", "0").lower() in {"1", "true", "yes", "on"}
TRIAGE_SMTP_TIMEOUT = int(os.environ.get("TRIAGE_SMTP_TIMEOUT", "20"))  # seconds for SMTP connect/IO
def log_verbose(title: str, content: Any) -> None:
    """Emit structured verbose logs when TRIAGE_VERBOSE_LOGS is enabled."""
    if not TRIAGE_VERBOSE_LOGS:
        return
    # Pretty-print structured payloads; everything else is stringified as-is.
    rendered = (
        json.dumps(content, indent=2, sort_keys=True)
        if isinstance(content, (dict, list))
        else str(content)
    )
    LOGGER.info("%s:\n%s", title, rendered)
def email_notifications_configured() -> bool:
    """Return True only when email alerts are enabled and SMTP settings are complete."""
    if not TRIAGE_EMAIL_ENABLED:
        return False
    if TRIAGE_SMTP_HOST and TRIAGE_EMAIL_FROM and TRIAGE_EMAIL_TO:
        return True
    # Enabled but misconfigured: warn so the operator can fix the environment.
    LOGGER.warning(
        "Email notifications requested but TRIAGE_SMTP_HOST/TRIAGE_EMAIL_FROM/TRIAGE_EMAIL_TO are incomplete."
    )
    return False
def format_command_results_for_email(results: List[Dict[str, Any]]) -> str:
    """Render executed command results as plain text for the summary email."""
    if not results:
        return "No automation commands were executed."
    rendered: List[str] = []
    for item in results:
        rendered.append(f"- {item.get('summary')} [{item.get('status')}] {item.get('command')}")
        out = item.get("stdout")
        err = item.get("stderr")
        failure = item.get("error")
        if out:
            rendered.append(indent(truncate_text(out, 800), " stdout: "))
        if err:
            rendered.append(indent(truncate_text(err, 800), " stderr: "))
        if failure and item.get("status") != "ok":
            rendered.append(f" error: {failure}")
    return "\n".join(rendered)
def build_email_body(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> str:
    """Compose the plain-text body of the triage summary email."""
    value = alert.get("value") or alert.get("annotations", {}).get("value")
    header = [
        f"Alert: {result.get('alertname')} ({result.get('rule_uid')})",
        f"Host: {result.get('host') or context.get('host')}",
        f"Status: {alert.get('status')}",
        f"Value: {value}",
        f"Grafana Rule: {context.get('rule_url')}",
    ]
    summary_section = ["", "LLM Summary:", result.get("llm_summary") or "(no summary returned)"]
    commands_section = [
        "",
        "Command Results:",
        format_command_results_for_email(result.get("command_results") or []),
    ]
    return "\n".join(header + summary_section + commands_section)
def send_summary_email(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> None:
    """Email the triage result for one alert; no-op when email is unconfigured.

    Any SMTP failure is logged and swallowed so a broken mail relay cannot fail
    the webhook request that triggered the notification.
    """
    if not email_notifications_configured():
        return
    subject_host = result.get("host") or context.get("host") or "(unknown host)"
    subject = f"[Grafana] {result.get('alertname')} - {subject_host}"
    body = build_email_body(alert, result, context)
    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = TRIAGE_EMAIL_FROM
    message["To"] = ", ".join(TRIAGE_EMAIL_TO)
    message.set_content(body)
    try:
        # SMTP_SSL for implicit TLS; otherwise plain SMTP, optionally upgraded
        # via STARTTLS (skipped when the connection is already SSL).
        smtp_class = smtplib.SMTP_SSL if TRIAGE_SMTP_SSL else smtplib.SMTP
        with smtp_class(TRIAGE_SMTP_HOST, TRIAGE_SMTP_PORT, timeout=TRIAGE_SMTP_TIMEOUT) as client:
            if TRIAGE_SMTP_STARTTLS and not TRIAGE_SMTP_SSL:
                client.starttls()
            if TRIAGE_SMTP_USER:
                client.login(TRIAGE_SMTP_USER, TRIAGE_SMTP_PASSWORD or "")
            client.send_message(message)
        LOGGER.info("Sent summary email to %s for host %s", ", ".join(TRIAGE_EMAIL_TO), subject_host)
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.exception("Failed to send summary email: %s", exc)
# FastAPI application instance served by uvicorn (see module docstring).
app = FastAPI(title="Grafana Alert Webhook", version="1.0.0")
# In-memory caches populated at startup and refreshed via POST /reload-runbook.
_RUNBOOK_INDEX: Dict[str, Dict[str, Any]] = {}  # rule_uid -> runbook entry
_INVENTORY_INDEX: Dict[str, Dict[str, Any]] = {}  # normalized hostname -> inventory record
_INVENTORY_GROUP_VARS: Dict[str, Dict[str, str]] = {}  # group name -> [group:vars] values
# Matches {{ var }} placeholders consumed by render_template.
_TEMPLATE_PATTERN = re.compile(r"{{\s*([a-zA-Z0-9_]+)\s*}}")
# System message sent with every OpenRouter chat completion.
DEFAULT_SYSTEM_PROMPT = TRIAGE_SYSTEM_PROMPT
class OpenRouterLLM(LLM):
    """LangChain-compatible LLM that calls OpenRouter chat completions.

    Sends DEFAULT_SYSTEM_PROMPT as the system message plus the caller's prompt,
    and returns the first choice's content. Raises RuntimeError on HTTP errors
    or when the response contains no choices.
    """

    # Declared as class-level fields so the LangChain/pydantic base class
    # accepts them as model attributes.
    api_key: str
    model_name: str

    def __init__(self, api_key: str, model_name: str, **kwargs: Any) -> None:
        super().__init__(api_key=api_key, model_name=model_name, **kwargs)

    @property
    def _llm_type(self) -> str:
        return "openrouter"

    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # Invoke _call directly, bypassing the base class's callback plumbing.
        return self._call(prompt, stop=stop)

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Perform one chat-completion request and return the reply text."""
        payload = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        }
        log_verbose("OpenRouter request payload", payload)
        if stop:
            payload["stop"] = stop
        LOGGER.info("Posting to OpenRouter model=%s via LangChain", self.model_name)
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        # Optional attribution headers recognized by OpenRouter.
        if OPENROUTER_REFERER:
            headers["HTTP-Referer"] = OPENROUTER_REFERER
        if OPENROUTER_TITLE:
            headers["X-Title"] = OPENROUTER_TITLE
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers, timeout=90)
        if response.status_code >= 400:
            # Prefer the structured error body; fall back to raw text.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            raise RuntimeError(f"OpenRouter error {response.status_code}: {detail}")
        data = response.json()
        log_verbose("OpenRouter raw response", data)
        choices = data.get("choices")
        if not choices:
            raise RuntimeError("OpenRouter returned no choices")
        return choices[0]["message"]["content"].strip()
def load_runbook() -> Dict[str, Dict[str, Any]]:
    """Load runbook YAML into a dict keyed by rule_uid."""
    if not RUNBOOK_PATH.exists():
        raise FileNotFoundError(f"Runbook file not found: {RUNBOOK_PATH}")
    with RUNBOOK_PATH.open("r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle) or {}
    index: Dict[str, Dict[str, Any]] = {}
    # Entries without a rule_uid cannot be matched to alerts and are skipped.
    for entry in document.get("alerts", []):
        uid = entry.get("rule_uid")
        if uid:
            index[str(uid)] = entry
    LOGGER.info("Loaded %d runbook entries from %s", len(index), RUNBOOK_PATH)
    return index
def _normalize_host_key(host: str) -> str:
    """Canonicalize a hostname for inventory lookups: trimmed and lowercased."""
    return host.lower().strip()
def _parse_key_value_tokens(tokens: List[str]) -> Dict[str, str]:
    """Turn `key=value` tokens into a dict, ignoring tokens without '='."""
    # Split on the first '=' only, so values may themselves contain '='.
    pairs = (token.split("=", 1) for token in tokens if "=" in token)
    return {key: value for key, value in pairs}
def load_ansible_inventory() -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, str]]]:
    """Parse a simple INI-style Ansible hosts file into host/group maps.

    Returns (hosts, group_vars): hosts maps a normalized hostname to
    {"name", "definitions", "groups"}; group_vars maps a group name to the
    key/value pairs from its `[group:vars]` section. Only plain INI syntax is
    handled — `[group]`, `[group:<suffix>]`, comments, and
    `host key=value ...` lines. Returns empty maps when the file is missing.
    """
    if not ANSIBLE_HOSTS_PATH.exists():
        LOGGER.warning("Ansible inventory not found at %s", ANSIBLE_HOSTS_PATH)
        return {}, {}
    hosts: Dict[str, Dict[str, Any]] = {}
    group_vars: Dict[str, Dict[str, str]] = {}
    current_group: Optional[str] = None
    current_section: str = "hosts"  # "hosts" or the suffix of [group:<suffix>]
    with ANSIBLE_HOSTS_PATH.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("[") and line.endswith("]"):
                # Section header: either [group] or [group:vars]/[group:children].
                header = line[1:-1].strip()
                if ":" in header:
                    group_name, suffix = header.split(":", 1)
                    current_group = group_name
                    current_section = suffix
                else:
                    current_group = header
                    current_section = "hosts"
                group_vars.setdefault(current_group, {})
                continue
            # Drop trailing inline comments before tokenizing.
            cleaned = line.split("#", 1)[0].strip()
            if not cleaned:
                continue
            tokens = shlex.split(cleaned)
            if not tokens:
                continue
            if current_section == "vars":
                # Vars outside any group header accumulate under "all".
                vars_dict = _parse_key_value_tokens(tokens)
                group_vars.setdefault(current_group or "all", {}).update(vars_dict)
                continue
            # Host line: first token is the hostname, remaining tokens are vars.
            host_token = tokens[0]
            host_key = _normalize_host_key(host_token)
            entry = hosts.setdefault(host_key, {"name": host_token, "definitions": [], "groups": set()})
            vars_dict = _parse_key_value_tokens(tokens[1:])
            # A host may appear in several groups; keep each definition separately.
            entry["definitions"].append({"group": current_group, "vars": vars_dict})
            if current_group:
                entry["groups"].add(current_group)
    LOGGER.info("Loaded %d Ansible inventory hosts from %s", len(hosts), ANSIBLE_HOSTS_PATH)
    return hosts, group_vars
def _lookup_inventory(host: Optional[str]) -> Optional[Dict[str, Any]]:
    """Find the inventory record for *host*, falling back to its short name."""
    if not host:
        return None
    key = _normalize_host_key(host)
    found = _INVENTORY_INDEX.get(key)
    if found:
        return found
    # FQDN miss: retry with the domain suffix stripped.
    short = key.split(".", 1)[0]
    return _INVENTORY_INDEX.get(short) if short != key else None
def _merge_group_vars(groups: List[str], host_os: Optional[str]) -> Dict[str, str]:
    """Layer group vars over the global "all" vars for this host's groups.

    WinRM-connected groups are skipped for linux hosts so Windows connection
    settings don't bleed into SSH-based execution.
    """
    merged: Dict[str, str] = dict(_INVENTORY_GROUP_VARS.get("all") or {})
    os_name = (host_os or "").lower()
    for group in groups:
        group_vars = _INVENTORY_GROUP_VARS.get(group)
        if not group_vars:
            continue
        is_winrm = (group_vars.get("ansible_connection") or "").lower() == "winrm"
        if is_winrm and os_name == "linux":
            continue
        merged.update(group_vars)
    return merged
def _should_include_definition(group: Optional[str], vars_dict: Dict[str, str], host_os: Optional[str]) -> bool:
    """Decide whether one host definition's vars apply for the inferred OS."""
    if not vars_dict:
        return False
    os_name = (host_os or "").lower()
    connection = (vars_dict.get("ansible_connection") or "").lower()
    if connection == "winrm":
        # WinRM definitions are only usable on Windows hosts.
        return os_name == "windows"
    if connection == "local":
        return True
    # No explicit connection: exclude "windows"-named groups for linux hosts.
    if not connection and group and "windows" in group.lower() and os_name == "linux":
        return False
    return True
def apply_inventory_context(context: Dict[str, Any]) -> None:
    """Augment the alert context with SSH metadata from the Ansible inventory.

    Mutates *context* in place: fills ssh_host/ssh_user/ssh_port/
    ssh_common_args/ssh_identity_file, records the host's inventory groups, and
    hints the preferred runner ("local" vs "ssh"). No-op when the host is not
    present in the inventory.

    Fixes vs. the previous revision: connection details are now extracted
    AFTER the WinRM-var scrub (previously a WinRM ansible_port such as 5986
    had already leaked into ssh_port before being popped), and the duplicated
    inventory_groups setdefault was removed.
    """
    host = context.get("host")
    entry = _lookup_inventory(host)
    if not entry:
        return
    merged_vars = _merge_group_vars(list(entry.get("groups", [])), context.get("host_os"))
    for definition in entry.get("definitions", []):
        group_name = definition.get("group")
        vars_dict = definition.get("vars", {})
        # Host-level vars override group vars, but only when they make sense
        # for the inferred OS (e.g. skip WinRM definitions for linux hosts).
        if _should_include_definition(group_name, vars_dict, context.get("host_os")):
            merged_vars.update(vars_dict)
    connection = (merged_vars.get("ansible_connection") or "").lower()
    host_os = (context.get("host_os") or "").lower()
    if connection == "winrm" and host_os != "windows":
        # Scrub WinRM-specific settings that leaked in for a non-Windows host
        # so they cannot be mistaken for SSH settings below.
        for key in (
            "ansible_connection",
            "ansible_port",
            "ansible_password",
            "ansible_winrm_server_cert_validation",
            "ansible_winrm_scheme",
        ):
            merged_vars.pop(key, None)
        connection = ""
    # Extract connection details only after the scrub above.
    ansible_host = merged_vars.get("ansible_host") or entry.get("name")
    ansible_user = merged_vars.get("ansible_user")
    ansible_port = merged_vars.get("ansible_port")
    ssh_common_args = merged_vars.get("ansible_ssh_common_args")
    ssh_key = merged_vars.get("ansible_ssh_private_key_file")
    context.setdefault("ssh_host", ansible_host or host)
    if ansible_user:
        context["ssh_user"] = ansible_user
    if ansible_port:
        context["ssh_port"] = ansible_port
    if ssh_common_args:
        context["ssh_common_args"] = ssh_common_args
    if ssh_key:
        context["ssh_identity_file"] = ssh_key
    context.setdefault("inventory_groups", list(entry.get("groups", [])))
    if connection == "local":
        context.setdefault("preferred_runner", "local")
    elif connection in {"", "ssh", "smart"}:
        context.setdefault("preferred_runner", "ssh")
def render_template(template: str, context: Dict[str, Any]) -> str:
    """Very small mustache-style renderer for {{ var }} placeholders.

    Unknown placeholders are left untouched; known values are stringified.
    """
    def substitute(match: re.Match[str]) -> str:
        name = match.group(1)
        if name in context:
            return str(context[name])
        return match.group(0)
    return re.sub(r"{{\s*([a-zA-Z0-9_]+)\s*}}", substitute, template)
def extract_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Optional[str]:
    """Grafana webhooks may include rule UID in different fields."""
    labels = alert.get("labels", {})
    # Check direct fields first, then labels, then the parent payload.
    for candidate in (
        alert.get("ruleUid"),
        alert.get("rule_uid"),
        alert.get("ruleId"),
        alert.get("uid"),
        labels.get("rule_uid"),
        labels.get("ruleUid"),
        parent_payload.get("ruleUid"),
        parent_payload.get("rule_uid"),
        parent_payload.get("ruleId"),
    ):
        if candidate:
            return str(candidate)
    # Fall back to Grafana URL parsing if present
    url = (
        alert.get("ruleUrl")
        or parent_payload.get("ruleUrl")
        or alert.get("generatorURL")
        or parent_payload.get("generatorURL")
    )
    if url and "/alerting/" in url:
        return url.rstrip("/").split("/")[-2]
    return None
def derive_fallback_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> str:
    """Construct a deterministic identifier when Grafana omits rule UIDs."""
    labels = alert.get("labels", {})
    fallbacks = (
        alert.get("fingerprint"),
        labels.get("alertname"),
        labels.get("host"),
        labels.get("instance"),
        parent_payload.get("groupKey"),
        parent_payload.get("title"),
    )
    # First truthy candidate wins; a constant sentinel when nothing matches.
    return next((str(value) for value in fallbacks if value), "unknown-alert")
def build_fallback_runbook_entry(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Return a generic runbook entry so every alert can be processed."""
    labels = alert.get("labels", {})
    alertname = labels.get("alertname") or parent_payload.get("title") or "Grafana Alert"
    host = labels.get("host") or labels.get("instance") or "(unknown host)"
    generic_prompt = (
        "Grafana alert {{ alertname }} fired for {{ host }}.\n"
        "No dedicated runbook entry exists. Use the payload details, command outputs, "
        "and your own reasoning to propose likely causes, evidence to gather, and remediation steps."
    )
    return {
        "name": f"{alertname} (auto)",
        "llm_prompt": generic_prompt,
        "triage": [],
        "evidence_to_collect": [],
        "remediation": [],
        "metadata": {"host": host},
    }
def summarize_dict(prefix: str, data: Optional[Dict[str, Any]]) -> str:
    """Render a dict as 'prefix: k=v, ...' (key-sorted), or 'prefix: (none)'."""
    if not data:
        return f"{prefix}: (none)"
    rendered = ", ".join(f"{key}={value}" for key, value in sorted(data.items()))
    return f"{prefix}: {rendered}"
def determine_host_os(alert: Dict[str, Any]) -> str:
    """Infer host operating system from labels, hostname, inventory, or default."""
    labels = alert.get("labels", {})
    for candidate in (labels.get("os"), labels.get("platform"), labels.get("system"), alert.get("os")):
        if not candidate:
            continue
        lowered = str(candidate).lower()
        if "win" in lowered:
            return "windows"
        if "linux" in lowered or "unix" in lowered or "darwin" in lowered:
            return "linux"
    host = (labels.get("host") or labels.get("instance") or "").lower()
    # Parentheses make the original `or`/`and` precedence explicit.
    if host.startswith("win") or (host.endswith(".localdomain") and "win" in host):
        return "windows"
    inventory_os = infer_os_from_inventory(labels.get("host") or labels.get("instance"))
    return inventory_os or TRIAGE_DEFAULT_OS
def infer_os_from_inventory(host: Optional[str]) -> Optional[str]:
    """Guess 'windows' from inventory winrm connections or group names; else None."""
    if not host:
        return None
    entry = _lookup_inventory(host)
    if not entry:
        return None
    definitions = entry.get("definitions", [])
    uses_winrm = any(
        ((definition.get("vars", {}) or {}).get("ansible_connection") or "").lower() == "winrm"
        for definition in definitions
    )
    if uses_winrm:
        return "windows"
    if any("windows" in (group or "").lower() for group in entry.get("groups", [])):
        return "windows"
    # No positive Windows signal: leave the OS undetermined.
    return None
def truncate_text(text: str, limit: int = TRIAGE_OUTPUT_LIMIT) -> str:
    """Trim long outputs to keep prompts manageable."""
    if not text:
        return ""
    trimmed = text.strip()
    # Append an explicit marker so readers know output was cut.
    return trimmed if len(trimmed) <= limit else trimmed[:limit] + "... [truncated]"
def gather_command_specs(entry: Dict[str, Any], host_os: str) -> List[Dict[str, Any]]:
    """Collect command specs from triage steps and optional automation sections."""
    specs: List[Dict[str, Any]] = []
    # Triage steps carry per-OS commands keyed by "linux"/"windows".
    for step in entry.get("triage", []):
        shell_cmd = step.get(host_os)
        if shell_cmd:
            specs.append(
                {
                    "summary": step.get("summary") or entry.get("name") or "triage",
                    "shell": shell_cmd,
                    "runner": step.get("runner"),
                    "os": host_os,
                }
            )
    # Automation commands are included unless tagged for a different OS.
    for item in entry.get("automation_commands", []):
        target_os = item.get("os", host_os)
        if not target_os or target_os.lower() == host_os:
            specs.append(item)
    return specs[:TRIAGE_MAX_COMMANDS] if TRIAGE_MAX_COMMANDS > 0 else specs
def build_runner_command(
    rendered_command: str,
    runner: str,
    context: Dict[str, Any],
    spec: Dict[str, Any],
) -> Tuple[Any, str, bool, str]:
    """Return the subprocess args, display string, shell flag, and runner label.

    The "ssh" runner produces an argv list (shell=False); any other runner
    value falls through to local shell execution of the rendered string
    (shell=True). Spec-level connection settings take precedence over the
    inventory-derived context values. Raises RuntimeError when the ssh runner
    has no target host.
    """
    runner = runner or TRIAGE_COMMAND_RUNNER
    runner = runner.lower()
    if runner == "ssh":
        host = spec.get("host") or context.get("ssh_host") or context.get("host")
        if not host:
            raise RuntimeError("Host not provided for ssh runner.")
        ssh_user = spec.get("ssh_user") or context.get("ssh_user") or TRIAGE_SSH_USER
        ssh_target = spec.get("ssh_target") or f"{ssh_user}@{host}"
        # Start from the global SSH option defaults, then layer on
        # per-spec/per-host extras, port, and identity file.
        ssh_options = list(TRIAGE_SSH_OPTIONS)
        common_args = spec.get("ssh_common_args") or context.get("ssh_common_args")
        if common_args:
            ssh_options.extend(shlex.split(common_args))
        ssh_port = spec.get("ssh_port") or context.get("ssh_port")
        if ssh_port:
            ssh_options.extend(["-p", str(ssh_port)])
        identity_file = spec.get("ssh_identity_file") or context.get("ssh_identity_file")
        if identity_file:
            ssh_options.extend(["-i", identity_file])
        command_list = ["ssh", *ssh_options, ssh_target, rendered_command]
        # Quoted display string is for logs/reports only, not execution.
        display = " ".join(shlex.quote(part) for part in command_list)
        return command_list, display, False, "ssh"
    # default to local shell execution
    display = rendered_command
    return rendered_command, display, True, "local"
def run_subprocess_command(
    command: Any,
    display: str,
    summary: str,
    use_shell: bool,
    runner_label: str,
) -> Dict[str, Any]:
    """Execute subprocess command and capture results.

    Returns a result dict (summary/command/runner/exit_code/stdout/stderr)
    with status "ok", "failed", "timeout", or "error". Never raises: timeouts
    and unexpected execution errors are folded into the result dict so the
    triage loop can keep going.
    """
    LOGGER.info("Executing command (%s) via %s: %s", summary, runner_label, display)
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=TRIAGE_COMMAND_TIMEOUT,
            shell=use_shell,
            check=False,
        )
        result = {
            "summary": summary,
            "command": display,
            "runner": runner_label,
            "exit_code": completed.returncode,
            "stdout": (completed.stdout or "").strip(),
            "stderr": (completed.stderr or "").strip(),
            "status": "ok" if completed.returncode == 0 else "failed",
        }
        log_verbose(f"Command result ({summary})", result)
        return result
    except subprocess.TimeoutExpired as exc:
        # Keep whatever partial output was captured before the timeout.
        result = {
            "summary": summary,
            "command": display,
            "runner": runner_label,
            "exit_code": None,
            "stdout": truncate_text((exc.stdout or "").strip()),
            "stderr": truncate_text((exc.stderr or "").strip()),
            "status": "timeout",
            "error": f"Command timed out after {TRIAGE_COMMAND_TIMEOUT}s",
        }
        log_verbose(f"Command timeout ({summary})", result)
        return result
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.exception("Command execution failed (%s): %s", summary, exc)
        result = {
            "summary": summary,
            "command": display,
            "runner": runner_label,
            "exit_code": None,
            "stdout": "",
            "stderr": "",
            "status": "error",
            "error": str(exc),
        }
        log_verbose(f"Command error ({summary})", result)
        return result
def run_command_spec(spec: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
    """Render one command spec against *context* and execute it via its runner."""
    summary = spec.get("summary") or spec.get("name") or "command"
    shell_cmd = spec.get("shell")
    if not shell_cmd:
        return {"summary": summary, "status": "skipped", "error": "No shell command provided."}
    rendered = render_template(shell_cmd, context)
    # Spec runner beats the inventory-derived preference, which beats the default.
    runner_choice = (spec.get("runner") or context.get("preferred_runner") or TRIAGE_COMMAND_RUNNER).lower()
    try:
        command, display, use_shell, runner_label = build_runner_command(rendered, runner_choice, context, spec)
    except RuntimeError as exc:
        LOGGER.warning("Skipping command '%s': %s", summary, exc)
        return {"summary": summary, "status": "skipped", "error": str(exc), "command": rendered}
    return run_subprocess_command(command, display, summary, use_shell, runner_label)
def execute_triage_commands(entry: Dict[str, Any], alert: Dict[str, Any], context: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Run the runbook's initial triage commands for this alert.

    Honors the TRIAGE_ENABLE_COMMANDS kill switch and records the inferred
    host_os back into *context* for later stages.
    """
    host_os = context.get("host_os") or determine_host_os(alert)
    context["host_os"] = host_os
    specs = gather_command_specs(entry, host_os)
    if not specs:
        LOGGER.info("No triage commands defined for host_os=%s", host_os)
        return []
    if not TRIAGE_ENABLE_COMMANDS:
        LOGGER.info("Command execution disabled; %d commands queued but skipped.", len(specs))
        return []
    LOGGER.info("Executing up to %d triage commands for host_os=%s", len(specs), host_os)
    return [run_command_spec(spec, context) for spec in specs]
def format_command_results_for_llm(results: List[Dict[str, Any]]) -> str:
    """Render executed command results as numbered text for LLM prompts."""
    rendered: List[str] = []
    for position, item in enumerate(results, start=1):
        rendered.append(f"{position}. {item.get('summary')} [{item.get('status')}] {item.get('command')}")
        out = item.get("stdout")
        err = item.get("stderr")
        failure = item.get("error")
        if out:
            rendered.append(" stdout:")
            rendered.append(indent(truncate_text(out), " "))
        if err:
            rendered.append(" stderr:")
            rendered.append(indent(truncate_text(err), " "))
        if failure and item.get("status") != "ok":
            rendered.append(f" error: {failure}")
    return "\n".join(rendered) if rendered else "No command results were available."
def parse_structured_response(text: str) -> Optional[Dict[str, Any]]:
    """Parse an LLM reply as JSON, tolerating prose around a single object."""
    cleaned = text.strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass
    # Retry on the outermost {...} span in case the model wrapped the JSON.
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(cleaned[start : end + 1])
    except json.JSONDecodeError:
        return None
def normalize_followup_command(item: Dict[str, Any]) -> Dict[str, Any]:
    """Map an LLM-proposed command dict onto the internal command-spec shape."""
    os_hint = (item.get("os") or item.get("platform") or "").lower()
    return {
        "summary": item.get("summary") or item.get("name") or "Follow-up command",
        "shell": item.get("command") or item.get("shell"),
        "runner": item.get("runner"),
        "host": item.get("host") or item.get("target"),
        "ssh_user": item.get("ssh_user"),
        # Empty string collapses to None so OS filtering treats it as "any".
        "os": os_hint or None,
    }
def investigate_with_langchain(
    entry: Dict[str, Any],
    alert: Dict[str, Any],
    parent_payload: Dict[str, Any],
    context: Dict[str, Any],
    initial_outputs: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
    """Run the iterative LLM investigation loop for one alert.

    Builds the initial prompt from the runbook entry plus any triage command
    output, then alternates LLM calls with execution of the follow-up commands
    the model requests via its JSON protocol (analysis / followup_commands /
    complete), bounded by TRIAGE_MAX_ITERATIONS rounds and an overall
    TRIAGE_FOLLOWUP_MAX_COMMANDS command budget. Returns (final summary text,
    all command results including follow-ups).
    """
    command_outputs = list(initial_outputs)
    prompt = build_prompt(entry, alert, parent_payload, context, command_outputs)
    log_verbose("Initial investigation prompt", prompt)
    if not OPENROUTER_API_KEY:
        return "OPENROUTER_API_KEY is not configured; unable to analyze alert.", command_outputs
    llm = OpenRouterLLM(api_key=OPENROUTER_API_KEY, model_name=OPENROUTER_MODEL)
    # The whole conversation is accumulated into one growing prompt string.
    dialogue = (
        prompt
        + "\n\nRespond with JSON containing fields analysis, followup_commands, and complete. "
        "Request commands only when more evidence is required."
    )
    total_followup = 0  # follow-up commands executed across all iterations
    final_summary = ""
    for iteration in range(TRIAGE_MAX_ITERATIONS):
        log_verbose(f"LLM dialogue iteration {iteration + 1}", dialogue)
        llm_text = llm(dialogue)
        log_verbose(f"LLM iteration {iteration + 1} output", llm_text)
        dialogue += f"\nAssistant:\n{llm_text}\n"
        parsed = parse_structured_response(llm_text)
        if parsed:
            log_verbose(f"LLM iteration {iteration + 1} parsed response", parsed)
        if not parsed:
            # Unparseable reply: keep the raw text as the summary and stop.
            final_summary = llm_text
            break
        analysis = parsed.get("analysis") or ""
        followups = parsed.get("followup_commands") or parsed.get("commands") or []
        final_summary = analysis
        complete_flag = bool(parsed.get("complete"))
        if complete_flag or not followups:
            break
        log_verbose(f"LLM iteration {iteration + 1} requested follow-ups", followups)
        allowed = max(0, TRIAGE_FOLLOWUP_MAX_COMMANDS - total_followup)
        if not TRIAGE_ENABLE_COMMANDS or allowed <= 0:
            # Can't run anything: tell the model to wrap up instead.
            dialogue += (
                "\nUser:\nCommand execution is disabled or budget exhausted. Provide final analysis with JSON format.\n"
            )
            continue
        # Validate and normalize the model's requested commands.
        normalized_cmds: List[Dict[str, Any]] = []
        for raw in followups:
            if not isinstance(raw, dict):
                continue
            normalized = normalize_followup_command(raw)
            if not normalized.get("shell"):
                continue
            cmd_os = normalized.get("os")
            # Drop commands targeting a different OS than the alerting host.
            if cmd_os and cmd_os != context.get("host_os"):
                continue
            normalized_cmds.append(normalized)
        log_verbose(f"Normalized follow-up commands (iteration {iteration + 1})", normalized_cmds)
        if not normalized_cmds:
            dialogue += "\nUser:\nNo valid commands to run. Finalize analysis in JSON format.\n"
            continue
        normalized_cmds = normalized_cmds[:allowed]
        executed_batch: List[Dict[str, Any]] = []
        for spec in normalized_cmds:
            executed = run_command_spec(spec, context)
            command_outputs.append(executed)
            executed_batch.append(executed)
            total_followup += 1
        result_text = "Follow-up command results:\n" + format_command_results_for_llm(executed_batch)
        dialogue += (
            "\nUser:\n"
            + result_text
            + "\nUpdate your analysis and respond with JSON (analysis, followup_commands, complete).\n"
        )
        log_verbose("Executed follow-up commands", result_text)
    else:
        # for/else: the loop exhausted all iterations without a break.
        final_summary = final_summary or "Reached maximum iterations without a conclusive response."
    if not final_summary:
        final_summary = "LLM did not return a valid analysis."
    log_verbose("Final LLM summary", final_summary)
    return final_summary, command_outputs
def build_context(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Extract template variables (host, value, URLs, ...) from the alert payload."""
    labels = alert.get("labels", {})
    annotations = alert.get("annotations", {})
    context: Dict[str, Any] = {}
    context["alertname"] = (
        labels.get("alertname")
        or alert.get("title")
        or parent_payload.get("title")
        or parent_payload.get("ruleName")
    )
    context["host"] = labels.get("host") or labels.get("instance")
    context["iface"] = labels.get("interface")
    context["device"] = labels.get("device")
    context["vmid"] = labels.get("vmid")
    context["status"] = alert.get("status") or parent_payload.get("status")
    context["value"] = alert.get("value") or annotations.get("value")
    context["rule_url"] = alert.get("ruleUrl") or parent_payload.get("ruleUrl")
    # Default SSH user; apply_inventory_context may override it later.
    context.setdefault("ssh_user", TRIAGE_SSH_USER)
    return context
def build_prompt(
    entry: Dict[str, Any],
    alert: Dict[str, Any],
    parent_payload: Dict[str, Any],
    context: Dict[str, Any],
    command_outputs: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """Assemble the user prompt for the LLM from the runbook entry and alert.

    Sections, in order: the rendered runbook llm_prompt, a payload summary,
    then (when present) evidence-to-collect, suggested manual checks,
    remediation ideas, and executed command results.
    """
    template = entry.get("llm_prompt", "Alert {{ alertname }} fired for {{ host }}.")
    # None values render as empty strings rather than the literal "None".
    rendered_template = render_template(template, {k: v or "" for k, v in context.items()})
    evidence = entry.get("evidence_to_collect", [])
    triage_steps = entry.get("triage", [])
    remediation = entry.get("remediation", [])
    lines = [
        rendered_template.strip(),
        "",
        "Alert payload summary:",
        f"- Status: {context.get('status') or alert.get('status')}",
        f"- Host: {context.get('host')}",
        f"- Value: {context.get('value')}",
        f"- StartsAt: {alert.get('startsAt')}",
        f"- EndsAt: {alert.get('endsAt')}",
        f"- RuleURL: {context.get('rule_url')}",
        f"- Host OS (inferred): {context.get('host_os')}",
        "- Note: All timestamps are UTC/RFC3339 as provided by Grafana.",
        summarize_dict("- Labels", alert.get("labels")),
        summarize_dict("- Annotations", alert.get("annotations")),
    ]
    if evidence:
        lines.append("")
        lines.append("Evidence to gather (for automation reference):")
        for item in evidence:
            lines.append(f"- {item}")
    if triage_steps:
        lines.append("")
        lines.append("Suggested manual checks:")
        for step in triage_steps:
            summary = step.get("summary")
            linux = step.get("linux")
            windows = step.get("windows")
            lines.append(f"- {summary}")
            if linux:
                lines.append(f" Linux: {linux}")
            if windows:
                lines.append(f" Windows: {windows}")
    if remediation:
        lines.append("")
        lines.append("Remediation ideas:")
        for item in remediation:
            lines.append(f"- {item}")
    if command_outputs:
        lines.append("")
        lines.append("Command execution results:")
        for result in command_outputs:
            status = result.get("status", "unknown")
            cmd_display = result.get("command", "")
            lines.append(f"- {result.get('summary')} [{status}] {cmd_display}")
            stdout = result.get("stdout")
            stderr = result.get("stderr")
            error = result.get("error")
            if stdout:
                lines.append(" stdout:")
                lines.append(indent(truncate_text(stdout), " "))
            if stderr:
                lines.append(" stderr:")
                lines.append(indent(truncate_text(stderr), " "))
            # Only surface the error field when the command did not succeed.
            if error and status != "ok":
                lines.append(f" error: {error}")
    return "\n".join(lines).strip()
def get_alerts(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the payload's non-empty `alerts` list, or the payload as one alert."""
    alerts = payload.get("alerts")
    return alerts if isinstance(alerts, list) and alerts else [payload]
@app.on_event("startup")
def startup_event() -> None:
    """Load the runbook and Ansible inventory once when the server boots."""
    global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
    _RUNBOOK_INDEX = load_runbook()
    _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory()
    LOGGER.info(
        "Alert webhook server ready with %d runbook entries and %d inventory hosts.",
        len(_RUNBOOK_INDEX),
        len(_INVENTORY_INDEX),
    )
@app.post("/alerts")
async def handle_alert(request: Request) -> Dict[str, Any]:
    """Process a Grafana webhook: triage each firing alert and summarize via LLM.

    For every alert in the payload: skip non-firing alerts, resolve (or derive)
    a rule UID, look up (or synthesize) a runbook entry, run triage commands,
    drive the LLM investigation loop, and optionally email the summary.
    Returns processed results plus bookkeeping for unmatched/skipped alerts.
    Raises HTTP 502 when the LLM investigation itself fails.
    """
    payload = await request.json()
    LOGGER.info("Received Grafana payload: %s", json.dumps(payload, indent=2, sort_keys=True))
    results = []
    unmatched = []
    for alert in get_alerts(payload):
        LOGGER.info("Processing alert: %s", json.dumps(alert, indent=2, sort_keys=True))
        unmatched_reason: Optional[str] = None
        alert_status = str(alert.get("status") or payload.get("status") or "").lower()
        # Only "firing" alerts are triaged; resolved/pending ones are recorded
        # in `unmatched` and skipped.
        if alert_status and alert_status != "firing":
            details = {"reason": "non_firing_status", "status": alert_status, "alert": alert}
            unmatched.append(details)
            LOGGER.info("Skipping alert with status=%s (only 'firing' alerts are processed).", alert_status)
            continue
        rule_uid = extract_rule_uid(alert, payload)
        if not rule_uid:
            # No UID anywhere in the payload: derive a deterministic stand-in
            # so the alert can still be processed (and note it as unmatched).
            unmatched_reason = "missing_rule_uid"
            derived_uid = derive_fallback_rule_uid(alert, payload)
            details = {"reason": unmatched_reason, "derived_rule_uid": derived_uid, "alert": alert}
            unmatched.append(details)
            LOGGER.warning("Alert missing rule UID, using fallback identifier %s", derived_uid)
            rule_uid = derived_uid
        entry = _RUNBOOK_INDEX.get(rule_uid)
        runbook_matched = entry is not None
        if not entry:
            # No dedicated runbook entry: fall back to a generic one so the
            # LLM can still reason from the raw payload.
            unmatched_reason = unmatched_reason or "no_runbook_entry"
            details = {"reason": unmatched_reason, "rule_uid": rule_uid, "alert": alert}
            unmatched.append(details)
            LOGGER.warning("No runbook entry for rule_uid=%s, using generic fallback.", rule_uid)
            entry = build_fallback_runbook_entry(alert, payload)
        context = build_context(alert, payload)
        context["host_os"] = determine_host_os(alert)
        context["rule_uid"] = rule_uid
        apply_inventory_context(context)
        initial_outputs = execute_triage_commands(entry, alert, context)
        try:
            llm_text, command_outputs = investigate_with_langchain(entry, alert, payload, context, initial_outputs)
        except Exception as exc:  # pylint: disable=broad-except
            LOGGER.exception("Investigation failed for rule_uid=%s: %s", rule_uid, exc)
            raise HTTPException(status_code=502, detail=f"LLM investigation error: {exc}") from exc
        result = {
            "rule_uid": rule_uid,
            "alertname": entry.get("name"),
            "host": alert.get("labels", {}).get("host"),
            "llm_summary": llm_text,
            "command_results": command_outputs,
            "runbook_matched": runbook_matched,
        }
        if not runbook_matched and unmatched_reason:
            result["fallback_reason"] = unmatched_reason
        results.append(result)
        # Best-effort notification; send_summary_email never raises.
        send_summary_email(alert, result, context)
    return {"processed": len(results), "results": results, "unmatched": unmatched}
@app.post("/reload-runbook")
def reload_runbook() -> Dict[str, Any]:
    """Re-read the runbook YAML and Ansible inventory without a restart."""
    global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
    _RUNBOOK_INDEX = load_runbook()
    _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory()
    return {
        "entries": len(_RUNBOOK_INDEX),
        "inventory_hosts": len(_INVENTORY_INDEX),
    }