diff --git a/stacks/meshmon/.env.template b/stacks/meshmon/.env.template new file mode 100644 index 0000000..b41af74 --- /dev/null +++ b/stacks/meshmon/.env.template @@ -0,0 +1,8 @@ +MESHTASTIC_NODE_IP=192.168.5.242 +ALLOWED_ORIGINS=http://docker-dev:8383,https://meshmon.ghost.tel +RATE_LIMIT_API=20000 +NODE_ENV=production +SESSION_SECRET=change-me +TRUST_PROXY=true +COOKIE_SECURE=true +DISABLE_ANONYMOUS=true diff --git a/stacks/meshmon/docker-compose.yml b/stacks/meshmon/docker-compose.yml new file mode 100644 index 0000000..e0791cf --- /dev/null +++ b/stacks/meshmon/docker-compose.yml @@ -0,0 +1,14 @@ +services: + meshmonitor: + image: ghcr.io/yeraze/meshmonitor:latest + container_name: meshmonitor + ports: + - "8383:3001" + restart: unless-stopped + volumes: + - meshmonitor-data:/data + env_file: + - .env +volumes: + meshmonitor-data: + driver: local diff --git a/stacks/meshtastic-web/docker-compose.yml b/stacks/meshtastic-web/docker-compose.yml new file mode 100644 index 0000000..e3062e7 --- /dev/null +++ b/stacks/meshtastic-web/docker-compose.yml @@ -0,0 +1,7 @@ +services: + meshtastic-web: + image: ghcr.io/meshtastic/web:latest + container_name: meshtastic-web + restart: unless-stopped + ports: + - "8585:8080" diff --git a/stacks/mllogwatcher/.dockerignore b/stacks/mllogwatcher/.dockerignore new file mode 100644 index 0000000..431e681 --- /dev/null +++ b/stacks/mllogwatcher/.dockerignore @@ -0,0 +1,7 @@ +.venv +__pycache__/ +*.pyc +.git +.gitignore +.env +tmp/ diff --git a/stacks/mllogwatcher/.env.example b/stacks/mllogwatcher/.env.example new file mode 100644 index 0000000..cad2c02 --- /dev/null +++ b/stacks/mllogwatcher/.env.example @@ -0,0 +1,14 @@ +OPENROUTER_API_KEY= +OPENROUTER_MODEL=openai/gpt-5.2-codex-max +TRIAGE_ENABLE_COMMANDS=1 +TRIAGE_COMMAND_RUNNER=local +TRIAGE_VERBOSE_LOGS=1 +TRIAGE_EMAIL_ENABLED=1 +TRIAGE_EMAIL_FROM=alertai@example.com +TRIAGE_EMAIL_TO=admin@example.com +TRIAGE_SMTP_HOST=smtp.example.com +TRIAGE_SMTP_PORT=465 +TRIAGE_SMTP_USER=alertai@example.com +TRIAGE_SMTP_PASSWORD= +TRIAGE_SMTP_SSL=1 +TRIAGE_SMTP_STARTTLS=0 diff --git a/stacks/mllogwatcher/.env.template b/stacks/mllogwatcher/.env.template new file mode 100644 index 0000000..cad2c02 --- /dev/null +++ b/stacks/mllogwatcher/.env.template @@ -0,0 +1,14 @@ +OPENROUTER_API_KEY= +OPENROUTER_MODEL=openai/gpt-5.2-codex-max +TRIAGE_ENABLE_COMMANDS=1 +TRIAGE_COMMAND_RUNNER=local +TRIAGE_VERBOSE_LOGS=1 +TRIAGE_EMAIL_ENABLED=1 +TRIAGE_EMAIL_FROM=alertai@example.com +TRIAGE_EMAIL_TO=admin@example.com +TRIAGE_SMTP_HOST=smtp.example.com +TRIAGE_SMTP_PORT=465 +TRIAGE_SMTP_USER=alertai@example.com +TRIAGE_SMTP_PASSWORD= +TRIAGE_SMTP_SSL=1 +TRIAGE_SMTP_STARTTLS=0 diff --git a/stacks/mllogwatcher/Dockerfile b/stacks/mllogwatcher/Dockerfile new file mode 100644 index 0000000..a4775ff --- /dev/null +++ b/stacks/mllogwatcher/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /var/core/mlLogWatcher + +RUN apt-get update && \ + apt-get install -y --no-install-recommends openssh-client && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . 
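+# requirements.txt is copied and installed before the application code below, so this pip layer is cached on code-only rebuilds.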
+RUN pip install --no-cache-dir -r requirements.txt + +COPY alert_runbook.yaml ./alert_runbook.yaml +COPY scripts ./scripts + +EXPOSE 8081 + +CMD ["uvicorn", "scripts.grafana_alert_webhook:app", "--host", "0.0.0.0", "--port", "8081"] diff --git a/stacks/mllogwatcher/README.md b/stacks/mllogwatcher/README.md new file mode 100755 index 0000000..e6fed7d --- /dev/null +++ b/stacks/mllogwatcher/README.md @@ -0,0 +1,120 @@ +# ML Log Watcher Utilities + +This repository now contains two automation entry points that work together to +triage Elasticsearch logs and Grafana alerts with the help of OpenRouter-hosted +language models. + +## 1. `scripts/log_monitor.py` + +Existing script that queries Elasticsearch indices, pulls a recent window of +logs, and asks an LLM for anomaly highlights. Run it ad-hoc or schedule via +cron/systemd. + +``` +ELASTIC_HOST=https://casper.localdomain:9200 \ +ELASTIC_API_KEY=... \ +OPENROUTER_API_KEY=... \ +python3 scripts/log_monitor.py --index 'log*' --minutes 30 +``` + +## 2. `scripts/grafana_alert_webhook.py` + +A FastAPI web server that accepts Grafana alert webhooks, finds the matching +entry in `alert_runbook.yaml`, renders the LLM prompt, and posts it to +OpenRouter. The response text is returned to Grafana (or any caller) immediately +so automation can fan out to chat, ticketing, etc. + +### Dependencies + +``` +python3 -m venv .venv +.venv/bin/pip install fastapi uvicorn pyyaml requests langchain +``` + +### Environment + +- `OPENROUTER_API_KEY` – required. +- `OPENROUTER_MODEL` – optional (default `openai/gpt-4o-mini`). +- `RUNBOOK_PATH` – optional (default `alert_runbook.yaml` in repo root). +- `ANSIBLE_HOSTS_PATH` – optional (default `/etc/ansible/hosts`). When set, the webhook auto-loads the Ansible inventory so alerts targeting known hosts inherit their SSH user/port/key information. +- `OPENROUTER_REFERER` / `OPENROUTER_TITLE` – forwarded headers if needed. +- `TRIAGE_ENABLE_COMMANDS` – set to `1` to let the webhook execute runbook commands (default `0` keeps it in read-only mode). +- `TRIAGE_COMMAND_RUNNER` – `ssh` (default) or `local`. When using ssh, also set `TRIAGE_SSH_USER` and optional `TRIAGE_SSH_OPTIONS`. +- `TRIAGE_COMMAND_TIMEOUT`, `TRIAGE_MAX_COMMANDS`, `TRIAGE_OUTPUT_LIMIT`, `TRIAGE_DEFAULT_OS` – tune execution behavior. +- `TRIAGE_VERBOSE_LOGS` – set to `1` to stream the entire LLM dialogue, prompts, and command outputs to the webhook logs for debugging. +- `TRIAGE_EMAIL_ENABLED` – when `1`, the webhook emails the final LLM summary per alert. Requires `TRIAGE_EMAIL_FROM`, `TRIAGE_EMAIL_TO` (comma-separated), `TRIAGE_SMTP_HOST`, and optional `TRIAGE_SMTP_PORT`, `TRIAGE_SMTP_USER`, `TRIAGE_SMTP_PASSWORD`, `TRIAGE_SMTP_STARTTLS`, `TRIAGE_SMTP_SSL`. + +### Running + +``` +source .venv/bin/activate +export OPENROUTER_API_KEY=... +uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081 +``` + +The server loads the runbook at startup and exposes: + +- `POST /alerts` – Grafana webhook target. +- `POST /reload-runbook` – force runbook reload without restarting. + +When `TRIAGE_ENABLE_COMMANDS=1`, the server executes the relevant triage commands +for each alert (via SSH or locally), captures stdout/stderr, and appends the +results to both the OpenRouter prompt and the HTTP response JSON. This lets you +automate evidence gathering directly from the runbook instructions. Use +environment variables to control which user/host the commands target and to +limit timeouts/output size. 
LangChain powers the multi-turn investigation flow: +the LLM can call the provided tools (`run_local_command`, `run_ssh_command`) to +gather additional evidence until it’s ready to deliver a final summary. +When `/etc/ansible/hosts` (or `ANSIBLE_HOSTS_PATH`) is available the server +automatically enriches the alert context with SSH metadata (user, host, port, +identity file, and common args) so runbook commands default to using SSH against +the alerting host instead of the webhook server. + +### Running with Docker Compose + +1. Copy `.env.example` to `.env` and fill in your OpenRouter key, email SMTP + settings, and other toggles. +2. Place any SSH keys the webhook needs inside `./.ssh/` (the compose file + mounts this directory read-only inside the container). +3. Run `docker compose up -d` to build and launch the webhook. It listens on + port `8081` by default and uses the mounted `alert_runbook.yaml` plus the + host `/etc/ansible/hosts`. +4. Use `docker compose logs -f` to watch verbose LangChain output or restart + with `docker compose restart` when updating the code/runbook. + +### Sample payload + +``` +curl -X POST http://localhost:8081/alerts \ + -H 'Content-Type: application/json' \ + -d '{ + "status":"firing", + "ruleUid":"edkmsdmlay2o0c", + "ruleUrl":"http://casper:3000/alerting/grafana/edkmsdmlay2o0c/view", + "alerts":[ + { + "status":"firing", + "labels":{ + "alertname":"High Mem.", + "host":"unit-02", + "rule_uid":"edkmsdmlay2o0c" + }, + "annotations":{ + "summary":"Memory usage above 95% for 10m", + "value":"96.2%" + }, + "startsAt":"2025-09-22T17:20:00Z", + "endsAt":"0001-01-01T00:00:00Z" + } + ] + }' +``` + +With a valid OpenRouter key this returns a JSON body containing the LLM summary +per alert plus any unmatched alerts (missing runbook entries or rule UIDs). + +### Testing without OpenRouter + +Set `OPENROUTER_API_KEY=dummy` and point the DNS entry to a mock (e.g. mitmproxy) +if you need to capture outbound requests. Otherwise, hits will fail fast with +HTTP 502 so Grafana knows the automation need to be retried. diff --git a/stacks/mllogwatcher/alert_runbook.yaml b/stacks/mllogwatcher/alert_runbook.yaml new file mode 100755 index 0000000..20d96e5 --- /dev/null +++ b/stacks/mllogwatcher/alert_runbook.yaml @@ -0,0 +1,254 @@ +# Grafana alert triage playbook for the HomeLab telemetry stack. +# Each entry contains the alert metadata, what the signal means, +# the evidence to capture automatically, and the manual / scripted steps. +metadata: + generated: "2025-09-22T00:00:00Z" + grafana_url: "http://casper:3000" + datasource: "InfluxDB telegraf (uid=P951FEA4DE68E13C5)" + llm_provider: "OpenRouter" +alerts: + - name: "Data Stale" + rule_uid: "fdk9orif6fytcf" + description: "No CPU usage_user metrics have arrived for non-unit hosts within 5 minutes." + signal: + metric: "cpu.usage_user" + condition: "count(host samples over 5m) < 1" + impact: "Host is no longer reporting to Telegraf/Influx -> monitoring blind spot." 
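+    # Note: evidence_to_collect and remediation are prompt context only; the webhook executes the per-OS commands listed under triage when TRIAGE_ENABLE_COMMANDS=1.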
+ evidence_to_collect: + - "Influx: `from(bucket:\"telegraf\") |> range(start:-10m) |> filter(fn:(r)=>r._measurement==\"cpu\" and r.host==\"{{ host }}\") |> count()`" + - "Telegraf log tail" + - "System journal for network/auth errors" + triage: + - summary: "Verify Telegraf agent health" + linux: "sudo systemctl status telegraf && sudo journalctl -u telegraf -n 100" + windows: "Get-Service telegraf; Get-Content 'C:\\Program Files\\telegraf\\telegraf.log' -Tail 100" + - summary: "Check connectivity from host to Influx (`casper:8086`)" + linux: "curl -sSf http://casper:8086/ping" + windows: "Invoke-WebRequest -UseBasicParsing http://casper:8086/ping" + - summary: "Confirm host clock drift <5s (important for Influx line protocol timestamps)" + linux: "chronyc tracking" + windows: "w32tm /query /status" + remediation: + - "Restart Telegraf after config validation: `sudo telegraf --test --config /etc/telegraf/telegraf.conf` then `sudo systemctl restart telegraf`." + - "Re-apply Ansible telemetry playbook if multiple hosts fail." + llm_prompt: > + Alert {{ alertname }} fired for {{ host }}. Telegraf stopped sending cpu.usage_user metrics. Given the collected logs and command output, identify root causes (agent down, auth failures, firewall, time skew) and list the next action. + + - name: "High CPU" + rule_uid: "fdkms407ubmdcc" + description: "Mean CPU usage_system over the last 10 minutes exceeds 85%." + signal: + metric: "cpu.usage_system" + condition: "mean over 10m > 85%" + impact: "Host is near saturation; scheduler latency and queueing likely." + evidence_to_collect: + - "Top CPU processes snapshot (Linux: `ps -eo pid,cmd,%cpu --sort=-%cpu | head -n 15`; Windows: `Get-Process | Sort-Object CPU -Descending | Select -First 15`)" + - "Load vs CPU core count" + - "Recent deploys / cron jobs metadata" + triage: + - summary: "Confirm sustained CPU pressure" + linux: "uptime && mpstat 1 5" + windows: "typeperf \"\\Processor(_Total)\\% Processor Time\" -sc 15" + - summary: "Check offending processes/services" + linux: "sudo ps -eo pid,user,comm,%cpu,%mem --sort=-%cpu | head" + windows: "Get-Process | Sort-Object CPU -Descending | Select -First 10 Name,CPU" + - summary: "Inspect cgroup / VM constraints if on Proxmox" + linux: "sudo pct status {{ vmid }} && sudo pct config {{ vmid }}" + remediation: + - "Throttle or restart runaway service; scale workload or tune limits." + - "Consider moving noisy neighbors off shared hypervisor." + llm_prompt: > + High CPU alert for {{ host }}. Review process table, recent deploys, and virtualization context; determine why cpu.usage_system stayed above 85% and recommend mitigation. + + - name: "High Mem." + rule_uid: "edkmsdmlay2o0c" + description: "Mean memory used_percent over 10 minutes > 95% (excluding hosts jhci/nerv*/magi*)." + signal: + metric: "mem.used_percent" + condition: "mean over 10m > 95%" + impact: "OOM risk and swap thrash." 
+ evidence_to_collect: + - "Free/available memory snapshot" + - "Top consumers (Linux: `sudo smem -rt rss | head`; Windows: `Get-Process | Sort-Object WorkingSet -Descending`)" + - "Swap in/out metrics" + triage: + - summary: "Validate actual memory pressure" + linux: "free -m && vmstat -SM 5 5" + windows: "Get-Counter '\\Memory\\Available MBytes'" + - summary: "Identify leaking services" + linux: "sudo ps -eo pid,user,comm,%mem,rss --sort=-%mem | head" + windows: "Get-Process | Sort-Object WS -Descending | Select -First 10 ProcessName,WS" + - summary: "Check recent kernel/OOM logs" + linux: "sudo dmesg | tail -n 50" + windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Message -match 'memory' }" + remediation: + - "Restart or reconfigure offender; add swap as stop-gap; increase VM memory allocation." + llm_prompt: > + High Mem alert for {{ host }}. After reviewing free memory, swap activity, and top processes, explain the likely cause and propose remediation steps with priority. + + - name: "High Disk IO" + rule_uid: "bdkmtaru7ru2od" + description: "Mean merged_reads/writes per second converted to GB/s exceeds 10." + signal: + metric: "diskio.merged_reads + merged_writes" + condition: "mean over 10m > 10 GB/s" + impact: "Storage controller saturated; latency spikes, possible backlog." + evidence_to_collect: + - "iostat extended output" + - "Process level IO (pidstat/nethogs equivalent)" + - "ZFS/MDADM status for relevant pools" + triage: + - summary: "Inspect device queues" + linux: "iostat -xzd 5 3" + windows: "Get-WmiObject -Class Win32_PerfFormattedData_PerfDisk_LogicalDisk | Format-Table Name,DiskWritesPersec,DiskReadsPersec,AvgDisksecPerTransfer" + - summary: "Correlate to filesystem / VM" + linux: "sudo lsof +D /mnt/critical -u {{ user }}" + - summary: "Check backup or replication windows" + linux: "journalctl -u pvebackup -n 50" + remediation: + - "Pause heavy jobs, move backups off-peak, evaluate faster storage tiers." + llm_prompt: > + High Disk IO on {{ host }}. With iostat/pidstat output provided, decide whether activity is expected (backup, scrub) or abnormal and list mitigations. + + - name: "Low Uptime" + rule_uid: "ddkmuadxvkm4ge" + description: "System uptime converted to minutes is below 10 -> host rebooted recently." + signal: + metric: "system.uptime" + condition: "last uptime_minutes < 10" + impact: "Unexpected reboot or crash; may need RCA." + evidence_to_collect: + - "Boot reason logs" + - "Last patch/maintenance window from Ansible inventory" + - "Smart log excerpt for power events" + triage: + - summary: "Confirm uptime and reason" + linux: "uptime && last -x | head" + windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Id -in 41,6006,6008 }" + - summary: "Check kernel panic or watchdog traces" + linux: "sudo journalctl -k -b -1 | tail -n 200" + - summary: "Validate patch automation logs" + linux: "sudo tail -n 100 /var/log/ansible-pull.log" + remediation: + - "Schedule deeper diagnostics if crash; reschedule workloads once stable." + llm_prompt: > + Low Uptime alert: host restarted within 10 minutes. Inspect boot reason logs and recommend whether this is maintenance or a fault needing follow-up. + + - name: "High Load" + rule_uid: "ddkmul9x8gcn4d" + description: "system.load5 > 6 for 5 minutes." + signal: + metric: "system.load5" + condition: "last value > 6" + impact: "Runnable queue more than CPU threads -> latency growth." 
+ evidence_to_collect: + - "Load vs CPU count (`nproc`)" + - "Process states (D/R blocked tasks)" + - "IO wait percentage" + triage: + - summary: "Correlate load to CPU and IO" + linux: "uptime && vmstat 1 5" + - summary: "Identify stuck IO" + linux: "sudo pidstat -d 1 5" + - summary: "Check Proxmox scheduler for resource contention" + linux: "pveperf && qm list" + remediation: + - "Reduce cron concurrency, add CPU, or fix IO bottleneck causing runnable queue growth." + llm_prompt: > + High Load alert on {{ host }}. Based on vmstat/pidstat output, explain whether CPU saturation, IO wait, or runnable pile-up is at fault and propose actions. + + - name: "High Network Traffic (Download)" + rule_uid: "cdkpct82a7g8wd" + description: "Derivative of bytes_recv > 50 MB/s on any interface over last hour." + signal: + metric: "net.bytes_recv" + condition: "mean download throughput > 50 MB/s" + impact: "Link saturation, potential DDoS or backup window." + evidence_to_collect: + - "Interface counters (Linux: `ip -s link show {{ iface }}`; Windows: `Get-NetAdapterStatistics`)" + - "Top talkers (Linux: `sudo nethogs {{ iface }}` or `iftop -i {{ iface }}`)" + - "Firewall/IDS logs" + triage: + - summary: "Confirm interface experiencing spike" + linux: "sar -n DEV 1 5 | grep {{ iface }}" + windows: "Get-Counter -Counter '\\Network Interface({{ iface }})\\Bytes Received/sec' -Continuous -SampleInterval 1 -MaxSamples 5" + - summary: "Identify process or remote peer" + linux: "sudo ss -ntu state established | sort -k4" + windows: "Get-NetTCPConnection | Sort-Object -Property LocalPort" + remediation: + - "Throttle offending transfers, move backup replication, verify no compromised service." + llm_prompt: > + High download throughput on {{ host }} interface {{ iface }}. Review interface counters and connection list to determine if traffic is expected and advise throttling or blocking steps. + + - name: "High Network Traffic (Upload)" + rule_uid: "aec650pbtvzswa" + description: "Derivative of bytes_sent > 30 MB/s for an interface." + signal: + metric: "net.bytes_sent" + condition: "mean upload throughput > 30 MB/s" + impact: "Excess upstream usage; may saturate ISP uplink." + evidence_to_collect: + - "Interface statistics" + - "NetFlow sample if available (`/var/log/telegraf/netflow.log`)" + - "List of active transfers" + triage: + - summary: "Measure upload curve" + linux: "bmon -p {{ iface }} -o ascii" + windows: "Get-Counter '\\Network Interface({{ iface }})\\Bytes Sent/sec' -Continuous -SampleInterval 1 -MaxSamples 5" + - summary: "Find process generating traffic" + linux: "sudo iftop -i {{ iface }} -t -s 30" + windows: "Get-NetAdapterStatistics -Name {{ iface }}" + remediation: + - "Pause replication jobs, confirm backups not stuck, search for data exfiltration." + llm_prompt: > + High upload alert for {{ host }} interface {{ iface }}. Using captured traffic samples, determine whether replication/backup explains the pattern or if anomalous traffic needs blocking. + + - name: "High Disk Usage" + rule_uid: "cdma6i5k2gem8d" + description: "Disk used_percent >= 95% for Linux devices (filters out unwanted devices)." + signal: + metric: "disk.used_percent" + condition: "last value > 95%" + impact: "Filesystem full -> service crashes or write failures." 
+ evidence_to_collect: + - "`df -h` or `Get-Volume` output for device" + - "Largest directories snapshot (Linux: `sudo du -xhd1 /path`; Windows: `Get-ChildItem | Sort Length`)" + - "Recent deploy or backup expansion logs" + triage: + - summary: "Validate usage" + linux: "df -h {{ mountpoint }}" + windows: "Get-Volume -FileSystemLabel {{ volume }}" + - summary: "Identify growth trend" + linux: "sudo journalctl -u telegraf -g 'disk usage' -n 20" + - summary: "Check for stale docker volumes" + linux: "docker system df && docker volume ls" + remediation: + - "Prune temp artifacts, expand disk/VM, move logs to remote storage." + llm_prompt: > + High Disk Usage alert on {{ host }} device {{ device }}. Summarize what consumed the space and recommend reclaim or expansion actions with priority. + + - name: "CPU Heartbeat" + rule_uid: "eec62gqn3oetcf" + description: "Counts cpu.usage_system samples per host; fires if <1 sample arrives within window." + signal: + metric: "cpu.usage_system" + condition: "sample count within 10m < 1" + impact: "Indicates host stopped reporting metrics entirely (telemetry silent)." + evidence_to_collect: + - "Influx query for recent cpu samples" + - "Telegraf service and logs" + - "Network reachability from host to casper" + triage: + - summary: "Check host alive and reachable" + linux: "ping -c 3 {{ host }} && ssh {{ host }} uptime" + windows: "Test-Connection {{ host }} -Count 3" + - summary: "Inspect Telegraf state" + linux: "sudo systemctl status telegraf && sudo tail -n 100 /var/log/telegraf/telegraf.log" + windows: "Get-Service telegraf; Get-EventLog -LogName Application -Newest 50 | ? { $_.Source -match 'Telegraf' }" + - summary: "Validate API key / Influx auth" + linux: "sudo grep -n 'outputs.influxdb' -n /etc/telegraf/telegraf.conf" + remediation: + - "Re-issue Telegraf credentials, run `ansible-playbook telemetry.yml -l {{ host }}`." + - "If host intentionally offline, silence alert via Grafana maintenance window." + llm_prompt: > + CPU Heartbeat for {{ host }} indicates telemetry silent. Use connectivity tests and Telegraf logs to determine if host is down or just metrics disabled; propose fixes. diff --git a/stacks/mllogwatcher/docker-compose.yml b/stacks/mllogwatcher/docker-compose.yml new file mode 100644 index 0000000..a3b42fe --- /dev/null +++ b/stacks/mllogwatcher/docker-compose.yml @@ -0,0 +1,14 @@ +version: "3.9" + +services: + grafana-alert-webhook: + build: . + env_file: + - .env + ports: + - "8081:8081" + volumes: + - ./alert_runbook.yaml:/var/core/mlLogWatcher/alert_runbook.yaml:ro + - /etc/ansible/hosts:/etc/ansible/hosts:ro + - ./.ssh:/var/core/mlLogWatcher/.ssh:ro + restart: unless-stopped diff --git a/stacks/mllogwatcher/requirements.txt b/stacks/mllogwatcher/requirements.txt new file mode 100644 index 0000000..54e3b88 --- /dev/null +++ b/stacks/mllogwatcher/requirements.txt @@ -0,0 +1,5 @@ +fastapi==0.115.5 +uvicorn[standard]==0.32.0 +pyyaml==6.0.2 +requests==2.32.3 +langchain==0.2.15 diff --git a/stacks/mllogwatcher/scripts/grafana_alert_webhook.py b/stacks/mllogwatcher/scripts/grafana_alert_webhook.py new file mode 100755 index 0000000..007e27a --- /dev/null +++ b/stacks/mllogwatcher/scripts/grafana_alert_webhook.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +""" +Minimal FastAPI web server that accepts Grafana alert webhooks, looks up the +matching runbook entry, builds an LLM prompt, and calls OpenRouter to return a +triage summary. 
+ +Run with: + uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081 + +Environment variables: + RUNBOOK_PATH Path to alert_runbook.yaml (default: ./alert_runbook.yaml) + OPENROUTER_API_KEY Required; API token for https://openrouter.ai + OPENROUTER_MODEL Optional; default openai/gpt-4o-mini + OPENROUTER_REFERER Optional referer header + OPENROUTER_TITLE Optional title header (default: Grafana Alert Webhook) +""" + +from __future__ import annotations + +import logging +import os +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +import json +import shlex +import subprocess +from textwrap import indent +import smtplib +from email.message import EmailMessage + +import requests +import yaml +from fastapi import FastAPI, HTTPException, Request +from langchain.llms.base import LLM + +LOGGER = logging.getLogger("grafana_webhook") +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + +RUNBOOK_PATH = Path(os.environ.get("RUNBOOK_PATH", "alert_runbook.yaml")) +ANSIBLE_HOSTS_PATH = Path(os.environ.get("ANSIBLE_HOSTS_PATH", "/etc/ansible/hosts")) +OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") +OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini") +OPENROUTER_REFERER = os.environ.get("OPENROUTER_REFERER") +OPENROUTER_TITLE = os.environ.get("OPENROUTER_TITLE", "Grafana Alert Webhook") + +TRIAGE_ENABLE_COMMANDS = os.environ.get("TRIAGE_ENABLE_COMMANDS", "0").lower() in {"1", "true", "yes", "on"} +TRIAGE_COMMAND_RUNNER = os.environ.get("TRIAGE_COMMAND_RUNNER", "ssh").lower() +TRIAGE_SSH_USER = os.environ.get("TRIAGE_SSH_USER", "root") +TRIAGE_SSH_OPTIONS = shlex.split( + os.environ.get("TRIAGE_SSH_OPTIONS", "-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5") +) +TRIAGE_COMMAND_TIMEOUT = int(os.environ.get("TRIAGE_COMMAND_TIMEOUT", "60")) +TRIAGE_DEFAULT_OS = os.environ.get("TRIAGE_DEFAULT_OS", "linux").lower() +TRIAGE_MAX_COMMANDS = int(os.environ.get("TRIAGE_MAX_COMMANDS", "3")) +TRIAGE_OUTPUT_LIMIT = int(os.environ.get("TRIAGE_OUTPUT_LIMIT", "1200")) +# LangChain-driven investigation loop +TRIAGE_MAX_ITERATIONS = int(os.environ.get("TRIAGE_MAX_ITERATIONS", "3")) +TRIAGE_FOLLOWUP_MAX_COMMANDS = int(os.environ.get("TRIAGE_FOLLOWUP_MAX_COMMANDS", "4")) +TRIAGE_SYSTEM_PROMPT = os.environ.get( + "TRIAGE_SYSTEM_PROMPT", + ( + "You are assisting with on-call investigations. Always reply with JSON containing:\n" + "analysis: your findings and next steps.\n" + "followup_commands: list of command specs (summary, command, optional runner/os) to gather more data.\n" + "complete: true when sufficient information is gathered.\n" + "Request commands only when more evidence is required." 
+ ), +) +TRIAGE_VERBOSE_LOGS = os.environ.get("TRIAGE_VERBOSE_LOGS", "0").lower() in {"1", "true", "yes", "on"} +TRIAGE_EMAIL_ENABLED = os.environ.get("TRIAGE_EMAIL_ENABLED", "0").lower() in {"1", "true", "yes", "on"} +TRIAGE_EMAIL_FROM = os.environ.get("TRIAGE_EMAIL_FROM") +TRIAGE_EMAIL_TO = [addr.strip() for addr in os.environ.get("TRIAGE_EMAIL_TO", "").split(",") if addr.strip()] +TRIAGE_SMTP_HOST = os.environ.get("TRIAGE_SMTP_HOST") +TRIAGE_SMTP_PORT = int(os.environ.get("TRIAGE_SMTP_PORT", "587")) +TRIAGE_SMTP_USER = os.environ.get("TRIAGE_SMTP_USER") +TRIAGE_SMTP_PASSWORD = os.environ.get("TRIAGE_SMTP_PASSWORD") +TRIAGE_SMTP_STARTTLS = os.environ.get("TRIAGE_SMTP_STARTTLS", "1").lower() in {"1", "true", "yes", "on"} +TRIAGE_SMTP_SSL = os.environ.get("TRIAGE_SMTP_SSL", "0").lower() in {"1", "true", "yes", "on"} +TRIAGE_SMTP_TIMEOUT = int(os.environ.get("TRIAGE_SMTP_TIMEOUT", "20")) + + +def log_verbose(title: str, content: Any) -> None: + """Emit structured verbose logs when TRIAGE_VERBOSE_LOGS is enabled.""" + if not TRIAGE_VERBOSE_LOGS: + return + if isinstance(content, (dict, list)): + text = json.dumps(content, indent=2, sort_keys=True) + else: + text = str(content) + LOGGER.info("%s:\n%s", title, text) + + +def email_notifications_configured() -> bool: + if not TRIAGE_EMAIL_ENABLED: + return False + if not (TRIAGE_SMTP_HOST and TRIAGE_EMAIL_FROM and TRIAGE_EMAIL_TO): + LOGGER.warning( + "Email notifications requested but TRIAGE_SMTP_HOST/TRIAGE_EMAIL_FROM/TRIAGE_EMAIL_TO are incomplete." + ) + return False + return True + + +def format_command_results_for_email(results: List[Dict[str, Any]]) -> str: + if not results: + return "No automation commands were executed." + lines: List[str] = [] + for result in results: + lines.append(f"- {result.get('summary')} [{result.get('status')}] {result.get('command')}") + stdout = result.get("stdout") + stderr = result.get("stderr") + error = result.get("error") + if stdout: + lines.append(indent(truncate_text(stdout, 800), " stdout: ")) + if stderr: + lines.append(indent(truncate_text(stderr, 800), " stderr: ")) + if error and result.get("status") != "ok": + lines.append(f" error: {error}") + return "\n".join(lines) + + +def build_email_body(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> str: + lines = [ + f"Alert: {result.get('alertname')} ({result.get('rule_uid')})", + f"Host: {result.get('host') or context.get('host')}", + f"Status: {alert.get('status')}", + f"Value: {alert.get('value') or alert.get('annotations', {}).get('value')}", + f"Grafana Rule: {context.get('rule_url')}", + "", + "LLM Summary:", + result.get("llm_summary") or "(no summary returned)", + "", + "Command Results:", + format_command_results_for_email(result.get("command_results") or []), + ] + return "\n".join(lines) + + +def send_summary_email(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> None: + if not email_notifications_configured(): + return + subject_host = result.get("host") or context.get("host") or "(unknown host)" + subject = f"[Grafana] {result.get('alertname')} - {subject_host}" + body = build_email_body(alert, result, context) + message = EmailMessage() + message["Subject"] = subject + message["From"] = TRIAGE_EMAIL_FROM + message["To"] = ", ".join(TRIAGE_EMAIL_TO) + message.set_content(body) + try: + smtp_class = smtplib.SMTP_SSL if TRIAGE_SMTP_SSL else smtplib.SMTP + with smtp_class(TRIAGE_SMTP_HOST, TRIAGE_SMTP_PORT, timeout=TRIAGE_SMTP_TIMEOUT) as client: + if TRIAGE_SMTP_STARTTLS and not 
TRIAGE_SMTP_SSL: + client.starttls() + if TRIAGE_SMTP_USER: + client.login(TRIAGE_SMTP_USER, TRIAGE_SMTP_PASSWORD or "") + client.send_message(message) + LOGGER.info("Sent summary email to %s for host %s", ", ".join(TRIAGE_EMAIL_TO), subject_host) + except Exception as exc: # pylint: disable=broad-except + LOGGER.exception("Failed to send summary email: %s", exc) + +app = FastAPI(title="Grafana Alert Webhook", version="1.0.0") + +_RUNBOOK_INDEX: Dict[str, Dict[str, Any]] = {} +_INVENTORY_INDEX: Dict[str, Dict[str, Any]] = {} +_INVENTORY_GROUP_VARS: Dict[str, Dict[str, str]] = {} +_TEMPLATE_PATTERN = re.compile(r"{{\s*([a-zA-Z0-9_]+)\s*}}") + + +DEFAULT_SYSTEM_PROMPT = TRIAGE_SYSTEM_PROMPT + + +class OpenRouterLLM(LLM): + """LangChain-compatible LLM that calls OpenRouter chat completions.""" + + api_key: str + model_name: str + + def __init__(self, api_key: str, model_name: str, **kwargs: Any) -> None: + super().__init__(api_key=api_key, model_name=model_name, **kwargs) + + @property + def _llm_type(self) -> str: + return "openrouter" + + def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str: + return self._call(prompt, stop=stop) + + def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: + payload = { + "model": self.model_name, + "messages": [ + {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ], + } + log_verbose("OpenRouter request payload", payload) + if stop: + payload["stop"] = stop + LOGGER.info("Posting to OpenRouter model=%s via LangChain", self.model_name) + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + if OPENROUTER_REFERER: + headers["HTTP-Referer"] = OPENROUTER_REFERER + if OPENROUTER_TITLE: + headers["X-Title"] = OPENROUTER_TITLE + response = requests.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers, timeout=90) + if response.status_code >= 400: + try: + detail = response.json() + except ValueError: + detail = response.text + raise RuntimeError(f"OpenRouter error {response.status_code}: {detail}") + data = response.json() + log_verbose("OpenRouter raw response", data) + choices = data.get("choices") + if not choices: + raise RuntimeError("OpenRouter returned no choices") + return choices[0]["message"]["content"].strip() + + +def load_runbook() -> Dict[str, Dict[str, Any]]: + """Load runbook YAML into a dict keyed by rule_uid.""" + if not RUNBOOK_PATH.exists(): + raise FileNotFoundError(f"Runbook file not found: {RUNBOOK_PATH}") + with RUNBOOK_PATH.open("r", encoding="utf-8") as handle: + data = yaml.safe_load(handle) or {} + alerts = data.get("alerts", []) + index: Dict[str, Dict[str, Any]] = {} + for entry in alerts: + uid = entry.get("rule_uid") + if uid: + index[str(uid)] = entry + LOGGER.info("Loaded %d runbook entries from %s", len(index), RUNBOOK_PATH) + return index + + +def _normalize_host_key(host: str) -> str: + return host.strip().lower() + + +def _parse_key_value_tokens(tokens: List[str]) -> Dict[str, str]: + data: Dict[str, str] = {} + for token in tokens: + if "=" not in token: + continue + key, value = token.split("=", 1) + data[key] = value + return data + + +def load_ansible_inventory() -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, str]]]: + """Parse a simple INI-style Ansible hosts file into host/group maps.""" + if not ANSIBLE_HOSTS_PATH.exists(): + LOGGER.warning("Ansible inventory not found at %s", ANSIBLE_HOSTS_PATH) + return {}, {} + hosts: Dict[str, Dict[str, Any]] = {} + 
group_vars: Dict[str, Dict[str, str]] = {} + current_group: Optional[str] = None + current_section: str = "hosts" + + with ANSIBLE_HOSTS_PATH.open("r", encoding="utf-8") as handle: + for raw_line in handle: + line = raw_line.strip() + if not line or line.startswith("#"): + continue + if line.startswith("[") and line.endswith("]"): + header = line[1:-1].strip() + if ":" in header: + group_name, suffix = header.split(":", 1) + current_group = group_name + current_section = suffix + else: + current_group = header + current_section = "hosts" + group_vars.setdefault(current_group, {}) + continue + cleaned = line.split("#", 1)[0].strip() + if not cleaned: + continue + tokens = shlex.split(cleaned) + if not tokens: + continue + if current_section == "vars": + vars_dict = _parse_key_value_tokens(tokens) + group_vars.setdefault(current_group or "all", {}).update(vars_dict) + continue + host_token = tokens[0] + host_key = _normalize_host_key(host_token) + entry = hosts.setdefault(host_key, {"name": host_token, "definitions": [], "groups": set()}) + vars_dict = _parse_key_value_tokens(tokens[1:]) + entry["definitions"].append({"group": current_group, "vars": vars_dict}) + if current_group: + entry["groups"].add(current_group) + + LOGGER.info("Loaded %d Ansible inventory hosts from %s", len(hosts), ANSIBLE_HOSTS_PATH) + return hosts, group_vars + + +def _lookup_inventory(host: Optional[str]) -> Optional[Dict[str, Any]]: + if not host: + return None + key = _normalize_host_key(host) + entry = _INVENTORY_INDEX.get(key) + if entry: + return entry + # try stripping domain suffix + short = key.split(".", 1)[0] + if short != key: + return _INVENTORY_INDEX.get(short) + return None + + +def _merge_group_vars(groups: List[str], host_os: Optional[str]) -> Dict[str, str]: + merged: Dict[str, str] = {} + global_vars = _INVENTORY_GROUP_VARS.get("all") + if global_vars: + merged.update(global_vars) + normalized_os = (host_os or "").lower() + for group in groups: + vars_dict = _INVENTORY_GROUP_VARS.get(group) + if not vars_dict: + continue + connection = (vars_dict.get("ansible_connection") or "").lower() + if connection == "winrm" and normalized_os == "linux": + continue + merged.update(vars_dict) + return merged + + +def _should_include_definition(group: Optional[str], vars_dict: Dict[str, str], host_os: Optional[str]) -> bool: + if not vars_dict: + return False + normalized_os = (host_os or "").lower() + connection = (vars_dict.get("ansible_connection") or "").lower() + if connection == "winrm" and normalized_os != "windows": + return False + if connection == "local": + return True + if group and "windows" in group.lower() and normalized_os == "linux" and not connection: + return False + return True + + +def apply_inventory_context(context: Dict[str, Any]) -> None: + """Augment the alert context with SSH metadata from the Ansible inventory.""" + host = context.get("host") + entry = _lookup_inventory(host) + if not entry: + return + merged_vars = _merge_group_vars(list(entry.get("groups", [])), context.get("host_os")) + for definition in entry.get("definitions", []): + group_name = definition.get("group") + vars_dict = definition.get("vars", {}) + if _should_include_definition(group_name, vars_dict, context.get("host_os")): + merged_vars.update(vars_dict) + ansible_host = merged_vars.get("ansible_host") or entry.get("name") + ansible_user = merged_vars.get("ansible_user") + ansible_port = merged_vars.get("ansible_port") + ssh_common_args = merged_vars.get("ansible_ssh_common_args") + ssh_key = 
merged_vars.get("ansible_ssh_private_key_file") + connection = (merged_vars.get("ansible_connection") or "").lower() + host_os = (context.get("host_os") or "").lower() + if connection == "winrm" and host_os != "windows": + for key in ( + "ansible_connection", + "ansible_port", + "ansible_password", + "ansible_winrm_server_cert_validation", + "ansible_winrm_scheme", + ): + merged_vars.pop(key, None) + connection = "" + + context.setdefault("ssh_host", ansible_host or host) + if ansible_user: + context["ssh_user"] = ansible_user + if ansible_port: + context["ssh_port"] = ansible_port + if ssh_common_args: + context["ssh_common_args"] = ssh_common_args + if ssh_key: + context["ssh_identity_file"] = ssh_key + context.setdefault("inventory_groups", list(entry.get("groups", []))) + if connection == "local": + context.setdefault("preferred_runner", "local") + elif connection in {"", "ssh", "smart"}: + context.setdefault("preferred_runner", "ssh") + context.setdefault("inventory_groups", list(entry.get("groups", []))) + + +def render_template(template: str, context: Dict[str, Any]) -> str: + """Very small mustache-style renderer for {{ var }} placeholders.""" + def replace(match: re.Match[str]) -> str: + key = match.group(1) + return str(context.get(key, match.group(0))) + + return _TEMPLATE_PATTERN.sub(replace, template) + + +def extract_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Optional[str]: + """Grafana webhooks may include rule UID in different fields.""" + candidates: List[Any] = [ + alert.get("ruleUid"), + alert.get("rule_uid"), + alert.get("ruleId"), + alert.get("uid"), + alert.get("labels", {}).get("rule_uid"), + alert.get("labels", {}).get("ruleUid"), + parent_payload.get("ruleUid"), + parent_payload.get("rule_uid"), + parent_payload.get("ruleId"), + ] + for candidate in candidates: + if candidate: + return str(candidate) + # Fall back to Grafana URL parsing if present + url = ( + alert.get("ruleUrl") + or parent_payload.get("ruleUrl") + or alert.get("generatorURL") + or parent_payload.get("generatorURL") + ) + if url and "/alerting/" in url: + return url.rstrip("/").split("/")[-2] + return None + + +def derive_fallback_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> str: + """Construct a deterministic identifier when Grafana omits rule UIDs.""" + labels = alert.get("labels", {}) + candidates = [ + alert.get("fingerprint"), + labels.get("alertname"), + labels.get("host"), + labels.get("instance"), + parent_payload.get("groupKey"), + parent_payload.get("title"), + ] + for candidate in candidates: + if candidate: + return str(candidate) + return "unknown-alert" + + +def build_fallback_runbook_entry(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]: + """Return a generic runbook entry so every alert can be processed.""" + labels = alert.get("labels", {}) + alertname = labels.get("alertname") or parent_payload.get("title") or "Grafana Alert" + host = labels.get("host") or labels.get("instance") or "(unknown host)" + return { + "name": f"{alertname} (auto)", + "llm_prompt": ( + "Grafana alert {{ alertname }} fired for {{ host }}.\n" + "No dedicated runbook entry exists. Use the payload details, command outputs, " + "and your own reasoning to propose likely causes, evidence to gather, and remediation steps." 
+ ), + "triage": [], + "evidence_to_collect": [], + "remediation": [], + "metadata": {"host": host}, + } + + +def summarize_dict(prefix: str, data: Optional[Dict[str, Any]]) -> str: + if not data: + return f"{prefix}: (none)" + parts = ", ".join(f"{key}={value}" for key, value in sorted(data.items())) + return f"{prefix}: {parts}" + + +def determine_host_os(alert: Dict[str, Any]) -> str: + """Infer host operating system from labels or defaults.""" + labels = alert.get("labels", {}) + candidates = [ + labels.get("os"), + labels.get("platform"), + labels.get("system"), + alert.get("os"), + ] + for candidate in candidates: + if candidate: + value = str(candidate).lower() + if "win" in value: + return "windows" + if any(token in value for token in ("linux", "unix", "darwin")): + return "linux" + host = (labels.get("host") or labels.get("instance") or "").lower() + if host.startswith("win") or host.endswith(".localdomain") and "win" in host: + return "windows" + inventory_os = infer_os_from_inventory(labels.get("host") or labels.get("instance")) + if inventory_os: + return inventory_os + return TRIAGE_DEFAULT_OS + + +def infer_os_from_inventory(host: Optional[str]) -> Optional[str]: + if not host: + return None + entry = _lookup_inventory(host) + if not entry: + return None + for definition in entry.get("definitions", []): + vars_dict = definition.get("vars", {}) or {} + connection = (vars_dict.get("ansible_connection") or "").lower() + if connection == "winrm": + return "windows" + for group in entry.get("groups", []): + if "windows" in (group or "").lower(): + return "windows" + return None + + +def truncate_text(text: str, limit: int = TRIAGE_OUTPUT_LIMIT) -> str: + """Trim long outputs to keep prompts manageable.""" + if not text: + return "" + cleaned = text.strip() + if len(cleaned) <= limit: + return cleaned + return cleaned[:limit] + "... 
[truncated]" + + +def gather_command_specs(entry: Dict[str, Any], host_os: str) -> List[Dict[str, Any]]: + """Collect command specs from triage steps and optional automation sections.""" + specs: List[Dict[str, Any]] = [] + for step in entry.get("triage", []): + cmd = step.get(host_os) + if not cmd: + continue + specs.append( + { + "summary": step.get("summary") or entry.get("name") or "triage", + "shell": cmd, + "runner": step.get("runner"), + "os": host_os, + } + ) + for item in entry.get("automation_commands", []): + target_os = item.get("os", host_os) + if target_os and target_os.lower() != host_os: + continue + specs.append(item) + if TRIAGE_MAX_COMMANDS > 0: + return specs[:TRIAGE_MAX_COMMANDS] + return specs + + +def build_runner_command( + rendered_command: str, + runner: str, + context: Dict[str, Any], + spec: Dict[str, Any], +) -> Tuple[Any, str, bool, str]: + """Return the subprocess args, display string, shell flag, and runner label.""" + runner = runner or TRIAGE_COMMAND_RUNNER + runner = runner.lower() + if runner == "ssh": + host = spec.get("host") or context.get("ssh_host") or context.get("host") + if not host: + raise RuntimeError("Host not provided for ssh runner.") + ssh_user = spec.get("ssh_user") or context.get("ssh_user") or TRIAGE_SSH_USER + ssh_target = spec.get("ssh_target") or f"{ssh_user}@{host}" + ssh_options = list(TRIAGE_SSH_OPTIONS) + common_args = spec.get("ssh_common_args") or context.get("ssh_common_args") + if common_args: + ssh_options.extend(shlex.split(common_args)) + ssh_port = spec.get("ssh_port") or context.get("ssh_port") + if ssh_port: + ssh_options.extend(["-p", str(ssh_port)]) + identity_file = spec.get("ssh_identity_file") or context.get("ssh_identity_file") + if identity_file: + ssh_options.extend(["-i", identity_file]) + command_list = ["ssh", *ssh_options, ssh_target, rendered_command] + display = " ".join(shlex.quote(part) for part in command_list) + return command_list, display, False, "ssh" + # default to local shell execution + display = rendered_command + return rendered_command, display, True, "local" + + +def run_subprocess_command( + command: Any, + display: str, + summary: str, + use_shell: bool, + runner_label: str, +) -> Dict[str, Any]: + """Execute subprocess command and capture results.""" + LOGGER.info("Executing command (%s) via %s: %s", summary, runner_label, display) + try: + completed = subprocess.run( + command, + capture_output=True, + text=True, + timeout=TRIAGE_COMMAND_TIMEOUT, + shell=use_shell, + check=False, + ) + result = { + "summary": summary, + "command": display, + "runner": runner_label, + "exit_code": completed.returncode, + "stdout": (completed.stdout or "").strip(), + "stderr": (completed.stderr or "").strip(), + "status": "ok" if completed.returncode == 0 else "failed", + } + log_verbose(f"Command result ({summary})", result) + return result + except subprocess.TimeoutExpired as exc: + result = { + "summary": summary, + "command": display, + "runner": runner_label, + "exit_code": None, + "stdout": truncate_text((exc.stdout or "").strip()), + "stderr": truncate_text((exc.stderr or "").strip()), + "status": "timeout", + "error": f"Command timed out after {TRIAGE_COMMAND_TIMEOUT}s", + } + log_verbose(f"Command timeout ({summary})", result) + return result + except Exception as exc: # pylint: disable=broad-except + LOGGER.exception("Command execution failed (%s): %s", summary, exc) + result = { + "summary": summary, + "command": display, + "runner": runner_label, + "exit_code": None, + "stdout": "", + "stderr": 
"", + "status": "error", + "error": str(exc), + } + log_verbose(f"Command error ({summary})", result) + return result + + +def run_command_spec(spec: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: + summary = spec.get("summary") or spec.get("name") or "command" + shell_cmd = spec.get("shell") + if not shell_cmd: + return {"summary": summary, "status": "skipped", "error": "No shell command provided."} + rendered = render_template(shell_cmd, context) + preferred_runner = context.get("preferred_runner") + runner_choice = (spec.get("runner") or preferred_runner or TRIAGE_COMMAND_RUNNER).lower() + try: + command, display, use_shell, runner_label = build_runner_command(rendered, runner_choice, context, spec) + except RuntimeError as exc: + LOGGER.warning("Skipping command '%s': %s", summary, exc) + return {"summary": summary, "status": "skipped", "error": str(exc), "command": rendered} + return run_subprocess_command(command, display, summary, use_shell, runner_label) + + +def execute_triage_commands(entry: Dict[str, Any], alert: Dict[str, Any], context: Dict[str, Any]) -> List[Dict[str, Any]]: + host_os = context.get("host_os") or determine_host_os(alert) + context["host_os"] = host_os + specs = gather_command_specs(entry, host_os) + if not specs: + LOGGER.info("No triage commands defined for host_os=%s", host_os) + return [] + if not TRIAGE_ENABLE_COMMANDS: + LOGGER.info("Command execution disabled; %d commands queued but skipped.", len(specs)) + return [] + LOGGER.info("Executing up to %d triage commands for host_os=%s", len(specs), host_os) + results = [] + for spec in specs: + results.append(run_command_spec(spec, context)) + return results + + +def format_command_results_for_llm(results: List[Dict[str, Any]]) -> str: + lines: List[str] = [] + for idx, result in enumerate(results, start=1): + lines.append(f"{idx}. {result.get('summary')} [{result.get('status')}] {result.get('command')}") + stdout = result.get("stdout") + stderr = result.get("stderr") + error = result.get("error") + if stdout: + lines.append(" stdout:") + lines.append(indent(truncate_text(stdout), " ")) + if stderr: + lines.append(" stderr:") + lines.append(indent(truncate_text(stderr), " ")) + if error and result.get("status") != "ok": + lines.append(f" error: {error}") + if not lines: + return "No command results were available." 
+ return "\n".join(lines) + + +def parse_structured_response(text: str) -> Optional[Dict[str, Any]]: + cleaned = text.strip() + try: + return json.loads(cleaned) + except json.JSONDecodeError: + start = cleaned.find("{") + end = cleaned.rfind("}") + if start != -1 and end != -1 and end > start: + snippet = cleaned[start : end + 1] + try: + return json.loads(snippet) + except json.JSONDecodeError: + return None + return None + + +def normalize_followup_command(item: Dict[str, Any]) -> Dict[str, Any]: + return { + "summary": item.get("summary") or item.get("name") or "Follow-up command", + "shell": item.get("command") or item.get("shell"), + "runner": item.get("runner"), + "host": item.get("host") or item.get("target"), + "ssh_user": item.get("ssh_user"), + "os": (item.get("os") or item.get("platform") or "").lower() or None, + } + + +def investigate_with_langchain( + entry: Dict[str, Any], + alert: Dict[str, Any], + parent_payload: Dict[str, Any], + context: Dict[str, Any], + initial_outputs: List[Dict[str, Any]], +) -> Tuple[str, List[Dict[str, Any]]]: + command_outputs = list(initial_outputs) + prompt = build_prompt(entry, alert, parent_payload, context, command_outputs) + log_verbose("Initial investigation prompt", prompt) + if not OPENROUTER_API_KEY: + return "OPENROUTER_API_KEY is not configured; unable to analyze alert.", command_outputs + + llm = OpenRouterLLM(api_key=OPENROUTER_API_KEY, model_name=OPENROUTER_MODEL) + dialogue = ( + prompt + + "\n\nRespond with JSON containing fields analysis, followup_commands, and complete. " + "Request commands only when more evidence is required." + ) + total_followup = 0 + final_summary = "" + for iteration in range(TRIAGE_MAX_ITERATIONS): + log_verbose(f"LLM dialogue iteration {iteration + 1}", dialogue) + llm_text = llm(dialogue) + log_verbose(f"LLM iteration {iteration + 1} output", llm_text) + dialogue += f"\nAssistant:\n{llm_text}\n" + parsed = parse_structured_response(llm_text) + if parsed: + log_verbose(f"LLM iteration {iteration + 1} parsed response", parsed) + if not parsed: + final_summary = llm_text + break + + analysis = parsed.get("analysis") or "" + followups = parsed.get("followup_commands") or parsed.get("commands") or [] + final_summary = analysis + complete_flag = bool(parsed.get("complete")) + + if complete_flag or not followups: + break + + log_verbose(f"LLM iteration {iteration + 1} requested follow-ups", followups) + allowed = max(0, TRIAGE_FOLLOWUP_MAX_COMMANDS - total_followup) + if not TRIAGE_ENABLE_COMMANDS or allowed <= 0: + dialogue += ( + "\nUser:\nCommand execution is disabled or budget exhausted. Provide final analysis with JSON format.\n" + ) + continue + + normalized_cmds: List[Dict[str, Any]] = [] + for raw in followups: + if not isinstance(raw, dict): + continue + normalized = normalize_followup_command(raw) + if not normalized.get("shell"): + continue + cmd_os = normalized.get("os") + if cmd_os and cmd_os != context.get("host_os"): + continue + normalized_cmds.append(normalized) + + log_verbose(f"Normalized follow-up commands (iteration {iteration + 1})", normalized_cmds) + if not normalized_cmds: + dialogue += "\nUser:\nNo valid commands to run. 
Finalize analysis in JSON format.\n" + continue + + normalized_cmds = normalized_cmds[:allowed] + executed_batch: List[Dict[str, Any]] = [] + for spec in normalized_cmds: + executed = run_command_spec(spec, context) + command_outputs.append(executed) + executed_batch.append(executed) + total_followup += 1 + + result_text = "Follow-up command results:\n" + format_command_results_for_llm(executed_batch) + dialogue += ( + "\nUser:\n" + + result_text + + "\nUpdate your analysis and respond with JSON (analysis, followup_commands, complete).\n" + ) + log_verbose("Executed follow-up commands", result_text) + else: + final_summary = final_summary or "Reached maximum iterations without a conclusive response." + + if not final_summary: + final_summary = "LLM did not return a valid analysis." + + log_verbose("Final LLM summary", final_summary) + return final_summary, command_outputs + + +def build_context(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]: + labels = alert.get("labels", {}) + annotations = alert.get("annotations", {}) + context = { + "alertname": labels.get("alertname") or alert.get("title") or parent_payload.get("title") or parent_payload.get("ruleName"), + "host": labels.get("host") or labels.get("instance"), + "iface": labels.get("interface"), + "device": labels.get("device"), + "vmid": labels.get("vmid"), + "status": alert.get("status") or parent_payload.get("status"), + "value": alert.get("value") or annotations.get("value"), + "rule_url": alert.get("ruleUrl") or parent_payload.get("ruleUrl"), + } + context.setdefault("ssh_user", TRIAGE_SSH_USER) + return context + + +def build_prompt( + entry: Dict[str, Any], + alert: Dict[str, Any], + parent_payload: Dict[str, Any], + context: Dict[str, Any], + command_outputs: Optional[List[Dict[str, Any]]] = None, +) -> str: + template = entry.get("llm_prompt", "Alert {{ alertname }} fired for {{ host }}.") + rendered_template = render_template(template, {k: v or "" for k, v in context.items()}) + + evidence = entry.get("evidence_to_collect", []) + triage_steps = entry.get("triage", []) + remediation = entry.get("remediation", []) + + lines = [ + rendered_template.strip(), + "", + "Alert payload summary:", + f"- Status: {context.get('status') or alert.get('status')}", + f"- Host: {context.get('host')}", + f"- Value: {context.get('value')}", + f"- StartsAt: {alert.get('startsAt')}", + f"- EndsAt: {alert.get('endsAt')}", + f"- RuleURL: {context.get('rule_url')}", + f"- Host OS (inferred): {context.get('host_os')}", + "- Note: All timestamps are UTC/RFC3339 as provided by Grafana.", + summarize_dict("- Labels", alert.get("labels")), + summarize_dict("- Annotations", alert.get("annotations")), + ] + + if evidence: + lines.append("") + lines.append("Evidence to gather (for automation reference):") + for item in evidence: + lines.append(f"- {item}") + + if triage_steps: + lines.append("") + lines.append("Suggested manual checks:") + for step in triage_steps: + summary = step.get("summary") + linux = step.get("linux") + windows = step.get("windows") + lines.append(f"- {summary}") + if linux: + lines.append(f" Linux: {linux}") + if windows: + lines.append(f" Windows: {windows}") + + if remediation: + lines.append("") + lines.append("Remediation ideas:") + for item in remediation: + lines.append(f"- {item}") + + if command_outputs: + lines.append("") + lines.append("Command execution results:") + for result in command_outputs: + status = result.get("status", "unknown") + cmd_display = result.get("command", "") + lines.append(f"- 
{result.get('summary')} [{status}] {cmd_display}") + stdout = result.get("stdout") + stderr = result.get("stderr") + error = result.get("error") + if stdout: + lines.append(" stdout:") + lines.append(indent(truncate_text(stdout), " ")) + if stderr: + lines.append(" stderr:") + lines.append(indent(truncate_text(stderr), " ")) + if error and status != "ok": + lines.append(f" error: {error}") + + return "\n".join(lines).strip() + + +def get_alerts(payload: Dict[str, Any]) -> List[Dict[str, Any]]: + alerts = payload.get("alerts") + if isinstance(alerts, list) and alerts: + return alerts + return [payload] + + +@app.on_event("startup") +def startup_event() -> None: + global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS + _RUNBOOK_INDEX = load_runbook() + _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory() + LOGGER.info( + "Alert webhook server ready with %d runbook entries and %d inventory hosts.", + len(_RUNBOOK_INDEX), + len(_INVENTORY_INDEX), + ) + + +@app.post("/alerts") +async def handle_alert(request: Request) -> Dict[str, Any]: + payload = await request.json() + LOGGER.info("Received Grafana payload: %s", json.dumps(payload, indent=2, sort_keys=True)) + results = [] + unmatched = [] + for alert in get_alerts(payload): + LOGGER.info("Processing alert: %s", json.dumps(alert, indent=2, sort_keys=True)) + unmatched_reason: Optional[str] = None + alert_status = str(alert.get("status") or payload.get("status") or "").lower() + if alert_status and alert_status != "firing": + details = {"reason": "non_firing_status", "status": alert_status, "alert": alert} + unmatched.append(details) + LOGGER.info("Skipping alert with status=%s (only 'firing' alerts are processed).", alert_status) + continue + rule_uid = extract_rule_uid(alert, payload) + if not rule_uid: + unmatched_reason = "missing_rule_uid" + derived_uid = derive_fallback_rule_uid(alert, payload) + details = {"reason": unmatched_reason, "derived_rule_uid": derived_uid, "alert": alert} + unmatched.append(details) + LOGGER.warning("Alert missing rule UID, using fallback identifier %s", derived_uid) + rule_uid = derived_uid + entry = _RUNBOOK_INDEX.get(rule_uid) + runbook_matched = entry is not None + if not entry: + unmatched_reason = unmatched_reason or "no_runbook_entry" + details = {"reason": unmatched_reason, "rule_uid": rule_uid, "alert": alert} + unmatched.append(details) + LOGGER.warning("No runbook entry for rule_uid=%s, using generic fallback.", rule_uid) + entry = build_fallback_runbook_entry(alert, payload) + context = build_context(alert, payload) + context["host_os"] = determine_host_os(alert) + context["rule_uid"] = rule_uid + apply_inventory_context(context) + initial_outputs = execute_triage_commands(entry, alert, context) + try: + llm_text, command_outputs = investigate_with_langchain(entry, alert, payload, context, initial_outputs) + except Exception as exc: # pylint: disable=broad-except + LOGGER.exception("Investigation failed for rule_uid=%s: %s", rule_uid, exc) + raise HTTPException(status_code=502, detail=f"LLM investigation error: {exc}") from exc + result = { + "rule_uid": rule_uid, + "alertname": entry.get("name"), + "host": alert.get("labels", {}).get("host"), + "llm_summary": llm_text, + "command_results": command_outputs, + "runbook_matched": runbook_matched, + } + if not runbook_matched and unmatched_reason: + result["fallback_reason"] = unmatched_reason + results.append(result) + send_summary_email(alert, result, context) + return {"processed": len(results), "results": results, 
"unmatched": unmatched} + + +@app.post("/reload-runbook") +def reload_runbook() -> Dict[str, Any]: + global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS + _RUNBOOK_INDEX = load_runbook() + _INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory() + return {"entries": len(_RUNBOOK_INDEX), "inventory_hosts": len(_INVENTORY_INDEX)} diff --git a/stacks/mllogwatcher/scripts/log_monitor.py b/stacks/mllogwatcher/scripts/log_monitor.py new file mode 100755 index 0000000..8512e04 --- /dev/null +++ b/stacks/mllogwatcher/scripts/log_monitor.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Log anomaly checker that queries Elasticsearch and asks an OpenRouter-hosted LLM +for a quick triage summary. Intended to be run on a schedule (cron/systemd). + +Required environment variables: + ELASTIC_HOST e.g. https://casper.localdomain:9200 + ELASTIC_API_KEY Base64 ApiKey used for Elasticsearch requests + OPENROUTER_API_KEY Token for https://openrouter.ai/ + +Optional environment variables: + OPENROUTER_MODEL Model identifier (default: openai/gpt-4o-mini) + OPENROUTER_REFERER Passed through as HTTP-Referer header + OPENROUTER_TITLE Passed through as X-Title header +""" + +from __future__ import annotations + +import argparse +import datetime as dt +import os +import sys +from typing import Any, Iterable + +import requests + + +def utc_iso(ts: dt.datetime) -> str: + """Return an ISO8601 string with Z suffix.""" + return ts.replace(microsecond=0).isoformat() + "Z" + + +def query_elasticsearch( + host: str, + api_key: str, + index_pattern: str, + minutes: int, + size: int, + verify: bool, +) -> list[dict[str, Any]]: + """Fetch recent logs from Elasticsearch.""" + end = dt.datetime.utcnow() + start = end - dt.timedelta(minutes=minutes) + url = f"{host.rstrip('/')}/{index_pattern}/_search" + payload = { + "size": size, + "sort": [{"@timestamp": {"order": "desc"}}], + "query": { + "range": { + "@timestamp": { + "gte": utc_iso(start), + "lte": utc_iso(end), + } + } + }, + "_source": ["@timestamp", "message", "host.name", "container.image.name", "log.level"], + } + headers = { + "Authorization": f"ApiKey {api_key}", + "Content-Type": "application/json", + } + response = requests.post(url, json=payload, headers=headers, timeout=30, verify=verify) + response.raise_for_status() + hits = response.json().get("hits", {}).get("hits", []) + return hits + + +def build_prompt(logs: Iterable[dict[str, Any]], limit_messages: int) -> str: + """Create the prompt that will be sent to the LLM.""" + selected = [] + for idx, hit in enumerate(logs): + if idx >= limit_messages: + break + source = hit.get("_source", {}) + message = source.get("message") or source.get("event", {}).get("original") or "" + timestamp = source.get("@timestamp", "unknown time") + host = source.get("host", {}).get("name") or source.get("host", {}).get("hostname") or "unknown-host" + container = source.get("container", {}).get("image", {}).get("name") or "" + level = source.get("log", {}).get("level") or source.get("log.level") or "" + selected.append( + f"[{timestamp}] host={host} level={level} container={container}\n{message}".strip() + ) + + if not selected: + return "No logs were returned from Elasticsearch in the requested window." + + prompt = ( + "You are assisting with HomeLab observability. Review the following log entries collected from " + "Elasticsearch and highlight any notable anomalies, errors, or emerging issues. " + "Explain the impact and suggest next steps when applicable. " + "Use concise bullet points. 
Logs:\n\n" + + "\n\n".join(selected) + ) + return prompt + + +def call_openrouter(prompt: str, model: str, api_key: str, referer: str | None, title: str | None) -> str: + """Send prompt to OpenRouter and return the model response text.""" + url = "https://openrouter.ai/api/v1/chat/completions" + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + if referer: + headers["HTTP-Referer"] = referer + if title: + headers["X-Title"] = title + + body = { + "model": model, + "messages": [ + {"role": "system", "content": "You are a senior SRE helping analyze log anomalies."}, + {"role": "user", "content": prompt}, + ], + } + + response = requests.post(url, json=body, headers=headers, timeout=60) + response.raise_for_status() + data = response.json() + choices = data.get("choices", []) + if not choices: + raise RuntimeError("OpenRouter response did not include choices") + return choices[0]["message"]["content"] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Query Elasticsearch and summarize logs with OpenRouter.") + parser.add_argument("--host", default=os.environ.get("ELASTIC_HOST"), help="Elasticsearch host URL") + parser.add_argument("--api-key", default=os.environ.get("ELASTIC_API_KEY"), help="Elasticsearch ApiKey") + parser.add_argument("--index", default="log*", help="Index pattern (default: log*)") + parser.add_argument("--minutes", type=int, default=60, help="Lookback window in minutes (default: 60)") + parser.add_argument("--size", type=int, default=200, help="Max number of logs to fetch (default: 200)") + parser.add_argument("--message-limit", type=int, default=50, help="Max log lines sent to LLM (default: 50)") + parser.add_argument("--openrouter-model", default=os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini")) + parser.add_argument("--insecure", action="store_true", help="Disable TLS verification for Elasticsearch") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if not args.host or not args.api_key: + print("ELASTIC_HOST and ELASTIC_API_KEY must be provided via environment or CLI", file=sys.stderr) + return 1 + + logs = query_elasticsearch( + host=args.host, + api_key=args.api_key, + index_pattern=args.index, + minutes=args.minutes, + size=args.size, + verify=not args.insecure, + ) + + prompt = build_prompt(logs, limit_messages=args.message_limit) + if not prompt.strip() or prompt.startswith("No logs"): + print(prompt) + return 0 + + openrouter_key = os.environ.get("OPENROUTER_API_KEY") + if not openrouter_key: + print("OPENROUTER_API_KEY is required to summarize logs", file=sys.stderr) + return 1 + + referer = os.environ.get("OPENROUTER_REFERER") + title = os.environ.get("OPENROUTER_TITLE", "Elastic Log Monitor") + response_text = call_openrouter( + prompt=prompt, + model=args.openrouter_model, + api_key=openrouter_key, + referer=referer, + title=title, + ) + print(response_text.strip()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/stacks/mllogwatcher/testing.py b/stacks/mllogwatcher/testing.py new file mode 100755 index 0000000..5efd879 --- /dev/null +++ b/stacks/mllogwatcher/testing.py @@ -0,0 +1,17 @@ +# pip install -qU langchain "langchain[anthropic]" +from langchain.agents import create_agent + +def get_weather(city: str) -> str: + """Get weather for a given city.""" + return f"It's always sunny in {city}!" 
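+
+# Build a minimal tool-calling agent around the get_weather tool defined above;
+# running it needs Anthropic credentials (typically ANTHROPIC_API_KEY) in the environment.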
+ +agent = create_agent( + model="claude-sonnet-4-5-20250929", + tools=[get_weather], + system_prompt="You are a helpful assistant", +) + +# Run the agent +agent.invoke( + {"messages": [{"role": "user", "content": "what is the weather in sf"}]} +) diff --git a/stacks/mllogwatcher/worklog-2025-12-29.txt b/stacks/mllogwatcher/worklog-2025-12-29.txt new file mode 100644 index 0000000..7e761f0 --- /dev/null +++ b/stacks/mllogwatcher/worklog-2025-12-29.txt @@ -0,0 +1,10 @@ +# Worklog – 2025-12-29 + +1. Added containerization assets for grafana_alert_webhook: + - `Dockerfile`, `.dockerignore`, `docker-compose.yml`, `.env.example`, and consolidated `requirements.txt`. + - Compose mounts the runbook, `/etc/ansible/hosts`, and `.ssh` so SSH automation works inside the container. + - README now documents the compose workflow. +2. Copied knight’s SSH key to `.ssh/webhook_id_rsa` and updated `jet-alone` inventory entry with `ansible_user` + `ansible_ssh_private_key_file` so remote commands can run non-interactively. +3. Updated `OpenRouterLLM` to satisfy Pydantic’s field validation inside the container. +4. Brought the webhook up under Docker Compose, tested alerts end-to-end, and reverted `OPENROUTER_MODEL` to the valid `openai/gpt-5.1-codex-max`. +5. Created `/var/core/ansible/ops_baseline.yml` to install sysstat/iotop/smartmontools/hdparm and enforce synchronized Bash history (`/etc/profile.d/99-bash-history.sh`). Ran the playbook against the primary LAN hosts; noted remediation items for the few that failed (outdated mirrors, pending grub configuration, missing sudo password). diff --git a/stacks/network-mcp/.env.example b/stacks/network-mcp/.env.example new file mode 100644 index 0000000..e66ebc9 --- /dev/null +++ b/stacks/network-mcp/.env.example @@ -0,0 +1,26 @@ +ES_URL=http://elasticsearch:9200 +# Elasticsearch API Key authentication (preferred over user/pass) +ES_API_ID= +ES_API_KEY= + +# Or, Elasticsearch Basic authentication (if no API key) +# ES_USER=elastic +# ES_PASS=changeme +ES_VERIFY_SSL=false + +OPNSENSE_URL=https://192.168.1.1 +OPNSENSE_API_KEY=your_key +OPNSENSE_API_SECRET=your_secret + +COLLECTOR_INTERVAL_SECONDS=60 + +NMAP_INTERVAL_SECONDS=300 +NMAP_PORT_RANGE=1-1024 +NMAP_BATCH_SIZE=10 +NMAP_DISCOVERY_ENABLED=true +NMAP_DISCOVERY_INTERVAL_SECONDS=3600 +NMAP_DISCOVERY_VLANS= +NMAP_DISCOVERY_EXTRA_ARGS="-sn -n" +NMAP_QUICK_BATCH_SIZE=30 +NMAP_QUICK_EXTRA_ARGS="-sS --top-ports 100 -T4 --open -Pn" +NMAP_FULL_INTERVAL_SECONDS=86400 diff --git a/stacks/network-mcp/.env.template b/stacks/network-mcp/.env.template new file mode 100644 index 0000000..e66ebc9 --- /dev/null +++ b/stacks/network-mcp/.env.template @@ -0,0 +1,26 @@ +ES_URL=http://elasticsearch:9200 +# Elasticsearch API Key authentication (preferred over user/pass) +ES_API_ID= +ES_API_KEY= + +# Or, Elasticsearch Basic authentication (if no API key) +# ES_USER=elastic +# ES_PASS=changeme +ES_VERIFY_SSL=false + +OPNSENSE_URL=https://192.168.1.1 +OPNSENSE_API_KEY=your_key +OPNSENSE_API_SECRET=your_secret + +COLLECTOR_INTERVAL_SECONDS=60 + +NMAP_INTERVAL_SECONDS=300 +NMAP_PORT_RANGE=1-1024 +NMAP_BATCH_SIZE=10 +NMAP_DISCOVERY_ENABLED=true +NMAP_DISCOVERY_INTERVAL_SECONDS=3600 +NMAP_DISCOVERY_VLANS= +NMAP_DISCOVERY_EXTRA_ARGS="-sn -n" +NMAP_QUICK_BATCH_SIZE=30 +NMAP_QUICK_EXTRA_ARGS="-sS --top-ports 100 -T4 --open -Pn" +NMAP_FULL_INTERVAL_SECONDS=86400 diff --git a/stacks/network-mcp/.gitignore b/stacks/network-mcp/.gitignore new file mode 100644 index 0000000..5786558 --- /dev/null +++ b/stacks/network-mcp/.gitignore @@ -0,0 +1,11 
@@
+.env
+.venv/
+__pycache__/
+*.pyc
+.DS_Store
+
+# Local/infra
+node_modules/
+
+# Logs
+*.log
diff --git a/stacks/network-mcp/PROJECT_SUMMARY.md b/stacks/network-mcp/PROJECT_SUMMARY.md
new file mode 100644
index 0000000..3543e85
--- /dev/null
+++ b/stacks/network-mcp/PROJECT_SUMMARY.md
@@ -0,0 +1,76 @@
+# Network MCP - Project Summary
+
+## Overview
+This project is a long-running Network MCP service that merges OPNsense discovery data, Nmap scans, and static inventory into Elasticsearch, then exposes both a minimal web UI and a full MCP JSON-RPC interface for LLM agents. It runs via Docker Compose and is now located at `/var/core/network-mcp`.
+
+## What We Built
+- **Collectors**
+  - OPNsense collector ingests DHCP/ARP/DNS and overlays inventory targets.
+  - Nmap collector performs discovery and port scans.
+  - Data lands in Elasticsearch: `network-hosts` (current state) and `network-events-*` (historical events).
+- **Inventory merge**
+  - Inventory data from `inventory_targets.yml` is merged onto live hosts by IP when a MAC is known (so live MAC-based records carry inventory notes/expected ports).
+- **Frontend**
+  - Flask UI + JSON API, containerized with Gunicorn and exposed on port `5001` for LAN access.
+- **MCP server**
+  - JSON-RPC endpoint at `/.well-known/mcp.json` (and `/api/mcp`) supports:
+    - `initialize`, `ping`, `tools/list`, `tools/call`
+    - `resources/list`, `resources/read`, `resources/templates/list`
+  - Tool schemas include titles, descriptions, input/output schemas, and annotations (read-only hints).
+  - Resource templates provide snapshot + query access (e.g. `network://hosts?q=...`).
+- **Search behavior**
+  - Host search is case-insensitive across name/hostname/IP/MAC.
+- **Tests**
+  - Unit tests for REST and MCP search by hostname/IP/MAC, MCP resource reads, and MCP notifications.
+
+## Key Endpoints
+- UI: `http://<host>:5001/`
+- REST:
+  - `GET /api/hosts` (supports `q`, `source`, `limit`)
+  - `GET /api/hosts/<host_id>`
+  - `GET /api/events`
+  - `GET /api/hosts/<host_id>/events`
+  - `GET /api/map`
+- MCP JSON-RPC: `POST /.well-known/mcp.json`
+
+## MCP Tools (JSON-RPC)
+- `list_hosts` (search by hostname/IP/MAC; case-insensitive)
+- `get_host` (optional events)
+- `list_events`
+- `host_events`
+- `network_map`
+
+## MCP Resources
+- `resources/list` -> `network://hosts`, `network://map`, `network://events`
+- `resources/templates/list` -> query templates such as:
+  - `network://hosts{?q,source,limit}`
+  - `network://host/{host_id}{?include_events,events_limit}`
+  - `network://events{?host_id,type,since,limit}`
+
+## Docker & Repo State
+- Repo path: `/var/core/network-mcp`
+- `inventory_targets.yml` lives in the repo and is mounted via compose.
+- Services run via `docker-compose up -d`.
+- Git repo initialized and initial commit created.
+
+## Gotchas / Pitfalls We Hit
+- **MCP handshake**: Codex sent `notifications/initialized` without `id` (notification). Returning a response caused the transport to close. Fixed by treating notifications as no-response.
+- **Case-sensitive search**: Elasticsearch wildcard on `.keyword` fields was case-sensitive, so `seele` didn’t match `SEELE`. Fixed via `case_insensitive: true` in wildcard queries.
+- **Inventory merge duplication**: Initial inventory-only docs were `ip:*` and live docs were `mac:*`, so both existed. Merge now attaches inventory to live MAC records by IP. Legacy `ip:*` docs may remain stale unless cleaned.
+- **MCP errors**: Tool errors are now returned as `CallToolResult` with `isError: true` (instead of JSON-RPC errors), so LLMs can see and correct issues. +- **Service move**: Repo moved from `/var/core/ansible/network-mcp` to `/var/core/network-mcp`. Compose mount paths updated. + +## Verification Performed +- REST search works for hostname/IP/MAC. +- MCP `initialize`, `tools/list`, `tools/call` work. +- MCP resource list/templates/read work. +- Services verified running via `docker-compose up -d`. + +## Future Work Ideas +- **Cleanup**: Add a cleanup job to remove stale `ip:*` docs after successful MAC merge. +- **Resource subscriptions**: Implement `resources/subscribe` if clients need push updates. +- **Auth**: Optional token on the MCP endpoint for shared LAN exposure. +- **More UI**: Add filters/alerts for stale hosts or missing expected ports. +- **Metrics**: Export collector stats to detect scan/ingest failures. +- **Schema mapping**: Improve Elasticsearch mappings for search (e.g., lowercase normalizers for names/hostnames). + diff --git a/stacks/network-mcp/README.md b/stacks/network-mcp/README.md new file mode 100644 index 0000000..f7cdb91 --- /dev/null +++ b/stacks/network-mcp/README.md @@ -0,0 +1,105 @@ +# Network MCP + +A "source of truth" for network devices and ports, backed by Elasticsearch, OPNsense, and Nmap. + +## Architecture + +- **Elasticsearch**: Stores current state (`network-hosts`) and historical events (`network-events-*`). +- **OPNsense Collector**: Fetches DHCP/ARP/DNS data to discover hosts. +- **Nmap Collector**: Scans discovered hosts for open ports and OS info. + +## Setup + +1. **Environment Config** + Copy `.env.example` to `.env` and fill in your details: + ```bash + cp .env.example .env + # Edit .env + ``` + +2. **Bootstrap Elastic** + Run the bootstrap script (requires `requests` installed locally, or you can run it inside a container): + ```bash + python3 scripts/bootstrap_indices.py + ``` + *Note: Ensure you have connectivity to your Elasticsearch instance.* + +3. **Start Services** + ```bash + docker-compose up -d --build + ``` + This brings up the collectors and the lightweight frontend (reachable on port `5001`). + +## Configuration + +- **Static Metadata**: Edit `static/host_metadata.json` to add manual notes, roles, or tags to hosts (keyed by `mac:xx:xx...`). +- **Intervals**: Adjust polling intervals in `.env`. +- **VLAN Discovery (default on)**: Discovery sweeps (`nmap -sn`) run periodically across the OPNsense interfaces listed in `NMAP_DISCOVERY_VLANS`. Adjust the list (or set the flag to `false`) if you only want targeted subnets. +- **Quick vs Full Port Scans**: Each collector loop runs a fast, common-port sweep (`NMAP_QUICK_EXTRA_ARGS`, `NMAP_QUICK_BATCH_SIZE`) while a deeper service scan (`NMAP_PORT_RANGE`, `NMAP_BATCH_SIZE`) is triggered once per `NMAP_FULL_INTERVAL_SECONDS` (default daily). Tune these env vars to balance coverage vs. runtime. +- **Inventory Overlay**: Entries in `./inventory_targets.yml` are mounted into the OPNsense collector and merged by IP—offline/static hosts from that file (names, notes, expected ports) now appear in `network-hosts` with `source: inventory`. + +## Data Model + +- **`network-hosts`**: Current state of every known host. +- **`network-events-YYYY.MM.DD`**: Immutable log of scans and discovery events. 
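+
+For reference, a `network-hosts` document written by the collectors looks roughly like the sketch below (field names follow the collector upsert logic; the host values shown are illustrative only):
+
+```json
+{
+  "host": {
+    "id": "mac:aa:bb:cc:dd:ee:ff",
+    "name": "example-host",
+    "macs": ["aa:bb:cc:dd:ee:ff"],
+    "ips": ["192.168.5.10"],
+    "hostnames": ["example-host.lan"],
+    "sources": ["opnsense-dhcp", "opnsense-arp", "nmap"],
+    "last_seen": "2025-12-29T12:00:00+00:00"
+  },
+  "ports": [
+    {"port": 22, "proto": "tcp", "state": "open", "service": {"name": "ssh"}}
+  ]
+}
+```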
+
+
+## Usage
+
+Query `network-hosts` for the latest view of your network:
+```json
+GET network-hosts/_search
+{
+  "query": {
+    "match_all": {}
+  }
+}
+```
+
+### Quick Frontend
+
+A minimal Flask frontend is bundled in docker-compose (service `frontend`) and is exposed on port `5001` so it can be reached from other machines:
+
+```bash
+docker-compose up -d frontend
+```
+
+Then visit `http://<host>:5001/` to see the merged view (inventory entries are marked with `source: inventory`). If you prefer to run it without Docker for debugging, follow the steps below:
+
+```bash
+cd network-mcp
+python3 -m venv .venv && source .venv/bin/activate
+pip install -r frontend/requirements.txt
+python frontend/app.py
+```
+
+### MCP / API Endpoints
+
+The frontend doubles as a Model Context Protocol server. It exposes the manifest at `/.well-known/mcp.json` (or `/api/mcp`) and supports the standard JSON-RPC handshake (`initialize`, `tools/list`, `tools/call`) on the same URL. Agents can either use the RPC tools below or hit the underlying REST endpoints directly.
+
+- MCP Resources are also available (`resources/list`, `resources/read`, `resources/templates/list`) for clients that prefer resource-style access to snapshots and queries.
+
+- `GET /api/hosts` – merged host list (supports `limit`, `source`, and repeated `q` params to fuzzy search names, hostnames, IPs, or MACs in a single call).
+- `GET /api/hosts/<host_id>` – single host document with optional `include_events=true`.
+- `GET /api/events` – recent scan/discovery events (`limit`, `host_id`, `type`, `since` filters).
+- `GET /api/hosts/<host_id>/events` – scoped events for a host.
+- `GET /api/map` – high-level “network map” grouping hosts by detected /24 (IPv4) or /64 (IPv6).
+
+RPC tool names (mirrored in the manifest) are:
+
+- `list_hosts` – accepts `{limit, source, terms}` and returns the merged host list.
+- `network_map` – optional `{limit}` for building /24-/64 summaries.
+- `get_host` – requires `{host_id}` plus optional `include_events`, `events_limit`.
+- `list_events` – `{limit, host_id, type, since}`.
+- `host_events` – requires `{host_id}` plus optional `limit`, `type`, `since`.
+
+Resource URI examples:
+
+- `network://hosts?q=seele&limit=50`
+- `network://host/mac:dc:a6:32:67:55:dc?include_events=true&events_limit=50`
+- `network://events?type=discovery&limit=100`
+
+All RPC and REST calls share the Elasticsearch credentials from `.env`, so an agent only needs HTTP access to port `5001` to query hosts, notes, and event timelines. Registering the MCP with Codex looks like:
+
+```bash
+codex mcp install network-mcp http://<host>:5001/.well-known/mcp.json
+```
diff --git a/stacks/network-mcp/collectors/common/__init__.py b/stacks/network-mcp/collectors/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/stacks/network-mcp/collectors/common/es_auth.py b/stacks/network-mcp/collectors/common/es_auth.py
new file mode 100644
index 0000000..8fe6c3c
--- /dev/null
+++ b/stacks/network-mcp/collectors/common/es_auth.py
@@ -0,0 +1,55 @@
+import base64
+from typing import Optional, Tuple
+
+
+def _clean(value: Optional[str]) -> str:
+    """
+    Normalize values coming from env files where quotes might be preserved.
+    """
+    if not value:
+        return ""
+    return value.strip().strip('"').strip()
+
+
+def resolve_api_key(api_id: Optional[str], api_key: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Accept various API key formats and return (api_id, api_key).
+    Supported formats:
+    - Explicit ES_API_ID and ES_API_KEY values.
+ - ES_API_KEY that already contains \"id:key\". + - ES_API_KEY that is the base64 encoding of \"id:key\". + """ + cleaned_id = _clean(api_id) + cleaned_key = _clean(api_key) + + if cleaned_id and cleaned_key: + return cleaned_id, cleaned_key + + if not cleaned_key: + return None, None + + # Raw "id:key" format + if ":" in cleaned_key: + potential_id, potential_key = cleaned_key.split(":", 1) + if potential_id and potential_key: + return potential_id, potential_key + + # Base64 encoded "id:key" format + try: + decoded = base64.b64decode(cleaned_key, validate=True).decode() + if ":" in decoded: + potential_id, potential_key = decoded.split(":", 1) + if potential_id and potential_key: + return potential_id, potential_key + except Exception: + pass + + return None, None + + +def build_api_key_header(api_id: str, api_key: str) -> str: + """ + Return the value for the Authorization header using ApiKey auth. + """ + token = base64.b64encode(f"{api_id}:{api_key}".encode()).decode() + return f"ApiKey {token}" diff --git a/stacks/network-mcp/collectors/common/es_client.py b/stacks/network-mcp/collectors/common/es_client.py new file mode 100644 index 0000000..79cdade --- /dev/null +++ b/stacks/network-mcp/collectors/common/es_client.py @@ -0,0 +1,85 @@ +import os +import time +import urllib3 +from elasticsearch import Elasticsearch, helpers +from .es_auth import resolve_api_key +from .logging_config import setup_logging + +# Suppress insecure request warnings if SSL verification is disabled +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +logger = setup_logging("es_client") + +class ESClient: + def __init__(self): + self.url = os.getenv("ES_URL", "http://localhost:9200") + env_api_id = os.getenv("ES_API_ID") + env_api_key = os.getenv("ES_API_KEY") + self.api_id, self.api_key = resolve_api_key(env_api_id, env_api_key) + self.user = os.getenv("ES_USER", "elastic") + self.password = os.getenv("ES_PASS", "changeme") + self.verify_ssl = os.getenv("ES_VERIFY_SSL", "true").lower() == "true" + + if self.api_id and self.api_key: + # Use API key authentication + self.client = Elasticsearch( + self.url, + api_key=(self.api_id, self.api_key), + verify_certs=self.verify_ssl, + ssl_show_warn=False + ) + logger.info("Using Elasticsearch API key authentication.") + else: + # Fallback to basic auth + self.client = Elasticsearch( + self.url, + basic_auth=(self.user, self.password), + verify_certs=self.verify_ssl, + ssl_show_warn=False + ) + logger.info("Using Elasticsearch basic authentication.") + + def check_connection(self): + try: + return self.client.info() + except Exception as e: + logger.error(f"Failed to connect to Elasticsearch: {e}") + raise + + def bulk_index(self, actions): + """ + Bulk index a list of actions. + actions: list of dicts compatible with elasticsearch.helpers.bulk + """ + if not actions: + return 0, [] + + try: + success, failed = helpers.bulk(self.client, actions, stats_only=False, raise_on_error=False) + if failed: + logger.warning(f"Bulk index had failures: {len(failed)} items failed.") + for item in failed[:5]: # Log first 5 failures + logger.warning(f"Failure sample: {item}") + else: + logger.info(f"Bulk index successful: {success} items.") + return success, failed + except Exception as e: + logger.error(f"Bulk index exception: {e}") + raise + + def search_hosts(self, index="network-hosts", query=None, size=1000): + """ + Search for hosts in network-hosts index. 
+ """ + if query is None: + query = {"match_all": {}} + + try: + resp = self.client.search(index=index, query=query, size=size) + return [hit["_source"] for hit in resp["hits"]["hits"]] + except Exception as e: + logger.error(f"Search failed: {e}") + return [] + +def get_es_client(): + return ESClient() diff --git a/stacks/network-mcp/collectors/common/logging_config.py b/stacks/network-mcp/collectors/common/logging_config.py new file mode 100644 index 0000000..5fce818 --- /dev/null +++ b/stacks/network-mcp/collectors/common/logging_config.py @@ -0,0 +1,21 @@ +import logging +import os +import sys + +def setup_logging(name: str = "collector") -> logging.Logger: + """ + Sets up a structured logger. + """ + logger = logging.getLogger(name) + level = os.getenv("LOG_LEVEL", "INFO").upper() + logger.setLevel(level) + + if not logger.handlers: + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter( + '%(asctime)s [%(levelname)s] %(name)s: %(message)s' + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger diff --git a/stacks/network-mcp/collectors/common/nmap_parser.py b/stacks/network-mcp/collectors/common/nmap_parser.py new file mode 100644 index 0000000..010746f --- /dev/null +++ b/stacks/network-mcp/collectors/common/nmap_parser.py @@ -0,0 +1,131 @@ +import subprocess +import xml.etree.ElementTree as ET +import shutil +from typing import List, Dict, Optional +from .logging_config import setup_logging + +logger = setup_logging("nmap_parser") + +def run_nmap_scan(ips: List[str], extra_args: Optional[List[str]] = None) -> List[Dict]: + """ + Run nmap on the given IPs and return a list of parsed host dicts. + """ + if not ips: + return [] + + if not shutil.which("nmap"): + logger.error("nmap binary not found in PATH") + return [] + + # Default args: -oX - (XML to stdout) + cmd = ["nmap", "-oX", "-"] + if extra_args: + cmd.extend(extra_args) + + # Append IPs + cmd.extend(ips) + + logger.info(f"Running nmap command: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + xml_output = result.stdout + return parse_nmap_xml(xml_output) + except subprocess.CalledProcessError as e: + logger.error(f"Nmap failed: {e.stderr}") + return [] + except Exception as e: + logger.error(f"Error running nmap: {e}") + return [] + +def parse_nmap_xml(xml_string: str) -> List[Dict]: + """ + Parse Nmap XML output into our internal host/port structure. 
+ """ + try: + root = ET.fromstring(xml_string) + except ET.ParseError as e: + logger.error(f"Failed to parse Nmap XML: {e}") + return [] + + hosts = [] + + for host_node in root.findall("host"): + # Helper to find basic info + ip = None + mac = None + hostname = None + vendor = None + + # Addresses + for addr in host_node.findall("address"): + addr_type = addr.get("addrtype") + if addr_type == "ipv4": + ip = addr.get("addr") + elif addr_type == "mac": + mac = addr.get("addr") + vendor = addr.get("vendor") + + # Hostnames + hostnames_node = host_node.find("hostnames") + if hostnames_node is not None: + # Pick first for now + hn = hostnames_node.find("hostname") + if hn is not None: + hostname = hn.get("name") + + # Ports + ports = [] + ports_node = host_node.find("ports") + if ports_node is not None: + for port_node in ports_node.findall("port"): + state_node = port_node.find("state") + state = state_node.get("state") if state_node is not None else "unknown" + + # Only care about open ports usually, but keep all for now if needed + if state != "open": + continue + + port_id = int(port_node.get("portid")) + protocol = port_node.get("protocol") + + service_node = port_node.find("service") + service_name = service_node.get("name") if service_node is not None else "unknown" + product = service_node.get("product") if service_node is not None else None + version = service_node.get("version") if service_node is not None else None + + service_def = { + "name": service_name, + } + if product: service_def["product"] = product + if version: service_def["version"] = version + + ports.append({ + "port": port_id, + "proto": protocol, + "state": state, + "service": service_def + }) + + # OS detection (basic) + os_match = None + os_node = host_node.find("os") + if os_node is not None: + os_match_node = os_node.find("osmatch") + if os_match_node is not None: + os_match = { + "name": os_match_node.get("name"), + "accuracy": os_match_node.get("accuracy") + } + + host_data = { + "ip": ip, + "mac": mac, # might be None if scanning remote segment + "hostname": hostname, + "vendor": vendor, + "ports": ports, + "os_match": os_match + } + hosts.append(host_data) + + return hosts diff --git a/stacks/network-mcp/collectors/common/opnsense_client.py b/stacks/network-mcp/collectors/common/opnsense_client.py new file mode 100644 index 0000000..afc2790 --- /dev/null +++ b/stacks/network-mcp/collectors/common/opnsense_client.py @@ -0,0 +1,105 @@ +import os +import requests +import json +import ipaddress +from .logging_config import setup_logging + +logger = setup_logging("opnsense_client") + +class OPNsenseClient: + def __init__(self): + self.base_url = os.getenv("OPNSENSE_URL", "https://192.168.1.1").rstrip('/') + self.api_key = os.getenv("OPNSENSE_API_KEY") + self.api_secret = os.getenv("OPNSENSE_API_SECRET") + self.verify_ssl = os.getenv("ES_VERIFY_SSL", "true").lower() == "true" # Reusing verify flag or add explicit OPNSENSE_VERIFY_SSL + + if not self.api_key or not self.api_secret: + logger.warning("OPNSENSE_API_KEY or OPNSENSE_API_SECRET not set. 
API calls will fail.") + + def _get(self, endpoint, params=None): + url = f"{self.base_url}{endpoint}" + try: + response = requests.get( + url, + auth=(self.api_key, self.api_secret), + verify=self.verify_ssl, + params=params, + timeout=10 + ) + response.raise_for_status() + return response.json() + except Exception as e: + logger.error(f"Failed to fetch {url}: {e}") + return {} + + def get_dhcp_leases_v4(self): + # Endpoint: /api/dhcpv4/leases/search + # Note: 'search' endpoints in OPNsense often expect POST or GET with params for filtering. + # Often a simple GET works for 'searchLeases' or similar. + # Standard OPNsense API for leases might be under /api/dhcpv4/leases/searchLeases + # Let's try the standard search endpoint. + data = self._get("/api/dhcpv4/leases/searchLease") + # API return structure usually: {"rows": [...], "total": ...} + return data.get("rows", []) + + def get_arp_table(self): + # Endpoint: /api/diagnostics/arp/search + # This endpoint returns the ARP table. + data = self._get("/api/diagnostics/interface/getArp") + # Structure varies, let's assume standard response list or rows + # If the standard plugin is used, it might be /api/diagnostics/interface/getArp + # Or /api/diagnostics/network/arp ... + # NOTE: OPNsense API paths can be tricky. /api/diagnostics/interface/getArp is a common one. + # It returns a list directly or a dict with rows. + # Let's assume list of dicts or {"rows": []} + if isinstance(data, list): + return data + return data.get("rows", []) + + def get_dns_overrides(self): + # Endpoint: /api/unbound/settings/searchHostOverride + data = self._get("/api/unbound/settings/searchHostOverride") + return data.get("rows", []) + + def get_vlan_networks(self): + """ + Build a list of IPv4 networks (CIDRs) from the routing table, grouped by interface description. 
+ """ + routes = self._get("/api/diagnostics/interface/getRoutes") + networks = [] + if not isinstance(routes, list): + return networks + + seen = set() + for route in routes: + if route.get("proto") != "ipv4": + continue + destination = route.get("destination") + if not destination or "/" not in destination or destination == "default": + continue + desc = route.get("intf_description") + if not desc: + continue + try: + network = ipaddress.ip_network(destination, strict=False) + except ValueError: + continue + # Skip host routes (/32) which are usually static peers + if network.prefixlen == 32: + continue + if network.prefixlen < 16: + continue + + key = (desc, str(network)) + if key in seen: + continue + seen.add(key) + networks.append({ + "key": desc, + "name": desc, + "cidr": str(network) + }) + return networks + +def get_opnsense_client(): + return OPNsenseClient() diff --git a/stacks/network-mcp/collectors/nmap_collector/Dockerfile b/stacks/network-mcp/collectors/nmap_collector/Dockerfile new file mode 100644 index 0000000..61e92c8 --- /dev/null +++ b/stacks/network-mcp/collectors/nmap_collector/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.11-slim + +RUN apt-get update && apt-get install -y nmap && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY collectors/common /app/collectors/common +COPY collectors/nmap_collector /app/collectors/nmap_collector + +ENV PYTHONPATH=/app + +RUN pip install requests elasticsearch==8.15.1 + +CMD ["python", "collectors/nmap_collector/main.py"] diff --git a/stacks/network-mcp/collectors/nmap_collector/main.py b/stacks/network-mcp/collectors/nmap_collector/main.py new file mode 100644 index 0000000..ccc76e1 --- /dev/null +++ b/stacks/network-mcp/collectors/nmap_collector/main.py @@ -0,0 +1,378 @@ +import os +import time +import datetime +import sys +import json +import shlex + +# Ensure we can import from common +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) + +from collectors.common.es_client import get_es_client +from collectors.common.opnsense_client import get_opnsense_client +from collectors.common.nmap_parser import run_nmap_scan +from collectors.common.logging_config import setup_logging + +logger = setup_logging("nmap_collector") + +def get_now_iso(): + return datetime.datetime.now(datetime.timezone.utc).isoformat() + +def chunk_list(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + +def should_scan_vlan(vlan, allowlist): + if not allowlist: + return True + name = (vlan.get("name") or "").strip() + key = (vlan.get("key") or "").strip() + return name in allowlist or key in allowlist + +def build_discovery_update_action(host_id, mac, ip, hostname, vendor, ts_iso): + mac_norm = mac.lower() if mac else None + upsert_host = { + "host": { + "id": host_id, + "macs": [mac_norm] if mac_norm else [], + "ips": [ip] if ip else [], + "name": hostname, + "hostnames": [hostname] if hostname else [], + "vendor": vendor, + "sources": ["nmap-discovery"], + "last_seen": ts_iso, + "first_seen": ts_iso + } + } + + script_source = """ + if (ctx._source.host == null) { ctx._source.host = [:]; } + if (ctx._source.host.macs == null) { ctx._source.host.macs = []; } + if (ctx._source.host.ips == null) { ctx._source.host.ips = []; } + if (ctx._source.host.hostnames == null) { ctx._source.host.hostnames = []; } + if (ctx._source.host.sources == null) { ctx._source.host.sources = []; } + + if (params.mac != null && !ctx._source.host.macs.contains(params.mac)) { + ctx._source.host.macs.add(params.mac); + } + if (params.ip != null && 
!ctx._source.host.ips.contains(params.ip)) { + ctx._source.host.ips.add(params.ip); + } + if (params.hostname != null && !ctx._source.host.hostnames.contains(params.hostname)) { + ctx._source.host.hostnames.add(params.hostname); + } + if (!ctx._source.host.sources.contains(params.source_tag)) { + ctx._source.host.sources.add(params.source_tag); + } + ctx._source.host.last_seen = params.ts; + if (ctx._source.host.name == null && params.hostname != null) { + ctx._source.host.name = params.hostname; + } + if (params.vendor != null && (ctx._source.host.vendor == null || ctx._source.host.vendor == \"\")) { + ctx._source.host.vendor = params.vendor; + } + """ + + return { + "_index": "network-hosts", + "_op_type": "update", + "_id": host_id, + "script": { + "source": script_source, + "lang": "painless", + "params": { + "mac": mac_norm, + "ip": ip, + "hostname": hostname, + "vendor": vendor, + "ts": ts_iso, + "source_tag": "nmap-discovery" + } + }, + "upsert": upsert_host + } + +def run_vlan_discovery(es, opnsense_client, discovery_args, vlan_filter): + networks = opnsense_client.get_vlan_networks() + if not networks: + logger.info("VLAN discovery skipped: OPNsense returned no interfaces.") + return + + scoped_networks = [n for n in networks if should_scan_vlan(n, vlan_filter)] + if not scoped_networks: + logger.info("VLAN discovery skipped: no interfaces matched NMAP_DISCOVERY_VLANS.") + return + + actions = [] + today = datetime.datetime.now().strftime("%Y.%m.%d") + event_index = f"network-events-{today}" + + for vlan in scoped_networks: + cidr = vlan.get("cidr") + if not cidr: + continue + logger.info(f"VLAN discovery scan for {vlan.get('name')} ({cidr})") + scan_ts = get_now_iso() + scan_id = f"nmap_discovery_{vlan.get('name')}_{scan_ts}" + results = run_nmap_scan([cidr], discovery_args) + + for res in results: + ip = res.get("ip") + if not ip: + continue + + mac = res.get("mac") + hostname = res.get("hostname") + vendor = res.get("vendor") + host_id = f"mac:{mac.lower()}" if mac else None + + event_doc = { + "@timestamp": scan_ts, + "source": "nmap-discovery", + "scan_id": scan_id, + "vlan": vlan.get("name"), + "cidr": cidr, + "host": { + "id": host_id, + "ip": ip, + "mac": mac, + "hostname": hostname, + "vendor": vendor + } + } + actions.append({ + "_index": event_index, + "_op_type": "index", + "_source": event_doc + }) + + if host_id: + actions.append( + build_discovery_update_action(host_id, mac, ip, hostname, vendor, scan_ts) + ) + + if actions: + logger.info(f"VLAN discovery produced {len(actions)} Elasticsearch actions.") + es.bulk_index(actions) + else: + logger.info("VLAN discovery finished with no hosts discovered.") + +def main(): + es = get_es_client() + opnsense_client = get_opnsense_client() + + interval = int(os.getenv("NMAP_INTERVAL_SECONDS", "300")) + full_batch_size = int(os.getenv("NMAP_BATCH_SIZE", "10")) + quick_batch_size = int(os.getenv("NMAP_QUICK_BATCH_SIZE", "30")) + port_range = os.getenv("NMAP_PORT_RANGE", "1-1024") # Full scan range + discovery_enabled = os.getenv("NMAP_DISCOVERY_ENABLED", "false").lower() == "true" + discovery_interval = int(os.getenv("NMAP_DISCOVERY_INTERVAL_SECONDS", "3600")) + discovery_vlan_filter = [v.strip() for v in os.getenv("NMAP_DISCOVERY_VLANS", "").split(",") if v.strip()] + discovery_extra_args = os.getenv("NMAP_DISCOVERY_EXTRA_ARGS", "-sn -n").strip() + if discovery_extra_args: + discovery_extra_args = shlex.split(discovery_extra_args) + else: + discovery_extra_args = ["-sn", "-n"] + discovery_last_run = time.time() - 
discovery_interval if discovery_enabled else 0.0 + full_interval = int(os.getenv("NMAP_FULL_INTERVAL_SECONDS", "86400")) + quick_extra_str = os.getenv("NMAP_QUICK_EXTRA_ARGS", "-sS --top-ports 100 -T4 --open -Pn").strip() + quick_extra_args = shlex.split(quick_extra_str) if quick_extra_str else ["-sS", "--top-ports", "100", "-T4", "--open", "-Pn"] + last_full_scan = time.time() + + # Construct base nmap args + # -sV for service version, -O for OS detection (requires root usually), --open to only show open + # We run as root in docker (usually) or need capabilities. + extra_args = ["-sV", "--open"] + + # Check if port_range looks like a range or specific ports + if port_range: + extra_args.extend(["-p", port_range]) + + # Add user provided extra args + user_args = os.getenv("NMAP_EXTRA_ARGS", "") + if user_args: + extra_args.extend(user_args.split()) + + logger.info("Starting Nmap collector loop...") + + while True: + try: + start_time = time.time() + ts_iso = get_now_iso() + now = time.time() + use_full_scan = (now - last_full_scan) >= full_interval + scan_type = "full" if use_full_scan else "quick" + scan_id = f"nmap_{scan_type}_{ts_iso}" + current_batch_size = full_batch_size if use_full_scan else quick_batch_size + scan_args = extra_args if use_full_scan else quick_extra_args + + if use_full_scan: + last_full_scan = now + logger.info("Running scheduled full service scan.") + else: + logger.info("Running quick common-port sweep.") + + if discovery_enabled and (time.time() - discovery_last_run) >= discovery_interval: + run_vlan_discovery(es, opnsense_client, discovery_extra_args, discovery_vlan_filter) + discovery_last_run = time.time() + + # 1. Get targets from ES + # We only want hosts that have an IP. + hosts = es.search_hosts(index="network-hosts", size=1000) + + # Extract IPs to scan. Map IP -> Host ID to correlate back + targets = [] + ip_to_host_id = {} + + for h in hosts: + # h is {"host": {...}, "ports": [...]} + host_info = h.get("host", {}) + hid = host_info.get("id") + ips = host_info.get("ips", []) + + if not hid or not ips: + continue + + # Pick the "best" IP? Or scan all? + # Scaning all might be duplicate work if they point to same box. + # Let's pick the first one for now. 
+ target_ip = ips[0] + targets.append(target_ip) + ip_to_host_id[target_ip] = hid + + logger.info(f"Found {len(targets)} targets to scan ({scan_type}).") + + total_processed = 0 + logger.info(f"Scanning {scan_type} run with {len(targets)} targets.") + scan_results = run_nmap_scan(targets, scan_args) + actions = [] + today = datetime.datetime.now().strftime("%Y.%m.%d") + event_index = f"network-events-{today}" + + for res in scan_results: + ip = res.get("ip") + if not ip or ip not in ip_to_host_id: + continue + + hid = ip_to_host_id[ip] + total_processed += 1 + + for p in res["ports"]: + p["last_seen"] = ts_iso + p["last_scan_id"] = scan_id + + event_doc = { + "@timestamp": ts_iso, + "source": "nmap", + "scan_id": scan_id, + "host": {"id": hid, "ip": ip}, + "ports": res["ports"], + "os": res.get("os_match") + } + actions.append({ + "_index": event_index, + "_op_type": "index", + "_source": event_doc + }) + + script_source = """ + if (ctx._source.host == null) { ctx._source.host = [:]; } + if (ctx._source.host.sources == null) { ctx._source.host.sources = []; } + if (!ctx._source.host.sources.contains('nmap')) { + ctx._source.host.sources.add('nmap'); + } + ctx._source.host.last_seen = params.ts; + + if (params.os != null) { + ctx._source.host.os = params.os; + } + + if (ctx._source.ports == null) { + ctx._source.ports = []; + } + + for (new_p in params.new_ports) { + boolean found = false; + for (old_p in ctx._source.ports) { + if (old_p.port == new_p.port && old_p.proto == new_p.proto) { + old_p.last_seen = params.ts; + old_p.state = new_p.state; + old_p.service = new_p.service; + old_p.last_scan_id = params.scan_id; + found = true; + break; + } + } + if (!found) { + new_p.first_seen = params.ts; + ctx._source.ports.add(new_p); + } + } + """ + + actions.append({ + "_index": "network-hosts", + "_op_type": "update", + "_id": hid, + "script": { + "source": script_source, + "lang": "painless", + "params": { + "ts": ts_iso, + "os": res.get("os_match"), + "new_ports": res["ports"], + "scan_id": scan_id + } + } + }) + + for p in res["ports"]: + svc_id = f"{hid}:{p['proto']}:{p['port']}" + svc_script = """ + ctx._source.last_seen = params.ts; + ctx._source.state = params.state; + ctx._source.service = params.service; + if (ctx._source.first_seen == null) { + ctx._source.first_seen = params.ts; + } + """ + actions.append({ + "_index": "network-services", + "_op_type": "update", + "_id": svc_id, + "script": { + "source": svc_script, + "lang": "painless", + "params": { + "ts": ts_iso, + "state": p["state"], + "service": p["service"] + } + }, + "upsert": { + "host_id": hid, + "host_ip": ip, + "port": p["port"], + "proto": p["proto"], + "service": p["service"], + "state": p["state"], + "last_seen": ts_iso, + "first_seen": ts_iso, + "sources": ["nmap"] + } + }) + + if actions: + es.bulk_index(actions) + + elapsed = time.time() - start_time + sleep_time = max(0, interval - elapsed) + logger.info(f"Nmap {scan_type} cycle done. Scanned {total_processed} hosts in {elapsed:.2f}s. 
Sleeping {sleep_time:.2f}s") + time.sleep(sleep_time) + + except Exception as e: + logger.error(f"Error in Nmap loop: {e}") + time.sleep(10) + +if __name__ == "__main__": + main() diff --git a/stacks/network-mcp/collectors/opnsense_collector/Dockerfile b/stacks/network-mcp/collectors/opnsense_collector/Dockerfile new file mode 100644 index 0000000..e89e8b4 --- /dev/null +++ b/stacks/network-mcp/collectors/opnsense_collector/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY collectors/common /app/collectors/common +COPY collectors/opnsense_collector /app/collectors/opnsense_collector + +# We need to make sure the module path works. +# The main.py does sys.path.append, but cleanest is to set PYTHONPATH. +ENV PYTHONPATH=/app + +RUN pip install requests elasticsearch==8.15.1 pyyaml + +CMD ["python", "collectors/opnsense_collector/main.py"] diff --git a/stacks/network-mcp/collectors/opnsense_collector/main.py b/stacks/network-mcp/collectors/opnsense_collector/main.py new file mode 100644 index 0000000..d2dc7bd --- /dev/null +++ b/stacks/network-mcp/collectors/opnsense_collector/main.py @@ -0,0 +1,261 @@ +import os +import time +import json +import datetime +import sys +import yaml + +# Ensure we can import from common +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) + +from collectors.common.es_client import get_es_client +from collectors.common.opnsense_client import get_opnsense_client +from collectors.common.logging_config import setup_logging + +logger = setup_logging("opnsense_collector") + +def load_static_metadata(path="/app/static/host_metadata.json"): + if not os.path.exists(path): + logger.info(f"No static metadata found at {path}") + return {} + try: + with open(path, 'r') as f: + return json.load(f) + except Exception as e: + logger.error(f"Failed to load static metadata: {e}") + return {} + +def load_inventory_targets(path=None): + path = path or os.getenv("INVENTORY_FILE", "/app/inventory_targets.yml") + if not os.path.exists(path): + logger.info(f"No inventory targets found at {path}") + return [] + try: + with open(path, 'r') as f: + data = yaml.safe_load(f) or {} + return data.get("inventory_targets", []) + except Exception as e: + logger.error(f"Failed to load inventory targets: {e}") + return [] + +def normalize_mac(mac): + if not mac: + return None + return mac.lower().replace("-", ":") + +def get_now_iso(): + return datetime.datetime.now(datetime.timezone.utc).isoformat() + +def main(): + es = get_es_client() + opn = get_opnsense_client() + + interval = int(os.getenv("COLLECTOR_INTERVAL_SECONDS", "60")) + + logger.info("Starting OPNsense collector loop...") + + while True: + try: + start_time = time.time() + ts_iso = get_now_iso() + + # 1. Fetch Data + dhcp_v4 = opn.get_dhcp_leases_v4() + arp_table = opn.get_arp_table() + dns_overrides = opn.get_dns_overrides() + + static_meta = load_static_metadata() + inventory_entries = load_inventory_targets() + + # 2. Process Data -> hosts map + # Key: identifier (mac:xx... 
or ip:xxx) + hosts_map = {} + + def create_host_entry(identifier): + return { + "id": identifier, + "macs": set(), + "ips": set(), + "hostnames": set(), + "sources": set(), + "preferred_name": None, + "inventory_notes": None, + "inventory_ports": None + } + + def get_or_create_host(mac): + norm_mac = normalize_mac(mac) + if not norm_mac: + return None + identifier = f"mac:{norm_mac}" + host = hosts_map.setdefault(identifier, create_host_entry(identifier)) + host["macs"].add(norm_mac) + return host + + def get_or_create_host_by_ip(ip): + if not ip: + return None + identifier = f"ip:{ip}" + host = hosts_map.setdefault(identifier, create_host_entry(identifier)) + host["ips"].add(ip) + return host + + # Process DHCP + for lease in dhcp_v4: + # Structure depends on OPNsense version, but usually has 'mac', 'address', 'hostname' + mac = lease.get('mac') or lease.get('hw_address') + ip = lease.get('address') or lease.get('ip') + hostname = lease.get('hostname') + + host = get_or_create_host(mac) + if host: + if ip: host["ips"].add(ip) + if hostname: host["hostnames"].add(hostname) + host["sources"].add("opnsense-dhcp") + + # Process ARP + for entry in arp_table: + # Structure: 'mac', 'ip', 'hostname' (sometimes) + mac = entry.get('mac') + ip = entry.get('ip') + hostname = entry.get('hostname') + + host = get_or_create_host(mac) + if host: + if ip: host["ips"].add(ip) + if hostname and hostname != "?": host["hostnames"].add(hostname) + host["sources"].add("opnsense-arp") + + # Process DNS Overrides (mapped by IP when possible) + ip_to_identifier = {} + for identifier, h in hosts_map.items(): + for ip in h["ips"]: + ip_to_identifier[ip] = identifier + + for override in dns_overrides: + ip = override.get('ip') + domain = override.get('domain') + hostname = override.get('hostname') + full_fqdn = f"{hostname}.{domain}" if hostname and domain else hostname + + if ip and ip in ip_to_identifier: + identifier = ip_to_identifier[ip] + if full_fqdn: + hosts_map[identifier]["hostnames"].add(full_fqdn) + hosts_map[identifier]["sources"].add("opnsense-dns") + + # Process inventory targets (by IP) + for entry in inventory_entries: + ip = entry.get("ip") + if not ip: + continue + identifier = ip_to_identifier.get(ip) + if identifier: + host = hosts_map.get(identifier) + if host is None: + host = get_or_create_host_by_ip(ip) + ip_to_identifier[ip] = host["id"] + else: + host = get_or_create_host_by_ip(ip) + if host: + ip_to_identifier[ip] = host["id"] + if not host: + continue + hostname = entry.get("hostname") + name = entry.get("name") + if hostname: + host["hostnames"].add(hostname) + if name: + host["hostnames"].add(name) + host["preferred_name"] = name + host["sources"].add("inventory") + notes = entry.get("notes") + if notes: + host["inventory_notes"] = notes + ports = entry.get("ports") + if ports: + host["inventory_ports"] = ports + + # 3. 
Build Actions + actions = [] + today = datetime.datetime.now().strftime("%Y.%m.%d") + event_index = f"network-events-{today}" + + for _, h_data in hosts_map.items(): + name = h_data.get("preferred_name") + if not name and h_data["hostnames"]: + name = next(iter(h_data["hostnames"])) + + final_host = { + "host": { + "id": h_data["id"], + "macs": list(h_data["macs"]), + "ips": list(h_data["ips"]), + "name": name, + "hostnames": list(h_data["hostnames"]), + "last_seen": ts_iso, + "sources": list(h_data["sources"]) + } + } + + if h_data.get("inventory_notes"): + final_host["host"]["notes"] = h_data["inventory_notes"] + if h_data.get("inventory_ports"): + final_host["host"]["expected_ports"] = h_data["inventory_ports"] + # Merge Static Metadata + if h_data["id"] in static_meta: + meta = static_meta[h_data["id"]] + # Merge fields + for k, v in meta.items(): + final_host["host"][k] = v + + # 3a. Event Document + event_doc = { + "@timestamp": ts_iso, + "source": "opnsense", + "scan_id": f"opnsense_{ts_iso}", + "host": final_host["host"] + } + actions.append({ + "_index": event_index, + "_op_type": "index", + "_source": event_doc + }) + + # 3b. Host Upsert + # We use a script upsert or doc_as_upsert. + # doc_as_upsert is simpler but replaces lists. + # Ideally we merge lists (ips, macs), but for now replacing with latest 'truth' from OPNsense + Static is okay. + # However, we don't want to lose 'ports' info from Nmap. + # So we must NOT overwrite 'ports'. + + host_update_doc = { + "host": final_host["host"] + } + + actions.append({ + "_index": "network-hosts", + "_op_type": "update", + "_id": h_data["id"], + "doc": host_update_doc, + "doc_as_upsert": True + }) + + # 4. Send to ES + if actions: + logger.info(f"Sending {len(actions)} actions to Elasticsearch...") + success, failed = es.bulk_index(actions) + else: + logger.info("No hosts found or no actions generated.") + + elapsed = time.time() - start_time + sleep_time = max(0, interval - elapsed) + logger.info(f"Cycle done in {elapsed:.2f}s. Sleeping for {sleep_time:.2f}s") + time.sleep(sleep_time) + + except Exception as e: + logger.error(f"Error in main loop: {e}") + time.sleep(10) + +if __name__ == "__main__": + main() diff --git a/stacks/network-mcp/docker-compose.yml b/stacks/network-mcp/docker-compose.yml new file mode 100644 index 0000000..4c58d33 --- /dev/null +++ b/stacks/network-mcp/docker-compose.yml @@ -0,0 +1,43 @@ +version: "3.9" + +services: + frontend: + build: + context: . + dockerfile: frontend/Dockerfile + restart: always + env_file: + - .env + environment: + FRONTEND_PORT: "5001" + ports: + - "5001:5001" + + opnsense_collector: + build: + context: . + dockerfile: collectors/opnsense_collector/Dockerfile + restart: always + env_file: + - .env + volumes: + - ./static:/app/static + - ./inventory_targets.yml:/app/inventory_targets.yml:ro + environment: + COLLECTOR_INTERVAL_SECONDS: "60" + INVENTORY_FILE: "/app/inventory_targets.yml" + + nmap_collector: + build: + context: . 
+ dockerfile: collectors/nmap_collector/Dockerfile + restart: always + cap_add: + - NET_RAW + - NET_ADMIN + env_file: + - .env + environment: + NMAP_INTERVAL_SECONDS: "300" + NMAP_PORT_RANGE: "1-1024" + NMAP_BATCH_SIZE: "10" diff --git a/stacks/network-mcp/frontend/Dockerfile b/stacks/network-mcp/frontend/Dockerfile new file mode 100644 index 0000000..a4b6549 --- /dev/null +++ b/stacks/network-mcp/frontend/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +COPY frontend/requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt + +COPY frontend/ /app/ + +EXPOSE 5001 + +CMD ["gunicorn", "--bind", "0.0.0.0:5001", "app:app"] diff --git a/stacks/network-mcp/frontend/__init__.py b/stacks/network-mcp/frontend/__init__.py new file mode 100644 index 0000000..e16e80e --- /dev/null +++ b/stacks/network-mcp/frontend/__init__.py @@ -0,0 +1,2 @@ +"""Network MCP frontend package (used for local testing/imports).""" + diff --git a/stacks/network-mcp/frontend/app.py b/stacks/network-mcp/frontend/app.py new file mode 100644 index 0000000..c84116e --- /dev/null +++ b/stacks/network-mcp/frontend/app.py @@ -0,0 +1,934 @@ +import base64 +import json +import os +from pathlib import Path +from typing import Any, Dict, List, Optional +from urllib.parse import parse_qs, unquote, urlparse + +import requests +from dotenv import load_dotenv +from flask import Flask, abort, jsonify, render_template, request + +BASE_DIR = Path(__file__).resolve().parent.parent +env_path = BASE_DIR / ".env" +if env_path.exists(): + load_dotenv(env_path) + +ES_URL = os.getenv("ES_URL", "http://localhost:9200").rstrip("/") +ES_VERIFY_SSL = os.getenv("ES_VERIFY_SSL", "false").lower() == "true" + +app = Flask(__name__) + +HOST_SEARCH_LIMIT = int(os.getenv("FRONTEND_HOST_LIMIT", "1000")) +DEFAULT_EVENT_LIMIT = int(os.getenv("FRONTEND_EVENT_LIMIT", "200")) +SERVER_VERSION = os.getenv("NETWORK_MCP_VERSION", "0.1.0") + +REST_TOOLS = [ + { + "name": "list_hosts", + "description": "Return the merged view of every known device on the network (searchable by hostname, IP, or MAC).", + "method": "GET", + "path": "/api/hosts", + }, + { + "name": "network_map", + "description": "Summarize hosts grouped by detected /24 (IPv4) or /64 (IPv6) networks.", + "method": "GET", + "path": "/api/map", + }, + { + "name": "get_host", + "description": "Fetch a single host document by ID (e.g. ip:192.168.5.10).", + "method": "GET", + "path": "/api/hosts/{host_id}", + }, + { + "name": "list_events", + "description": "List recent scan/discovery events with filters for host, type, or time range.", + "method": "GET", + "path": "/api/events", + }, + { + "name": "host_events", + "description": "List the recent events associated with a specific host.", + "method": "GET", + "path": "/api/hosts/{host_id}/events", + }, +] + + +def tool_schema(description: str, properties: Dict[str, Any], required: Optional[List[str]] = None, title: Optional[str] = None): + schema: Dict[str, Any] = { + "type": "object", + "description": description, + "properties": properties, + "additionalProperties": False, + } + if required: + schema["required"] = required + if title: + schema["title"] = title + return schema + + +PORT_SCHEMA = tool_schema( + "Observed port entry.", + { + "port": {"type": "integer", "description": "Port number."}, + "state": {"type": "string", "description": "State reported by nmap (e.g. 
open, closed)."}, + "service": {"type": "string", "description": "Detected service name, if available."}, + }, + required=["port"], + title="Port", +) + +HOST_SCHEMA = tool_schema( + "Host summary merged from inventory, OPNsense, and nmap.", + { + "id": {"type": "string", "description": "Stable host identifier (ip:* or mac:*)."}, + "name": {"type": "string", "description": "Best-known display name."}, + "ips": {"type": "array", "items": {"type": "string"}, "description": "Associated IP addresses."}, + "macs": {"type": "array", "items": {"type": "string"}, "description": "Observed MAC addresses."}, + "hostnames": {"type": "array", "items": {"type": "string"}, "description": "DNS or hostnames discovered."}, + "sources": {"type": "array", "items": {"type": "string"}, "description": "Data sources contributing to this record."}, + "last_seen": {"type": "string", "description": "ISO timestamp of the most recent observation."}, + "notes": {"type": "string", "description": "Inventory notes/annotations, if present."}, + "expected_ports": {"type": "array", "items": {"type": "string"}, "description": "Ports expected per inventory targets."}, + "ports": {"type": "array", "items": PORT_SCHEMA, "description": "Latest observed open ports."}, + }, + required=["id"], + title="Host", +) + +EVENT_SCHEMA = tool_schema( + "Scan or discovery event emitted by collectors.", + { + "id": {"type": "string", "description": "Event document identifier."}, + "timestamp": {"type": "string", "description": "Observation timestamp (@timestamp)."}, + "source": {"type": "string", "description": "Collector that produced the event (nmap, opnsense, inventory)."}, + "event": {"type": "object", "description": "Event metadata (type, outcome)."}, + "host": HOST_SCHEMA, + "ports": {"type": "array", "items": PORT_SCHEMA, "description": "Ports included with the event (if any)."}, + }, + required=["id", "timestamp"], + title="Event", +) + +NETWORK_ENTRY_SCHEMA = tool_schema( + "Network grouping entry showing hosts per /24 or /64.", + { + "cidr": {"type": "string", "description": "CIDR label (e.g. 192.168.5.0/24)."}, + "hosts": {"type": "array", "items": HOST_SCHEMA, "description": "Hosts that belong to this network."}, + }, + required=["cidr", "hosts"], + title="NetworkEntry", +) + + +MCP_TOOL_DEFINITIONS = { + "list_hosts": { + "title": "List Hosts", + "description": "Return the merged view of every known device on the network with optional filtering by source or identifier.", + "annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False}, + "inputSchema": tool_schema( + "Filter options when listing hosts.", + { + "limit": {"type": "integer", "minimum": 1, "maximum": 5000, "title": "Limit", "description": "Maximum number of hosts to return."}, + "source": {"type": "string", "title": "Source filter", "description": "Only include hosts that contain this source tag (e.g. inventory, nmap, opnsense-arp)."}, + "terms": { + "type": "array", + "items": {"type": "string"}, + "title": "Search terms", + "description": "Identifiers (names, hostnames, IPs, or MACs) to match. 
Equivalent to repeated q parameters in the REST API.", + }, + }, + title="ListHostsInput", + ), + "outputSchema": tool_schema( + "Host list result payload.", + { + "total": {"type": "integer", "description": "Number of hosts returned."}, + "hosts": {"type": "array", "items": HOST_SCHEMA, "description": "Host entries sorted by last-seen time."}, + }, + required=["total", "hosts"], + title="ListHostsResult", + ), + }, + "network_map": { + "title": "Network Map", + "description": "Summarize hosts grouped by detected /24 (IPv4) or /64 (IPv6) ranges.", + "annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False}, + "inputSchema": tool_schema( + "Options when generating the network grouping.", + { + "limit": {"type": "integer", "minimum": 1, "maximum": 5000, "title": "Host limit", "description": "Maximum number of hosts to consider when building the map."}, + }, + title="NetworkMapInput", + ), + "outputSchema": tool_schema( + "Grouped view of networks and their hosts.", + { + "host_count": {"type": "integer", "description": "Number of hosts examined for this map."}, + "networks": {"type": "array", "items": NETWORK_ENTRY_SCHEMA, "description": "List of network segments and their hosts."}, + }, + required=["host_count", "networks"], + title="NetworkMapResult", + ), + }, + "get_host": { + "title": "Get Host", + "description": "Fetch a single host document by ID, optionally including recent events.", + "annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False}, + "inputSchema": tool_schema( + "Parameters for retrieving an individual host.", + { + "host_id": {"type": "string", "title": "Host ID", "description": "Host identifier (e.g. ip:192.168.5.10, mac:aa:bb:cc...)."}, + "include_events": {"type": "boolean", "title": "Include events", "description": "If true, include recent events for the host."}, + "events_limit": {"type": "integer", "minimum": 1, "maximum": 1000, "title": "Events limit", "description": "Number of events to include if requested."}, + }, + required=["host_id"], + title="GetHostInput", + ), + "outputSchema": tool_schema( + "Host payload with optional embedded events.", + { + "host": HOST_SCHEMA, + "events": {"type": "array", "items": EVENT_SCHEMA, "description": "Recent events when include_events=true."}, + }, + required=["host"], + title="GetHostResult", + ), + }, + "list_events": { + "title": "List Events", + "description": "List recent scan/discovery events with optional filters.", + "annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False}, + "inputSchema": tool_schema( + "Filters applied when querying events.", + { + "limit": {"type": "integer", "minimum": 1, "maximum": 1000, "title": "Limit", "description": "Maximum number of events to return."}, + "host_id": {"type": "string", "title": "Host filter", "description": "Only include events for this host identifier."}, + "type": {"type": "string", "title": "Event type", "description": "Restrict to a specific event type (e.g. 
scan, discovery)."}, + "since": {"type": "string", "title": "Since timestamp", "description": "ISO8601 timestamp used as a lower bound for @timestamp."}, + }, + title="ListEventsInput", + ), + "outputSchema": tool_schema( + "Event search result.", + { + "total": {"type": "integer", "description": "Number of events returned."}, + "events": {"type": "array", "items": EVENT_SCHEMA, "description": "Event documents sorted by timestamp."}, + }, + required=["total", "events"], + title="ListEventsResult", + ), + }, + "host_events": { + "title": "Host Events", + "description": "List recent events associated with a specific host.", + "annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False}, + "inputSchema": tool_schema( + "Parameters when retrieving events bound to a single host.", + { + "host_id": {"type": "string", "title": "Host ID", "description": "Host identifier to filter by."}, + "limit": {"type": "integer", "minimum": 1, "maximum": 1000, "title": "Limit", "description": "Maximum number of events to return."}, + "type": {"type": "string", "title": "Event type", "description": "Restrict to a specific event type (e.g. scan, discovery)."}, + "since": {"type": "string", "title": "Since timestamp", "description": "ISO8601 timestamp used as a lower bound for @timestamp."}, + }, + required=["host_id"], + title="HostEventsInput", + ), + "outputSchema": tool_schema( + "Event list scoped to a host.", + { + "total": {"type": "integer", "description": "Number of events returned for the host."}, + "events": {"type": "array", "items": EVENT_SCHEMA, "description": "Host-specific event entries."}, + }, + required=["total", "events"], + title="HostEventsResult", + ), + }, +} + + +def resolve_api_key(api_id: str, api_key: str): + if api_id and api_key: + return api_id, api_key + if not api_key: + return None, None + if ":" in api_key: + possible_id, possible_key = api_key.split(":", 1) + return possible_id, possible_key + try: + decoded = base64.b64decode(api_key).decode() + if ":" in decoded: + possible_id, possible_key = decoded.split(":", 1) + return possible_id, possible_key + except Exception: + pass + return None, None + + +def build_es_request(): + headers = {} + auth = None + + api_id = os.getenv("ES_API_ID") + api_key = os.getenv("ES_API_KEY") + api_id, api_key = resolve_api_key(api_id, api_key) + if api_id and api_key: + token = base64.b64encode(f"{api_id}:{api_key}".encode()).decode() + headers["Authorization"] = f"ApiKey {token}" + else: + auth = (os.getenv("ES_USER", "elastic"), os.getenv("ES_PASS", "changeme")) + return headers, auth + + +def normalize_host(doc: Dict) -> Dict: + host = doc.get("host", {}) + ports = doc.get("ports", []) + return { + "id": host.get("id"), + "name": host.get("name") or host.get("id"), + "ips": host.get("ips", []), + "macs": host.get("macs", []), + "hostnames": host.get("hostnames", []), + "sources": host.get("sources", []), + "last_seen": host.get("last_seen"), + "notes": host.get("notes"), + "expected_ports": host.get("expected_ports", []), + "ports": [ + { + "port": p.get("port"), + "state": p.get("state"), + "service": (p.get("service") or {}).get("name"), + } + for p in ports + ], + } + + +def parse_search_terms(raw_terms: List[str]) -> List[str]: + terms: List[str] = [] + for raw in raw_terms: + if not raw: + continue + cleaned = raw.replace(",", " ") + for chunk in cleaned.split(): + chunk = chunk.strip() + if chunk: + terms.append(chunk) + return terms + + +def coerce_string_list(value: Any) -> List[str]: + if value is None: + 
return [] + if isinstance(value, str): + return [value] + if isinstance(value, (list, tuple)): + return [str(item) for item in value if item is not None] + return [] + + +def clamp_int(value: Any, default: int, min_value: int, max_value: int) -> int: + try: + if value is None: + return default + parsed = int(value) + except (TypeError, ValueError): + return default + return max(min_value, min(max_value, parsed)) + + +def coerce_bool(value: Any, default: bool = False) -> bool: + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.lower() in {"1", "true", "yes", "on"} + return default + + +def build_search_clause(term: str) -> Dict: + wildcard = f"*{term}*" + return { + "bool": { + "should": [ + {"wildcard": {"host.name.keyword": {"value": wildcard, "case_insensitive": True}}}, + {"wildcard": {"host.hostnames.keyword": {"value": wildcard, "case_insensitive": True}}}, + {"wildcard": {"host.id.keyword": {"value": wildcard, "case_insensitive": True}}}, + {"wildcard": {"host.ips": {"value": wildcard, "case_insensitive": True}}}, + {"wildcard": {"host.macs": {"value": wildcard, "case_insensitive": True}}}, + ], + "minimum_should_match": 1, + } + } + + +def fetch_hosts(limit: int = HOST_SEARCH_LIMIT, source: Optional[str] = None, search_terms: Optional[List[str]] = None): + headers, auth = build_es_request() + body = { + "size": limit, + "sort": [{"host.last_seen": {"order": "desc"}}], + } + filters: List[Dict] = [] + if source: + filters.append({"term": {"host.sources.keyword": source}}) + if search_terms: + should_clauses = [build_search_clause(term) for term in search_terms] + filters.append({"bool": {"should": should_clauses, "minimum_should_match": 1}}) + if filters: + body["query"] = {"bool": {"filter": filters}} + resp = requests.get( + f"{ES_URL}/network-hosts/_search", + json=body, + headers=headers, + auth=auth, + verify=ES_VERIFY_SSL, + ) + resp.raise_for_status() + return [normalize_host(hit.get("_source", {})) for hit in resp.json()["hits"]["hits"]] + + +def fetch_host_by_id(host_id: str) -> Optional[Dict]: + headers, auth = build_es_request() + body = {"size": 1, "query": {"term": {"host.id.keyword": host_id}}} + resp = requests.get( + f"{ES_URL}/network-hosts/_search", + json=body, + headers=headers, + auth=auth, + verify=ES_VERIFY_SSL, + ) + resp.raise_for_status() + hits = resp.json()["hits"]["hits"] + if not hits: + return None + return normalize_host(hits[0].get("_source", {})) + + +def fetch_events(host_id: Optional[str] = None, limit: int = DEFAULT_EVENT_LIMIT, event_type: Optional[str] = None, since: Optional[str] = None): + headers, auth = build_es_request() + filters: List[Dict] = [] + if host_id: + filters.append({"term": {"host.id.keyword": host_id}}) + if event_type: + filters.append({"term": {"event.type.keyword": event_type}}) + if since: + filters.append({"range": {"@timestamp": {"gte": since}}}) + body: Dict = { + "size": limit, + "sort": [{"@timestamp": {"order": "desc"}}], + } + if filters: + body["query"] = {"bool": {"filter": filters}} + resp = requests.get( + f"{ES_URL}/network-events-*/_search", + json=body, + headers=headers, + auth=auth, + verify=ES_VERIFY_SSL, + ) + if resp.status_code == 404: + return [] + resp.raise_for_status() + events = [] + for hit in resp.json()["hits"]["hits"]: + doc = hit.get("_source", {}) + events.append( + { + "id": hit.get("_id"), + "timestamp": doc.get("@timestamp"), + "event": doc.get("event", {}), + "host": doc.get("host", {}), + "observed": 
doc.get("observed"), + "scan": doc.get("scan"), + "ports": doc.get("ports", []), + "source": doc.get("source"), + } + ) + return events + + +def derive_network_label(ip: str) -> str: + if not ip: + return "unknown" + if ":" in ip: + parts = ip.split(":") + prefix = ":".join(parts[:4]) + return f"{prefix}::/64" + octets = ip.split(".") + if len(octets) == 4: + return f"{octets[0]}.{octets[1]}.{octets[2]}.0/24" + return "unknown" + + +def build_network_map(hosts: List[Dict]): + networks: Dict[str, Dict] = {} + for host in hosts: + seen = set() + for ip in host.get("ips", []): + label = derive_network_label(ip) + if label in seen: + continue + seen.add(label) + entry = networks.setdefault(label, {"cidr": label, "hosts": []}) + entry["hosts"].append( + { + "id": host.get("id"), + "name": host.get("name"), + "ips": host.get("ips", []), + "sources": host.get("sources", []), + "last_seen": host.get("last_seen"), + } + ) + sorted_networks = sorted(networks.values(), key=lambda n: n["cidr"]) + for entry in sorted_networks: + entry["hosts"].sort(key=lambda h: h.get("name") or h.get("id") or "") + return sorted_networks + + +def bool_arg(value: Optional[str], default: bool = False) -> bool: + if value is None: + return default + return value.lower() in {"1", "true", "yes", "on"} + + +def build_manifest(base_url: str) -> Dict: + base = base_url.rstrip("/") + tools = [] + for tool in REST_TOOLS: + tools.append( + { + "name": tool["name"], + "description": tool["description"], + "method": tool["method"], + "path": tool["path"], + "url": f"{base}{tool['path']}", + } + ) + return { + "name": "network-mcp", + "description": "Network discovery source-of-truth backed by Elasticsearch, Nmap, and OPNsense.", + "schema": "1.0", + "tools": tools, + "auth": "env", + } + + +def tool_result(summary: str, data: Dict[str, Any]): + return summary, data + + +def handle_tool_list_hosts(arguments: Dict[str, Any]): + limit = clamp_int(arguments.get("limit"), HOST_SEARCH_LIMIT, 1, 5000) + raw_terms = coerce_string_list(arguments.get("terms")) + search_terms = parse_search_terms(raw_terms) + hosts = fetch_hosts(limit=limit, source=arguments.get("source"), search_terms=search_terms or None) + return tool_result(f"Returned {len(hosts)} hosts.", {"hosts": hosts, "total": len(hosts)}) + + +def handle_tool_network_map(arguments: Dict[str, Any]): + limit = clamp_int(arguments.get("limit"), HOST_SEARCH_LIMIT, 1, 5000) + hosts = fetch_hosts(limit=limit) + network_map = build_network_map(hosts) + return tool_result(f"Computed {len(network_map)} networks.", {"networks": network_map, "host_count": len(hosts)}) + + +def handle_tool_get_host(arguments: Dict[str, Any]): + host_id = arguments.get("host_id") + if not host_id: + raise ValueError("host_id is required") + host = fetch_host_by_id(host_id) + if not host: + raise KeyError(f"Host {host_id} not found") + include_events = coerce_bool(arguments.get("include_events"), default=False) + result = {"host": host} + if include_events: + events_limit = clamp_int(arguments.get("events_limit"), DEFAULT_EVENT_LIMIT, 1, 1000) + result["events"] = fetch_events(host_id=host_id, limit=events_limit) + return tool_result(f"Fetched host {host_id}.", result) + + +def handle_tool_list_events(arguments: Dict[str, Any]): + limit = clamp_int(arguments.get("limit"), DEFAULT_EVENT_LIMIT, 1, 1000) + events = fetch_events( + host_id=arguments.get("host_id"), + limit=limit, + event_type=arguments.get("type"), + since=arguments.get("since"), + ) + return tool_result(f"Returned {len(events)} events.", {"events": 
events, "total": len(events)}) + + +def handle_tool_host_events(arguments: Dict[str, Any]): + host_id = arguments.get("host_id") + if not host_id: + raise ValueError("host_id is required") + limit = clamp_int(arguments.get("limit"), DEFAULT_EVENT_LIMIT, 1, 1000) + events = fetch_events(host_id=host_id, limit=limit, event_type=arguments.get("type"), since=arguments.get("since")) + return tool_result(f"Returned {len(events)} events for {host_id}.", {"events": events, "total": len(events)}) + + +TOOL_HANDLERS = { + "list_hosts": handle_tool_list_hosts, + "network_map": handle_tool_network_map, + "get_host": handle_tool_get_host, + "list_events": handle_tool_list_events, + "host_events": handle_tool_host_events, +} + + +def list_mcp_tools(): + tools = [] + for name, meta in MCP_TOOL_DEFINITIONS.items(): + tool = { + "name": name, + "description": meta.get("description"), + "inputSchema": meta.get("inputSchema", {"type": "object"}), + } + title = meta.get("title") + if title: + tool["title"] = title + output_schema = meta.get("outputSchema") + if output_schema: + tool["outputSchema"] = output_schema + annotations = meta.get("annotations") + if annotations: + tool["annotations"] = annotations + tools.append(tool) + return tools + + +def call_tool_by_name(name: str, arguments: Optional[Dict[str, Any]] = None): + if name not in TOOL_HANDLERS: + raise KeyError(f"Unknown tool: {name}") + handler = TOOL_HANDLERS[name] + summary, data = handler(arguments or {}) + return summary, data + + +def list_mcp_resources(base_uri: str = "network://"): + return [ + { + "uri": f"{base_uri}hosts", + "name": "hosts", + "title": "Hosts (Snapshot)", + "mimeType": "application/json", + "description": "Snapshot of merged hosts (inventory + opnsense + nmap). Use resources/templates/list for search parameters.", + }, + { + "uri": f"{base_uri}map", + "name": "map", + "title": "Network Map (Snapshot)", + "mimeType": "application/json", + "description": "Snapshot of networks grouped by /24 (IPv4) or /64 (IPv6).", + }, + { + "uri": f"{base_uri}events", + "name": "events", + "title": "Recent Events (Snapshot)", + "mimeType": "application/json", + "description": "Recent scan/discovery events. Use resources/templates/list for filters (host_id/type/since).", + }, + ] + + +def list_mcp_resource_templates(base_uri: str = "network://"): + return [ + { + "uriTemplate": f"{base_uri}hosts{{?q,source,limit}}", + "name": "hosts_query", + "title": "Hosts Query", + "mimeType": "application/json", + "description": "Query hosts by q (hostname/IP/MAC/name, case-insensitive), source, and limit. Repeat q to provide multiple terms.", + }, + { + "uriTemplate": f"{base_uri}host/{{host_id}}{{?include_events,events_limit}}", + "name": "host_detail", + "title": "Host Detail", + "mimeType": "application/json", + "description": "Fetch a single host by host_id (e.g. mac:aa:bb.. or ip:192.168.5.10). 
Optionally include events.", + }, + { + "uriTemplate": f"{base_uri}events{{?host_id,type,since,limit}}", + "name": "events_query", + "title": "Events Query", + "mimeType": "application/json", + "description": "Query recent events with optional filters host_id, type, since (ISO8601), and limit.", + }, + { + "uriTemplate": f"{base_uri}map{{?limit}}", + "name": "map_query", + "title": "Network Map", + "mimeType": "application/json", + "description": "Build a network map from up to limit hosts.", + }, + ] + + +def read_mcp_resource(uri: str): + parsed = urlparse(uri) + if parsed.scheme != "network": + raise ValueError(f"Unsupported resource URI scheme: {parsed.scheme}") + + netloc = parsed.netloc + query = parse_qs(parsed.query or "") + + if netloc == "hosts": + limit = clamp_int((query.get("limit") or [HOST_SEARCH_LIMIT])[0], HOST_SEARCH_LIMIT, 1, 5000) + source = (query.get("source") or [None])[0] + q_terms = query.get("q") or [] + search_terms = parse_search_terms(q_terms) + payload = {"hosts": fetch_hosts(limit=limit, source=source, search_terms=search_terms or None)} + payload["total"] = len(payload["hosts"]) + return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]} + + if netloc == "map": + limit = clamp_int((query.get("limit") or [HOST_SEARCH_LIMIT])[0], HOST_SEARCH_LIMIT, 1, 5000) + hosts = fetch_hosts(limit=limit) + payload = {"networks": build_network_map(hosts), "host_count": len(hosts)} + return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]} + + if netloc == "events": + limit = clamp_int((query.get("limit") or [DEFAULT_EVENT_LIMIT])[0], DEFAULT_EVENT_LIMIT, 1, 1000) + host_id = (query.get("host_id") or [None])[0] + event_type = (query.get("type") or [None])[0] + since = (query.get("since") or [None])[0] + events = fetch_events(host_id=host_id, limit=limit, event_type=event_type, since=since) + payload = {"events": events, "total": len(events)} + return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]} + + if netloc == "host": + host_id = unquote((parsed.path or "").lstrip("/")) + if not host_id: + raise ValueError("Host resource requires / path") + include_events = coerce_bool((query.get("include_events") or [False])[0], default=False) + events_limit = clamp_int((query.get("events_limit") or [DEFAULT_EVENT_LIMIT])[0], DEFAULT_EVENT_LIMIT, 1, 1000) + host = fetch_host_by_id(host_id) + if not host: + raise KeyError(f"Host {host_id} not found") + payload = {"host": host} + if include_events: + payload["events"] = fetch_events(host_id=host_id, limit=events_limit) + return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]} + + raise ValueError(f"Unknown resource URI: {uri}") + + +def jsonrpc_error(rpc_id: Any, code: int, message: str): + return { + "jsonrpc": "2.0", + "id": rpc_id, + "error": {"code": code, "message": message}, + } + + +def build_initialize_result(protocol_version: Optional[str] = None): + protocol_version = protocol_version or "2025-11-25" + return { + "protocolVersion": protocol_version, + "capabilities": { + "tools": {"listChanged": False}, + "resources": {"listChanged": False, "subscribe": False}, + }, + "serverInfo": {"name": "network-mcp", "version": SERVER_VERSION}, + "instructions": "Start with list_hosts (search by hostname/IP/MAC), then use get_host for details and list_events/host_events for timelines; network_map gives a quick /24-/64 overview.", + } + + +def process_rpc_request(payload: Dict[str, 
Any]): + if not isinstance(payload, dict): + return jsonrpc_error(None, -32600, "Invalid request") + rpc_id = payload.get("id") + method = payload.get("method") + params = payload.get("params") or {} + is_notification = rpc_id is None + + if method == "initialize": + requested = params.get("protocolVersion") + requested_str = str(requested) if requested is not None else None + return {"jsonrpc": "2.0", "id": rpc_id, "result": build_initialize_result(requested_str)} + + if method == "ping": + return {"jsonrpc": "2.0", "id": rpc_id, "result": {}} + + if method == "tools/list": + result = {"tools": list_mcp_tools(), "nextCursor": None} + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + + if method == "resources/list": + result = {"resources": list_mcp_resources(), "nextCursor": None} + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + + if method == "resources/templates/list": + result = {"resourceTemplates": list_mcp_resource_templates(), "nextCursor": None} + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + + if method == "resources/read": + uri = (params or {}).get("uri") + if not uri: + return jsonrpc_error(rpc_id, -32602, "uri is required") + try: + result = read_mcp_resource(uri) + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + except ValueError as exc: + return jsonrpc_error(rpc_id, -32602, str(exc)) + except KeyError as exc: + message = exc.args[0] if exc.args else str(exc) + return jsonrpc_error(rpc_id, -32004, message) + + if method == "notifications/initialized": + # No response for notifications. + return None + + if method == "tools/call": + name = params.get("name") + if not name: + if is_notification: + return None + return jsonrpc_error(rpc_id, -32602, "Tool name is required") + arguments = params.get("arguments") or {} + try: + summary, data = call_tool_by_name(name, arguments) + result = { + "content": [{"type": "text", "text": summary}], + "structuredContent": data, + "isError": False, + } + if is_notification: + return None + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + except ValueError as exc: + if is_notification: + return None + result = { + "content": [{"type": "text", "text": f"Tool argument error: {exc}"}], + "structuredContent": {"error": str(exc)}, + "isError": True, + } + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + except KeyError as exc: + message = exc.args[0] if exc.args else str(exc) + if is_notification: + return None + result = { + "content": [{"type": "text", "text": f"Tool error: {message}"}], + "structuredContent": {"error": message}, + "isError": True, + } + return {"jsonrpc": "2.0", "id": rpc_id, "result": result} + except Exception as exc: # pragma: no cover - defensive + if is_notification: + return None + return jsonrpc_error(rpc_id, -32603, f"Internal error: {exc}") + + if is_notification: + return None + + return jsonrpc_error(rpc_id, -32601, f"Method {method} not found") + + +def process_rpc_envelope(payload: Any): + if isinstance(payload, list): + responses = [] + for entry in payload: + response = process_rpc_request(entry) + if response is not None: + responses.append(response) + return responses + if isinstance(payload, dict): + return process_rpc_request(payload) + return jsonrpc_error(None, -32600, "Invalid request") + + +@app.route("/api/hosts") +def api_hosts(): + limit = min(int(request.args.get("limit", HOST_SEARCH_LIMIT)), 5000) + q_args = request.args.getlist("q") + search_terms = parse_search_terms(q_args) + hosts = fetch_hosts( + limit=limit, + 
source=request.args.get("source"), + search_terms=search_terms if search_terms else None, + ) + return jsonify({"hosts": hosts, "total": len(hosts)}) + + +@app.route("/api/hosts/") +def api_host_detail(host_id: str): + host = fetch_host_by_id(host_id) + if not host: + abort(404, description=f"Host {host_id} not found") + include_events = bool_arg(request.args.get("include_events"), default=False) + result = {"host": host} + if include_events: + limit = min(int(request.args.get("events_limit", DEFAULT_EVENT_LIMIT)), 1000) + result["events"] = fetch_events(host_id=host_id, limit=limit) + return jsonify(result) + + +@app.route("/api/events") +def api_events(): + limit = min(int(request.args.get("limit", DEFAULT_EVENT_LIMIT)), 1000) + events = fetch_events( + host_id=request.args.get("host_id"), + limit=limit, + event_type=request.args.get("type"), + since=request.args.get("since"), + ) + return jsonify({"events": events, "total": len(events)}) + + +@app.route("/api/hosts//events") +def api_host_events(host_id: str): + limit = min(int(request.args.get("limit", DEFAULT_EVENT_LIMIT)), 1000) + events = fetch_events(host_id=host_id, limit=limit, event_type=request.args.get("type"), since=request.args.get("since")) + return jsonify({"events": events, "total": len(events)}) + + +@app.route("/api/map") +def api_map(): + limit = min(int(request.args.get("limit", HOST_SEARCH_LIMIT)), 5000) + hosts = fetch_hosts(limit=limit) + network_map = build_network_map(hosts) + return jsonify({"networks": network_map, "host_count": len(hosts)}) + + +@app.route("/.well-known/mcp.json", methods=["GET", "POST", "OPTIONS"]) +@app.route("/api/mcp", methods=["GET", "POST", "OPTIONS"]) +def api_manifest(): + if request.method == "OPTIONS": + return ("", 204, {"Allow": "GET,POST,OPTIONS"}) + if request.method == "POST": + payload = request.get_json(silent=True) + if payload is None: + return jsonify(jsonrpc_error(None, -32700, "Invalid JSON")), 400 + rpc_response = process_rpc_envelope(payload) + if rpc_response is None or (isinstance(rpc_response, list) and not rpc_response): + return ("", 204) + return jsonify(rpc_response) + manifest = build_manifest(request.url_root.rstrip("/")) + return jsonify(manifest) + + +@app.route("/") +def index(): + hosts = fetch_hosts() + total = len(hosts) + with_ports = sum(1 for h in hosts if h["ports"]) + inventory_hosts = sum(1 for h in hosts if "inventory" in h["sources"]) + return render_template( + "index.html", + hosts=hosts, + total=total, + with_ports=with_ports, + inventory_hosts=inventory_hosts, + es_url=ES_URL, + ) + + +if __name__ == "__main__": + app.run(host="0.0.0.0", port=int(os.getenv("FRONTEND_PORT", "5001"))) diff --git a/stacks/network-mcp/frontend/requirements.txt b/stacks/network-mcp/frontend/requirements.txt new file mode 100644 index 0000000..755a326 --- /dev/null +++ b/stacks/network-mcp/frontend/requirements.txt @@ -0,0 +1,4 @@ +Flask==2.2.5 +requests==2.31.0 +python-dotenv==0.21.1 +gunicorn==21.2.0 diff --git a/stacks/network-mcp/frontend/templates/index.html b/stacks/network-mcp/frontend/templates/index.html new file mode 100644 index 0000000..a8bc2f3 --- /dev/null +++ b/stacks/network-mcp/frontend/templates/index.html @@ -0,0 +1,206 @@ + + + + + + Network MCP Hosts + + + +
+

Network MCP Overview

+

Elasticsearch: {{ es_url }}

+
+ +
+
+

Total Hosts

+

{{ total }}

+
+
+

With Port Data

+

{{ with_ports }}

+
+
+

Inventory Entries

+

{{ inventory_hosts }}

+
+
+ +
+ {% for host in hosts %} +
+

{{ host.name }} + {% if host.notes %} + 📝 + {% endif %} +

+
+ {% for source in host.sources %} + {% set tag_class = "" %} + {% if source == "inventory" %} + {% set tag_class = "source-tag-inventory" %} + {% elif source.startswith("opnsense") %} + {% set tag_class = "source-tag-opnsense" %} + {% elif source == "nmap" %} + {% set tag_class = "source-tag-nmap" %} + {% elif source == "nmap-discovery" %} + {% set tag_class = "source-tag-discovery" %} + {% endif %} + {{ source }} + {% endfor %} +
+
+ IPs: {{ host.ips|join(", ") if host.ips else "—" }} +
+ {% if host.macs %} +
+ MACs: {{ host.macs|join(", ") }} +
+ {% endif %} + {% if host.hostnames %} +
+ Hostnames: {{ host.hostnames|join(", ") }} +
+ {% endif %} +
+ Last seen: {{ host.last_seen or "unknown" }} +
+ {% if host.notes %} +
+ Notes: {{ host.notes }} +
+ {% endif %} + {% if host.expected_ports %} +
+

Expected Ports

+
+ {% for port in host.expected_ports %} + {{ port }} + {% endfor %} +
+
+ {% endif %} + {% if host.ports %} +
+

Observed Ports

+
+ {% for port in host.ports %} + {{ port.port }} {{ port.service or "" }} + {% endfor %} +
+
+ {% endif %} +
+ {% endfor %} +
+ + diff --git a/stacks/network-mcp/frontend/tests/__init__.py b/stacks/network-mcp/frontend/tests/__init__.py new file mode 100644 index 0000000..c3ab24e --- /dev/null +++ b/stacks/network-mcp/frontend/tests/__init__.py @@ -0,0 +1,2 @@ +"""Unit tests for the Network MCP frontend.""" + diff --git a/stacks/network-mcp/frontend/tests/test_mcp.py b/stacks/network-mcp/frontend/tests/test_mcp.py new file mode 100644 index 0000000..4e249a8 --- /dev/null +++ b/stacks/network-mcp/frontend/tests/test_mcp.py @@ -0,0 +1,203 @@ +import json +import unittest +from unittest.mock import patch + + +class FakeResponse: + def __init__(self, payload, status_code=200): + self._payload = payload + self.status_code = status_code + + def json(self): + return self._payload + + def raise_for_status(self): + if self.status_code >= 400: + raise RuntimeError(f"HTTP {self.status_code}") + + +def _wildcard_match(pattern: str, value: str, case_insensitive: bool) -> bool: + if value is None: + return False + if case_insensitive: + pattern = pattern.lower() + value = value.lower() + if pattern.startswith("*") and pattern.endswith("*"): + needle = pattern.strip("*") + return needle in value + return pattern == value + + +def _extract_wildcard_clause(field_clause): + # Supports either {"field": "*term*"} or {"field": {"value":"*term*", "case_insensitive":true}} + if not isinstance(field_clause, dict): + return None, None, None + if len(field_clause) != 1: + return None, None, None + field, value = next(iter(field_clause.items())) + if isinstance(value, str): + return field, value, False + if isinstance(value, dict): + return field, value.get("value"), bool(value.get("case_insensitive")) + return None, None, None + + +def _filter_hosts_by_query(host_docs, query): + if not query: + return host_docs + bool_query = query.get("bool") if isinstance(query, dict) else None + if not bool_query: + return host_docs + filters = bool_query.get("filter") or [] + if not filters: + return host_docs + + matched = host_docs + for f in filters: + if "term" in f and "host.sources.keyword" in f["term"]: + src = f["term"]["host.sources.keyword"] + matched = [h for h in matched if src in (h.get("host", {}).get("sources") or [])] + continue + + if "bool" in f and "should" in f["bool"]: + shoulds = f["bool"]["should"] + + def matches_any(host_doc): + host = host_doc.get("host", {}) + haystacks = { + "host.name.keyword": [host.get("name")], + "host.hostnames.keyword": host.get("hostnames") or [], + "host.id.keyword": [host.get("id")], + "host.ips": host.get("ips") or [], + "host.macs": host.get("macs") or [], + } + for clause in shoulds: + if "bool" in clause and "should" in clause["bool"]: + # nested should from multiple search terms + nested_shoulds = clause["bool"]["should"] + for nested in nested_shoulds: + if "wildcard" not in nested: + continue + field, value, ci = _extract_wildcard_clause(nested["wildcard"]) + if not field or value is None: + continue + for candidate in haystacks.get(field, []): + if _wildcard_match(value, str(candidate or ""), ci): + return True + if "wildcard" in clause: + field, value, ci = _extract_wildcard_clause(clause["wildcard"]) + if not field or value is None: + continue + for candidate in haystacks.get(field, []): + if _wildcard_match(value, str(candidate or ""), ci): + return True + return False + + matched = [h for h in matched if matches_any(h)] + continue + return matched + + +class TestNetworkMCP(unittest.TestCase): + def setUp(self): + from frontend import app as app_module + + self.app_module = app_module + 
self.client = app_module.app.test_client() + + self.host_docs = [ + { + "host": { + "id": "mac:dc:a6:32:67:55:dc", + "name": "SEELE", + "hostnames": ["SEELE"], + "ips": ["192.168.5.208"], + "macs": ["dc:a6:32:67:55:dc"], + "sources": ["opnsense-dhcp", "opnsense-arp"], + "last_seen": "2025-12-14T16:27:15.427091+00:00", + }, + "ports": [{"port": 22, "state": "open", "service": {"name": "ssh"}}], + }, + { + "host": { + "id": "mac:aa:bb:cc:dd:ee:ff", + "name": "core", + "hostnames": ["core.localdomain"], + "ips": ["192.168.5.34"], + "macs": ["aa:bb:cc:dd:ee:ff"], + "sources": ["inventory", "opnsense-arp"], + "last_seen": "2025-12-14T16:27:15.427091+00:00", + "notes": "Production Docker host", + }, + "ports": [{"port": 443, "state": "open", "service": {"name": "https"}}], + }, + ] + + def fake_requests_get(self, url, json=None, headers=None, auth=None, verify=None): + if url.endswith("/network-hosts/_search"): + query = (json or {}).get("query") + hits = _filter_hosts_by_query(self.host_docs, query) + return FakeResponse({"hits": {"hits": [{"_source": h} for h in hits]}}) + if "/network-events-" in url and url.endswith("/_search"): + return FakeResponse({"hits": {"hits": []}}) + return FakeResponse({}, status_code=404) + + def test_rest_search_hostname_case_insensitive(self): + with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get): + resp = self.client.get("/api/hosts?q=seele&limit=50") + self.assertEqual(resp.status_code, 200) + payload = resp.get_json() + self.assertEqual(payload["total"], 1) + self.assertEqual(payload["hosts"][0]["name"], "SEELE") + + def test_rest_search_by_ip(self): + with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get): + resp = self.client.get("/api/hosts?q=192.168.5.208") + payload = resp.get_json() + self.assertEqual(payload["total"], 1) + self.assertEqual(payload["hosts"][0]["id"], "mac:dc:a6:32:67:55:dc") + + def test_rest_search_by_mac(self): + with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get): + resp = self.client.get("/api/hosts?q=dc:a6:32:67:55:dc") + payload = resp.get_json() + self.assertEqual(payload["total"], 1) + self.assertEqual(payload["hosts"][0]["name"], "SEELE") + + def test_mcp_tools_call_search_terms(self): + with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get): + body = { + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": {"name": "list_hosts", "arguments": {"terms": ["seele"], "limit": 10}}, + } + resp = self.client.post("/.well-known/mcp.json", data=json.dumps(body), content_type="application/json") + self.assertEqual(resp.status_code, 200) + payload = resp.get_json() + self.assertFalse(payload["result"]["isError"]) + hosts = payload["result"]["structuredContent"]["hosts"] + self.assertEqual(len(hosts), 1) + self.assertEqual(hosts[0]["name"], "SEELE") + + def test_mcp_resources_read_hosts_query(self): + with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get): + body = {"jsonrpc": "2.0", "id": 2, "method": "resources/read", "params": {"uri": "network://hosts?q=seele&limit=5"}} + resp = self.client.post("/.well-known/mcp.json", data=json.dumps(body), content_type="application/json") + self.assertEqual(resp.status_code, 200) + result = resp.get_json()["result"] + self.assertEqual(result["contents"][0]["mimeType"], "application/json") + data = json.loads(result["contents"][0]["text"]) + self.assertEqual(data["total"], 1) + self.assertEqual(data["hosts"][0]["name"], 
"SEELE") + + def test_mcp_notifications_initialized_no_response(self): + with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get): + body = {"jsonrpc": "2.0", "method": "notifications/initialized", "params": {}} + resp = self.client.post("/.well-known/mcp.json", data=json.dumps(body), content_type="application/json") + self.assertEqual(resp.status_code, 204) + + +if __name__ == "__main__": + unittest.main() + diff --git a/stacks/network-mcp/ilm/network-events-ilm.json b/stacks/network-mcp/ilm/network-events-ilm.json new file mode 100644 index 0000000..8c2ec9b --- /dev/null +++ b/stacks/network-mcp/ilm/network-events-ilm.json @@ -0,0 +1,24 @@ +{ + "policy": { + "phases": { + "hot": { + "min_age": "0ms", + "actions": {} + }, + "warm": { + "min_age": "7d", + "actions": { + "forcemerge": { + "max_num_segments": 1 + } + } + }, + "delete": { + "min_age": "90d", + "actions": { + "delete": {} + } + } + } + } +} \ No newline at end of file diff --git a/stacks/network-mcp/ilm/network-events-template.json b/stacks/network-mcp/ilm/network-events-template.json new file mode 100644 index 0000000..5569a40 --- /dev/null +++ b/stacks/network-mcp/ilm/network-events-template.json @@ -0,0 +1,39 @@ +{ + "index_patterns": ["network-events-*"], + "template": { + "settings": { + "index.lifecycle.name": "network-events-ilm" + }, + "mappings": { + "properties": { + "@timestamp": { "type": "date" }, + "host": { + "properties": { + "ip": { "type": "ip" }, + "ips": { "type": "ip" }, + "mac": { "type": "keyword" }, + "macs": { "type": "keyword" }, + "id": { "type": "keyword" }, + "name": { "type": "keyword" }, + "hostname": { "type": "keyword" }, + "hostnames": { "type": "keyword" } + } + }, + "ports": { + "properties": { + "port": { "type": "integer" }, + "proto": { "type": "keyword" }, + "state": { "type": "keyword" }, + "service": { + "properties": { + "name": { "type": "keyword" }, + "product": { "type": "keyword" }, + "version": { "type": "keyword" } + } + } + } + } + } + } + } +} diff --git a/stacks/network-mcp/ilm/network-hosts-template.json b/stacks/network-mcp/ilm/network-hosts-template.json new file mode 100644 index 0000000..a4f5c06 --- /dev/null +++ b/stacks/network-mcp/ilm/network-hosts-template.json @@ -0,0 +1,40 @@ +{ + "index_patterns": ["network-hosts"], + "template": { + "mappings": { + "properties": { + "host": { + "properties": { + "id": { "type": "keyword" }, + "name": { "type": "keyword" }, + "fqdn": { "type": "keyword" }, + "ips": { "type": "ip" }, + "macs": { "type": "keyword" }, + "first_seen": { "type": "date" }, + "last_seen": { "type": "date" }, + "last_state_change": { "type": "date" }, + "state": { "type": "keyword" }, + "role": { "type": "keyword" }, + "tags": { "type": "keyword" }, + "notes": { "type": "text" } + } + }, + "ports": { + "properties": { + "port": { "type": "integer" }, + "proto": { "type": "keyword" }, + "state": { "type": "keyword" }, + "first_seen": { "type": "date" }, + "last_seen": { "type": "date" }, + "service": { + "properties": { + "name": { "type": "keyword" }, + "product": { "type": "keyword" } + } + } + } + } + } + } + } +} diff --git a/stacks/network-mcp/inventory_targets.yml b/stacks/network-mcp/inventory_targets.yml new file mode 100644 index 0000000..eff15e6 --- /dev/null +++ b/stacks/network-mcp/inventory_targets.yml @@ -0,0 +1,280 @@ +inventory_targets: +- name: Blackmoon + hostname: blackmoon.localdomain + ip: 192.168.5.1 + notes: Core OpnSense gateway; ping only +- name: Supermicro-BMC + hostname: 192.168.5.30 + ip: 
192.168.5.30 + ports: + - 22 + - 80 + notes: "Supermicro IPMI (ATEN login portal on 80\u2192443) for rack chassis" +- name: Jet-Alone + hostname: jet-alone.localdomain + ip: 192.168.5.31 + ports: + - 22 + notes: GPU/LLM server +- name: Wille + hostname: wille.localdomain + ip: 192.168.5.33 + ports: + - 22 + - 80 + - 443 + notes: TrueNAS SCALE primary storage (iXsystems /ui interface) +- name: Core + hostname: core.localdomain + ip: 192.168.5.34 + ports: + - 22 + - 80 + - 443 + notes: Production Docker swarm (Traefik, Gitea, Authentik, Immich, etc.) +- name: NERV-III + hostname: NERV-III + ip: 192.168.5.35 + ports: + - 22 + notes: 'Standalone Proxmox host (Fedora CoreOS VMs: container-dev VM110 plus Ramiel + containers)' +- name: TP-Link-AP-1 + hostname: 192.168.5.36 + ip: 192.168.5.36 + ports: + - 22 + - 80 + notes: TP-Link EAP/Omada AP web UI (login page on HTTP) +- name: TP-Link-AP-2 + hostname: 192.168.5.39 + ip: 192.168.5.39 + ports: + - 22 + - 80 + notes: TP-Link EAP/Omada AP web UI (login page on HTTP) +- name: Subspace-Mote-1 + hostname: subspace-mote-1.localdomain + ip: 192.168.5.41 + ports: + - 22 + notes: SBC cluster member +- name: BirdNET-GO + hostname: 192.168.5.71 + ip: 192.168.5.71 + ports: + - 22 + - 8080 + notes: Armbian (rz3w-02) running birdnet-go container (port 8080) +- name: rz3w-02 + hostname: rz3w-02.localdomain + ports: + - 22 + notes: Subspace node with metrics/logging +- name: Arael + hostname: arael.localdomain + ip: 192.168.5.44 + ports: + - 22 + notes: Debian host, purpose TBD +- name: Synology-NAS + hostname: 192.168.5.45 + ip: 192.168.5.45 + ports: + - 22 + - 80 + - 443 + - 5000 + notes: Synology DSM primary NAS (HTTP redirect to DSM on 5000/5001) +- name: Docker-Public + hostname: docker-public.localdomain + ip: 192.168.5.46 + ports: + - 22 + notes: Traefik/Docker public host (Traefik on 8080; hosts Invidious, Matomo, FreshRSS, + etc.) 
+- name: Frigate + hostname: frigate.localdomain + ip: 192.168.5.47 + ports: + - 22 + - 5000 + notes: NVR VM +- name: HomeAssistant + hostname: homeassistant.localdomain + ip: 192.168.5.48 + ports: + - 22 + - 8123 + notes: Home automation host +- name: Casper + hostname: casper.localdomain + ip: 192.168.5.50 + ports: + - 22 + notes: Logging/Metrics VM +- name: Ramiel + hostname: ramiel.localdomain + ip: 192.168.5.51 + ports: + - 22 + - 6443 + notes: Cluster node +- name: Ramiel-III + hostname: ramiel-iii.localdomain + ip: 192.168.5.230 + ports: + - 22 + notes: Additional Ramiel host +- name: NERV + hostname: nerv.localdomain + ip: 192.168.5.203 + ports: + - 22 + - 8006 + notes: Proxmox host +- name: Magi2 + hostname: magi2.localdomain + ip: 192.168.5.202 + ports: + - 22 + - 8006 + notes: Proxmox host (JSON listed as Magi) +- name: JHCI + hostname: jhci.localdomain + ip: 192.168.5.201 + ports: + - 22 + - 8006 + notes: Proxmox host +- name: Balthasar + hostname: balthasar.localdomain + ip: 192.168.5.237 + ports: + - 22 + - 80 + notes: Technitium DNS server (hosts DoH UI) +- name: Unit-00 + hostname: unit-00.localdomain + ip: 192.168.5.222 + ports: + - 22 + notes: Client that connects to docker-dev +- name: TrueNAS-Backup + hostname: ARKII.localdomain + ip: 192.168.5.32 + ports: + - 22 + - 80 + - 443 + notes: "TrueNAS SCALE backup NAS (ARKII chassis) \u2013 HTTPS /ui, SSH pending credentials" +- name: Mokerlink-POE + hostname: 192.168.5.226 + ip: 192.168.5.226 + ports: + - 80 + notes: Mokerlink POE-2G08110GSM switch (web login only) +- name: EtherNetIP-Controller + hostname: 192.168.5.17 + ip: 192.168.5.17 + ports: + - 2222 + notes: CNC/3D printer controller interface +- name: P1S-Printer + hostname: P1S + ip: 192.168.5.42 + notes: Bambu Lab P1S (LLMNR responder only; no TCP services) +- name: Container-Dev + hostname: container-dev + ip: 192.168.5.236 + ports: + - 22 + - 5355 + notes: Fedora CoreOS VM (NERV-III VM110) for container dev; only key-based SSH + + LLMNR +- name: VPS-TransparentProxy-19222713430 + hostname: 192.227.134.30 + ip: 192.227.134.30 + ports: + - 22 + - 80 + - 443 + notes: Transparent HAProxy node (Debian 10) running haproxy + zerotier-one + telegraf +- name: VPS-TransparentProxy-1071722798 + hostname: 107.172.27.98 + ip: 107.172.27.98 + ports: + - 22 + - 80 + - 443 + notes: Transparent HAProxy node (Debian 12) running haproxy + tailscale + zerotier-one + + telegraf/filebeat +- name: VPS-TransparentProxy-10717425061 + hostname: 107.174.250.61 + ip: 107.174.250.61 + ports: + - 22 + - 80 + - 443 + notes: Transparent HAProxy (Debian 12) with haproxy, docker/containerd, iperf3, + filebeat, tailscale, zerotier +- name: VPS-Headscale + hostname: 198.46.218.8 + ip: 198.46.218.8 + ports: + - 22 + - 80 + - 443 + notes: Headscale coordination server (Ubuntu 20.04) running headscale, HAProxy, + Uptime Kuma, tailscale, zerotier +- name: VPS-MailInABox + hostname: 198.23.146.170 + ip: 198.23.146.170 + ports: + - 22 + - 80 + - 443 + notes: mail.uplink.tel Mail-in-a-Box (Postfix, Dovecot, BIND, NSD, nginx, SpamPD, + Filebeat, Tailscale) +- name: VPS-FriendServer + hostname: 172.245.88.186 + ip: 172.245.88.186 + ports: + - 22 + - 80 + - 443 + notes: '"Friend server managed" (Debian 12) hosting Apache, InspIRCd, MariaDB, Gitea + (docker), Tor, Tailscale' +- name: VPS-Meow + hostname: 107.174.64.22 + ip: 107.174.64.22 + ports: + - 22 + - 80 + - 443 + notes: '"Meow" VPS (Debian 12) running Docker stack: traefik, wg-easy, wordpress/mysql, + nginx, filebrowser' +- name: VPS-Lukes + hostname: 
23.94.206.75 + ip: 23.94.206.75 + ports: + - 22 + - 80 + - 443 + notes: "Luke's VPS (Debian 12) \u2013 running Docker (Traefik, Caddy, GoatCounter,\ + \ TTRSS stack, Radicale, filebrowser, ssh-tunnel)" +- name: VPS-Tailscale-Edge + hostname: 100.64.0.14 + ip: 100.64.0.14 + ports: + - 22 + - 80 + - 443 + notes: 'Tailscale interface into mail.uplink.tel (Mail-in-a-Box stack: Postfix/Dovecot/BIND/nginx)' +- name: BirdNET-Pi + hostname: orangepizero2.localdomain + ip: 192.168.5.18 + ports: + - 22 + - 80 + notes: Orangepi Zero2 running BirdNET-Pi (Caddy on port 80) diff --git a/stacks/network-mcp/scripts/bootstrap_indices.py b/stacks/network-mcp/scripts/bootstrap_indices.py new file mode 100644 index 0000000..809c554 --- /dev/null +++ b/stacks/network-mcp/scripts/bootstrap_indices.py @@ -0,0 +1,77 @@ +import os +import sys +import json +import requests +import urllib3 + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from collectors.common.es_auth import resolve_api_key, build_api_key_header + +# Suppress insecure request warnings +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +def load_json(path): + with open(path, 'r') as f: + return json.load(f) + +def main(): + es_url = os.getenv("ES_URL", "http://localhost:9200").rstrip('/') + env_api_id = os.getenv("ES_API_ID") + env_api_key = os.getenv("ES_API_KEY") + es_api_id, es_api_key = resolve_api_key(env_api_id, env_api_key) + es_user = os.getenv("ES_USER", "elastic") + es_pass = os.getenv("ES_PASS", "changeme") + verify_ssl = os.getenv("ES_VERIFY_SSL", "true").lower() == "true" + + auth_args = {} + if es_api_id and es_api_key: + auth_args["headers"] = {"Authorization": build_api_key_header(es_api_id, es_api_key)} + print("Using Elasticsearch API key authentication for bootstrap.") + else: + auth_args["auth"] = (es_user, es_pass) + print("Using Elasticsearch basic authentication for bootstrap.") + + print(f"Bootstrapping Elastic at {es_url}...") + + def put(endpoint, data): + url = f"{es_url}{endpoint}" + print(f"PUT {url}") + try: + resp = requests.put(url, json=data, verify=verify_ssl, **auth_args) + print(f"Response: {resp.status_code} {resp.text}") + resp.raise_for_status() + except Exception as e: + print(f"Error: {e}") + # Don't exit, try next + + # 1. ILM Policy + ilm_path = "ilm/network-events-ilm.json" + if os.path.exists(ilm_path): + data = load_json(ilm_path) + put("/_ilm/policy/network-events-ilm", data) + else: + print(f"Missing {ilm_path}") + + # 2. Network Events Template + tpl_path = "ilm/network-events-template.json" + if os.path.exists(tpl_path): + data = load_json(tpl_path) + put("/_index_template/network-events", data) + else: + print(f"Missing {tpl_path}") + + # 3. 
Network Hosts Template + tpl_path = "ilm/network-hosts-template.json" + if os.path.exists(tpl_path): + data = load_json(tpl_path) + put("/_index_template/network-hosts", data) + else: + print(f"Missing {tpl_path}") + + print("Bootstrap complete.") + +if __name__ == "__main__": + main() diff --git a/stacks/network-mcp/static/host_metadata.json b/stacks/network-mcp/static/host_metadata.json new file mode 100644 index 0000000..8f10e7d --- /dev/null +++ b/stacks/network-mcp/static/host_metadata.json @@ -0,0 +1,9 @@ +{ + "mac:aa:bb:cc:dd:ee:ff": { + "role": "router", + "owner": "admin", + "location": "server-room", + "tags": ["critical", "gateway"], + "notes": "Main gateway" + } +} diff --git a/stacks/obby/docker-compose.yml b/stacks/obby/docker-compose.yml new file mode 100644 index 0000000..b359b01 --- /dev/null +++ b/stacks/obby/docker-compose.yml @@ -0,0 +1,20 @@ +--- +services: + obsidian: + image: lscr.io/linuxserver/obsidian:latest + container_name: obsidian + security_opt: + - seccomp:unconfined #optional + environment: + - PUID=1000 + - PGID=1000 + - TZ=Etc/UTC + volumes: + - ./config:/config + ports: + - 3002:3000 + - 3003:3001 + devices: + - /dev/dri:/dev/dri #optional + shm_size: "1gb" + restart: unless-stopped diff --git a/stacks/snowflake/docker-compose.yml b/stacks/snowflake/docker-compose.yml new file mode 100644 index 0000000..e8d8724 --- /dev/null +++ b/stacks/snowflake/docker-compose.yml @@ -0,0 +1,15 @@ +services: + snowflake-proxy: + network_mode: host + image: containers.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake:latest + container_name: snowflake-proxy + restart: unless-stopped + # For a full list of Snowflake Proxy CLI parameters see + # https://gitlab.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/-/tree/main/proxy?ref_type=heads#running-a-standalone-snowflake-proxy + #command: [ "-ephemeral-ports-range", "30000:60000" ] + watchtower: + image: containrrr/watchtower + container_name: watchtower + volumes: + - /var/run/docker.sock:/var/run/docker.sock + command: snowflake-proxy diff --git a/stacks/szurubooru/.env.template b/stacks/szurubooru/.env.template new file mode 100644 index 0000000..5f7e8d5 --- /dev/null +++ b/stacks/szurubooru/.env.template @@ -0,0 +1,8 @@ +POSTGRES_USER=szuru +POSTGRES_PASSWORD=change_me +BUILD_INFO=local-dev +PORT=8080 +THREADS=4 +BASE_URL=/ +MOUNT_DATA=./volumes/data +MOUNT_SQL=./volumes/postgres diff --git a/stacks/szurubooru/docker-compose.yml b/stacks/szurubooru/docker-compose.yml new file mode 100644 index 0000000..4fd677d --- /dev/null +++ b/stacks/szurubooru/docker-compose.yml @@ -0,0 +1,46 @@ +## Example Docker Compose configuration +## +## Use this as a template to set up docker compose, or as guide to set up other +## orchestration services +services: + + server: + image: szurubooru/server:latest + depends_on: + - sql + environment: + ## These should be the names of the dependent containers listed below, + ## or FQDNs/IP addresses if these services are running outside of Docker + POSTGRES_HOST: sql + ## Credentials for database: + POSTGRES_USER: + POSTGRES_PASSWORD: + ## Commented Values are Default: + #POSTGRES_DB: defaults to same as POSTGRES_USER + #POSTGRES_PORT: 5432 + #LOG_SQL: 0 (1 for verbose SQL logs) + THREADS: + volumes: + - "${MOUNT_DATA}:/data" + - "./server/config.yaml:/opt/app/config.yaml" + + client: + image: szurubooru/client:latest + depends_on: + - server + environment: + BACKEND_HOST: server + BASE_URL: + volumes: + - "${MOUNT_DATA}:/data:ro" + ports: + - "${PORT}:80" + + 
sql: + image: postgres:11-alpine + restart: unless-stopped + environment: + POSTGRES_USER: + POSTGRES_PASSWORD: + volumes: + - "${MOUNT_SQL}:/var/lib/postgresql/data" diff --git a/stacks/szurubooru/server/config.yaml b/stacks/szurubooru/server/config.yaml new file mode 100644 index 0000000..f2ea42b --- /dev/null +++ b/stacks/szurubooru/server/config.yaml @@ -0,0 +1,3 @@ +name: Hyrax Hub +domain: http://localhost:8080 +secret: "CHANGE_ME"
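
A couple of usage sketches for the network-mcp frontend added above. First, the REST side: this assumes the container is running with gunicorn bound to port 5001 as in the frontend Dockerfile (reachable here as `http://localhost:5001`) and that the collectors have already populated Elasticsearch; adjust `BASE` to wherever the service is actually exposed.

```
import requests

BASE = "http://localhost:5001"  # assumption: frontend exposed on 5001 as in the Dockerfile

# Search the merged host view by hostname/IP/MAC (repeat q for multiple terms).
hosts = requests.get(f"{BASE}/api/hosts", params={"q": "192.168.5.34", "limit": 50}, timeout=10).json()
print(hosts["total"], [h["name"] for h in hosts["hosts"]])

# Drill into a single host and pull its recent events.
if hosts["hosts"]:
    host_id = hosts["hosts"][0]["id"]
    detail = requests.get(
        f"{BASE}/api/hosts/{host_id}",
        params={"include_events": "true", "events_limit": 20},
        timeout=10,
    ).json()
    print(detail["host"]["ips"], len(detail.get("events", [])))

# The /24 and /64 overview used by the network_map tool.
net_map = requests.get(f"{BASE}/api/map", params={"limit": 1000}, timeout=10).json()
print(net_map["host_count"], [n["cidr"] for n in net_map["networks"]])
```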
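The same base URL also serves the MCP JSON-RPC endpoint (`/api/mcp`, mirrored at `/.well-known/mcp.json`). A minimal handshake-and-call sketch, again assuming `http://localhost:5001`:

```
import requests

BASE = "http://localhost:5001"  # assumption: same frontend instance as above

def rpc(method, params=None, rpc_id=1):
    # Plain JSON-RPC 2.0 POST against the MCP route in app.py.
    payload = {"jsonrpc": "2.0", "id": rpc_id, "method": method, "params": params or {}}
    resp = requests.post(f"{BASE}/api/mcp", json=payload, timeout=10)
    resp.raise_for_status()
    return resp.json()

# Handshake, then list the advertised tools.
print(rpc("initialize", {"protocolVersion": "2025-11-25"})["result"]["serverInfo"])
tools = rpc("tools/list")["result"]["tools"]
print([t["name"] for t in tools])

# Call list_hosts with a search term, mirroring the test suite.
result = rpc("tools/call", {"name": "list_hosts", "arguments": {"terms": ["seele"], "limit": 10}})
print(result["result"]["structuredContent"]["total"])
```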
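And for the Elasticsearch side, a read-back check that the ILM policy and index templates registered by `scripts/bootstrap_indices.py` are in place. This is a sketch that reuses the script's env-var conventions and assumes basic auth; swap in the ApiKey header if `ES_API_ID`/`ES_API_KEY` are set.

```
import os
import requests
import urllib3

# Same insecure-request suppression as the bootstrap script (self-signed ES certs).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es_url = os.getenv("ES_URL", "http://localhost:9200").rstrip("/")
auth = (os.getenv("ES_USER", "elastic"), os.getenv("ES_PASS", "changeme"))
verify = os.getenv("ES_VERIFY_SSL", "true").lower() == "true"

for endpoint in (
    "/_ilm/policy/network-events-ilm",
    "/_index_template/network-events",
    "/_index_template/network-hosts",
):
    resp = requests.get(f"{es_url}{endpoint}", auth=auth, verify=verify)
    # Expect 200 once bootstrap_indices.py has run, 404 otherwise.
    print(endpoint, resp.status_code)
```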