Compare commits
2 Commits
0bcfed8fb8
...
aeac252491
| Author | SHA1 | Date | |
|---|---|---|---|
| aeac252491 | |||
| 13989e2b59 |
@ -22,9 +22,14 @@ jobs:
|
||||
- name: Find and deploy changed stacks
|
||||
env:
|
||||
DOMAIN: ${{ secrets.DOMAIN }}
|
||||
DEPLOY_ALL: "1"
|
||||
run: |
|
||||
# Find changed stacks
|
||||
STACKS=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep '^stacks/' | cut -d'/' -f2 | sort -u || echo "")
|
||||
if [ "$DEPLOY_ALL" = "1" ]; then
|
||||
STACKS=$(ls stacks/)
|
||||
else
|
||||
# Find changed stacks
|
||||
STACKS=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep '^stacks/' | cut -d'/' -f2 | sort -u || echo "")
|
||||
fi
|
||||
|
||||
if [ -z "$STACKS" ]; then
|
||||
echo "No stacks changed, deploying all..."
|
||||
|
||||
8
stacks/meshmon/.env.template
Normal file
8
stacks/meshmon/.env.template
Normal file
@ -0,0 +1,8 @@
|
||||
MESHTASTIC_NODE_IP=192.168.5.242
|
||||
ALLOWED_ORIGINS=http://docker-dev:8383,https://meshmon.ghost.tel
|
||||
RATE_LIMIT_API=20000
|
||||
NODE_ENV=production
|
||||
SESSION_SECRET=change-me
|
||||
TRUST_PROXY=true
|
||||
COOKIE_SECURE=true
|
||||
DISABLE_ANONYMOUS=true
|
||||
14
stacks/meshmon/docker-compose.yml
Normal file
14
stacks/meshmon/docker-compose.yml
Normal file
@ -0,0 +1,14 @@
|
||||
services:
|
||||
meshmonitor:
|
||||
image: ghcr.io/yeraze/meshmonitor:latest
|
||||
container_name: meshmonitor
|
||||
ports:
|
||||
- "8383:3001"
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- meshmonitor-data:/data
|
||||
env_file:
|
||||
- .env
|
||||
volumes:
|
||||
meshmonitor-data:
|
||||
driver: local
|
||||
7
stacks/meshtastic-web/docker-compose.yml
Normal file
7
stacks/meshtastic-web/docker-compose.yml
Normal file
@ -0,0 +1,7 @@
|
||||
services:
|
||||
meshtastic-web:
|
||||
image: ghcr.io/meshtastic/web:latest
|
||||
container_name: meshtastic-web
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8585:8080"
|
||||
7
stacks/mllogwatcher/.dockerignore
Normal file
7
stacks/mllogwatcher/.dockerignore
Normal file
@ -0,0 +1,7 @@
|
||||
.venv
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.git
|
||||
.gitignore
|
||||
.env
|
||||
tmp/
|
||||
14
stacks/mllogwatcher/.env.example
Normal file
14
stacks/mllogwatcher/.env.example
Normal file
@ -0,0 +1,14 @@
|
||||
OPENROUTER_API_KEY=
|
||||
OPENROUTER_MODEL=openai/gpt-5.2-codex-max
|
||||
TRIAGE_ENABLE_COMMANDS=1
|
||||
TRIAGE_COMMAND_RUNNER=local
|
||||
TRIAGE_VERBOSE_LOGS=1
|
||||
TRIAGE_EMAIL_ENABLED=1
|
||||
TRIAGE_EMAIL_FROM=alertai@example.com
|
||||
TRIAGE_EMAIL_TO=admin@example.com
|
||||
TRIAGE_SMTP_HOST=smtp.example.com
|
||||
TRIAGE_SMTP_PORT=465
|
||||
TRIAGE_SMTP_USER=alertai@example.com
|
||||
TRIAGE_SMTP_PASSWORD=
|
||||
TRIAGE_SMTP_SSL=1
|
||||
TRIAGE_SMTP_STARTTLS=0
|
||||
14
stacks/mllogwatcher/.env.template
Normal file
14
stacks/mllogwatcher/.env.template
Normal file
@ -0,0 +1,14 @@
|
||||
OPENROUTER_API_KEY=
|
||||
OPENROUTER_MODEL=openai/gpt-5.2-codex-max
|
||||
TRIAGE_ENABLE_COMMANDS=1
|
||||
TRIAGE_COMMAND_RUNNER=local
|
||||
TRIAGE_VERBOSE_LOGS=1
|
||||
TRIAGE_EMAIL_ENABLED=1
|
||||
TRIAGE_EMAIL_FROM=alertai@example.com
|
||||
TRIAGE_EMAIL_TO=admin@example.com
|
||||
TRIAGE_SMTP_HOST=smtp.example.com
|
||||
TRIAGE_SMTP_PORT=465
|
||||
TRIAGE_SMTP_USER=alertai@example.com
|
||||
TRIAGE_SMTP_PASSWORD=
|
||||
TRIAGE_SMTP_SSL=1
|
||||
TRIAGE_SMTP_STARTTLS=0
|
||||
20
stacks/mllogwatcher/Dockerfile
Normal file
20
stacks/mllogwatcher/Dockerfile
Normal file
@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
WORKDIR /var/core/mlLogWatcher
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends openssh-client && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY alert_runbook.yaml ./alert_runbook.yaml
|
||||
COPY scripts ./scripts
|
||||
|
||||
EXPOSE 8081
|
||||
|
||||
CMD ["uvicorn", "scripts.grafana_alert_webhook:app", "--host", "0.0.0.0", "--port", "8081"]
|
||||
120
stacks/mllogwatcher/README.md
Executable file
120
stacks/mllogwatcher/README.md
Executable file
@ -0,0 +1,120 @@
|
||||
# ML Log Watcher Utilities
|
||||
|
||||
This repository now contains two automation entry points that work together to
|
||||
triage Elasticsearch logs and Grafana alerts with the help of OpenRouter-hosted
|
||||
language models.
|
||||
|
||||
## 1. `scripts/log_monitor.py`
|
||||
|
||||
Existing script that queries Elasticsearch indices, pulls a recent window of
|
||||
logs, and asks an LLM for anomaly highlights. Run it ad-hoc or schedule via
|
||||
cron/systemd.
|
||||
|
||||
```
|
||||
ELASTIC_HOST=https://casper.localdomain:9200 \
|
||||
ELASTIC_API_KEY=... \
|
||||
OPENROUTER_API_KEY=... \
|
||||
python3 scripts/log_monitor.py --index 'log*' --minutes 30
|
||||
```
|
||||
|
||||
## 2. `scripts/grafana_alert_webhook.py`
|
||||
|
||||
A FastAPI web server that accepts Grafana alert webhooks, finds the matching
|
||||
entry in `alert_runbook.yaml`, renders the LLM prompt, and posts it to
|
||||
OpenRouter. The response text is returned to Grafana (or any caller) immediately
|
||||
so automation can fan out to chat, ticketing, etc.
|
||||
|
||||
### Dependencies
|
||||
|
||||
```
|
||||
python3 -m venv .venv
|
||||
.venv/bin/pip install fastapi uvicorn pyyaml requests langchain
|
||||
```
|
||||
|
||||
### Environment
|
||||
|
||||
- `OPENROUTER_API_KEY` – required.
|
||||
- `OPENROUTER_MODEL` – optional (default `openai/gpt-4o-mini`).
|
||||
- `RUNBOOK_PATH` – optional (default `alert_runbook.yaml` in repo root).
|
||||
- `ANSIBLE_HOSTS_PATH` – optional (default `/etc/ansible/hosts`). When set, the webhook auto-loads the Ansible inventory so alerts targeting known hosts inherit their SSH user/port/key information.
|
||||
- `OPENROUTER_REFERER` / `OPENROUTER_TITLE` – forwarded headers if needed.
|
||||
- `TRIAGE_ENABLE_COMMANDS` – set to `1` to let the webhook execute runbook commands (default `0` keeps it in read-only mode).
|
||||
- `TRIAGE_COMMAND_RUNNER` – `ssh` (default) or `local`. When using ssh, also set `TRIAGE_SSH_USER` and optional `TRIAGE_SSH_OPTIONS`.
|
||||
- `TRIAGE_COMMAND_TIMEOUT`, `TRIAGE_MAX_COMMANDS`, `TRIAGE_OUTPUT_LIMIT`, `TRIAGE_DEFAULT_OS` – tune execution behavior.
|
||||
- `TRIAGE_VERBOSE_LOGS` – set to `1` to stream the entire LLM dialogue, prompts, and command outputs to the webhook logs for debugging.
|
||||
- `TRIAGE_EMAIL_ENABLED` – when `1`, the webhook emails the final LLM summary per alert. Requires `TRIAGE_EMAIL_FROM`, `TRIAGE_EMAIL_TO` (comma-separated), `TRIAGE_SMTP_HOST`, and optional `TRIAGE_SMTP_PORT`, `TRIAGE_SMTP_USER`, `TRIAGE_SMTP_PASSWORD`, `TRIAGE_SMTP_STARTTLS`, `TRIAGE_SMTP_SSL`.
|
||||
|
||||
### Running
|
||||
|
||||
```
|
||||
source .venv/bin/activate
|
||||
export OPENROUTER_API_KEY=...
|
||||
uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081
|
||||
```
|
||||
|
||||
The server loads the runbook at startup and exposes:
|
||||
|
||||
- `POST /alerts` – Grafana webhook target.
|
||||
- `POST /reload-runbook` – force runbook reload without restarting.
|
||||
|
||||
When `TRIAGE_ENABLE_COMMANDS=1`, the server executes the relevant triage commands
|
||||
for each alert (via SSH or locally), captures stdout/stderr, and appends the
|
||||
results to both the OpenRouter prompt and the HTTP response JSON. This lets you
|
||||
automate evidence gathering directly from the runbook instructions. Use
|
||||
environment variables to control which user/host the commands target and to
|
||||
limit timeouts/output size. LangChain powers the multi-turn investigation flow:
|
||||
the LLM can call the provided tools (`run_local_command`, `run_ssh_command`) to
|
||||
gather additional evidence until it’s ready to deliver a final summary.
|
||||
When `/etc/ansible/hosts` (or `ANSIBLE_HOSTS_PATH`) is available the server
|
||||
automatically enriches the alert context with SSH metadata (user, host, port,
|
||||
identity file, and common args) so runbook commands default to using SSH against
|
||||
the alerting host instead of the webhook server.
|
||||
|
||||
### Running with Docker Compose
|
||||
|
||||
1. Copy `.env.example` to `.env` and fill in your OpenRouter key, email SMTP
|
||||
settings, and other toggles.
|
||||
2. Place any SSH keys the webhook needs inside `./.ssh/` (the compose file
|
||||
mounts this directory read-only inside the container).
|
||||
3. Run `docker compose up -d` to build and launch the webhook. It listens on
|
||||
port `8081` by default and uses the mounted `alert_runbook.yaml` plus the
|
||||
host `/etc/ansible/hosts`.
|
||||
4. Use `docker compose logs -f` to watch verbose LangChain output or restart
|
||||
with `docker compose restart` when updating the code/runbook.
|
||||
|
||||
### Sample payload
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:8081/alerts \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"status":"firing",
|
||||
"ruleUid":"edkmsdmlay2o0c",
|
||||
"ruleUrl":"http://casper:3000/alerting/grafana/edkmsdmlay2o0c/view",
|
||||
"alerts":[
|
||||
{
|
||||
"status":"firing",
|
||||
"labels":{
|
||||
"alertname":"High Mem.",
|
||||
"host":"unit-02",
|
||||
"rule_uid":"edkmsdmlay2o0c"
|
||||
},
|
||||
"annotations":{
|
||||
"summary":"Memory usage above 95% for 10m",
|
||||
"value":"96.2%"
|
||||
},
|
||||
"startsAt":"2025-09-22T17:20:00Z",
|
||||
"endsAt":"0001-01-01T00:00:00Z"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
With a valid OpenRouter key this returns a JSON body containing the LLM summary
|
||||
per alert plus any unmatched alerts (missing runbook entries or rule UIDs).
|
||||
|
||||
### Testing without OpenRouter
|
||||
|
||||
Set `OPENROUTER_API_KEY=dummy` and point the DNS entry to a mock (e.g. mitmproxy)
|
||||
if you need to capture outbound requests. Otherwise, hits will fail fast with
|
||||
HTTP 502 so Grafana knows the automation need to be retried.
|
||||
254
stacks/mllogwatcher/alert_runbook.yaml
Executable file
254
stacks/mllogwatcher/alert_runbook.yaml
Executable file
@ -0,0 +1,254 @@
|
||||
# Grafana alert triage playbook for the HomeLab telemetry stack.
|
||||
# Each entry contains the alert metadata, what the signal means,
|
||||
# the evidence to capture automatically, and the manual / scripted steps.
|
||||
metadata:
|
||||
generated: "2025-09-22T00:00:00Z"
|
||||
grafana_url: "http://casper:3000"
|
||||
datasource: "InfluxDB telegraf (uid=P951FEA4DE68E13C5)"
|
||||
llm_provider: "OpenRouter"
|
||||
alerts:
|
||||
- name: "Data Stale"
|
||||
rule_uid: "fdk9orif6fytcf"
|
||||
description: "No CPU usage_user metrics have arrived for non-unit hosts within 5 minutes."
|
||||
signal:
|
||||
metric: "cpu.usage_user"
|
||||
condition: "count(host samples over 5m) < 1"
|
||||
impact: "Host is no longer reporting to Telegraf/Influx -> monitoring blind spot."
|
||||
evidence_to_collect:
|
||||
- "Influx: `from(bucket:\"telegraf\") |> range(start:-10m) |> filter(fn:(r)=>r._measurement==\"cpu\" and r.host==\"{{ host }}\") |> count()`"
|
||||
- "Telegraf log tail"
|
||||
- "System journal for network/auth errors"
|
||||
triage:
|
||||
- summary: "Verify Telegraf agent health"
|
||||
linux: "sudo systemctl status telegraf && sudo journalctl -u telegraf -n 100"
|
||||
windows: "Get-Service telegraf; Get-Content 'C:\\Program Files\\telegraf\\telegraf.log' -Tail 100"
|
||||
- summary: "Check connectivity from host to Influx (`casper:8086`)"
|
||||
linux: "curl -sSf http://casper:8086/ping"
|
||||
windows: "Invoke-WebRequest -UseBasicParsing http://casper:8086/ping"
|
||||
- summary: "Confirm host clock drift <5s (important for Influx line protocol timestamps)"
|
||||
linux: "chronyc tracking"
|
||||
windows: "w32tm /query /status"
|
||||
remediation:
|
||||
- "Restart Telegraf after config validation: `sudo telegraf --test --config /etc/telegraf/telegraf.conf` then `sudo systemctl restart telegraf`."
|
||||
- "Re-apply Ansible telemetry playbook if multiple hosts fail."
|
||||
llm_prompt: >
|
||||
Alert {{ alertname }} fired for {{ host }}. Telegraf stopped sending cpu.usage_user metrics. Given the collected logs and command output, identify root causes (agent down, auth failures, firewall, time skew) and list the next action.
|
||||
|
||||
- name: "High CPU"
|
||||
rule_uid: "fdkms407ubmdcc"
|
||||
description: "Mean CPU usage_system over the last 10 minutes exceeds 85%."
|
||||
signal:
|
||||
metric: "cpu.usage_system"
|
||||
condition: "mean over 10m > 85%"
|
||||
impact: "Host is near saturation; scheduler latency and queueing likely."
|
||||
evidence_to_collect:
|
||||
- "Top CPU processes snapshot (Linux: `ps -eo pid,cmd,%cpu --sort=-%cpu | head -n 15`; Windows: `Get-Process | Sort-Object CPU -Descending | Select -First 15`)"
|
||||
- "Load vs CPU core count"
|
||||
- "Recent deploys / cron jobs metadata"
|
||||
triage:
|
||||
- summary: "Confirm sustained CPU pressure"
|
||||
linux: "uptime && mpstat 1 5"
|
||||
windows: "typeperf \"\\Processor(_Total)\\% Processor Time\" -sc 15"
|
||||
- summary: "Check offending processes/services"
|
||||
linux: "sudo ps -eo pid,user,comm,%cpu,%mem --sort=-%cpu | head"
|
||||
windows: "Get-Process | Sort-Object CPU -Descending | Select -First 10 Name,CPU"
|
||||
- summary: "Inspect cgroup / VM constraints if on Proxmox"
|
||||
linux: "sudo pct status {{ vmid }} && sudo pct config {{ vmid }}"
|
||||
remediation:
|
||||
- "Throttle or restart runaway service; scale workload or tune limits."
|
||||
- "Consider moving noisy neighbors off shared hypervisor."
|
||||
llm_prompt: >
|
||||
High CPU alert for {{ host }}. Review process table, recent deploys, and virtualization context; determine why cpu.usage_system stayed above 85% and recommend mitigation.
|
||||
|
||||
- name: "High Mem."
|
||||
rule_uid: "edkmsdmlay2o0c"
|
||||
description: "Mean memory used_percent over 10 minutes > 95% (excluding hosts jhci/nerv*/magi*)."
|
||||
signal:
|
||||
metric: "mem.used_percent"
|
||||
condition: "mean over 10m > 95%"
|
||||
impact: "OOM risk and swap thrash."
|
||||
evidence_to_collect:
|
||||
- "Free/available memory snapshot"
|
||||
- "Top consumers (Linux: `sudo smem -rt rss | head`; Windows: `Get-Process | Sort-Object WorkingSet -Descending`)"
|
||||
- "Swap in/out metrics"
|
||||
triage:
|
||||
- summary: "Validate actual memory pressure"
|
||||
linux: "free -m && vmstat -SM 5 5"
|
||||
windows: "Get-Counter '\\Memory\\Available MBytes'"
|
||||
- summary: "Identify leaking services"
|
||||
linux: "sudo ps -eo pid,user,comm,%mem,rss --sort=-%mem | head"
|
||||
windows: "Get-Process | Sort-Object WS -Descending | Select -First 10 ProcessName,WS"
|
||||
- summary: "Check recent kernel/OOM logs"
|
||||
linux: "sudo dmesg | tail -n 50"
|
||||
windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Message -match 'memory' }"
|
||||
remediation:
|
||||
- "Restart or reconfigure offender; add swap as stop-gap; increase VM memory allocation."
|
||||
llm_prompt: >
|
||||
High Mem alert for {{ host }}. After reviewing free memory, swap activity, and top processes, explain the likely cause and propose remediation steps with priority.
|
||||
|
||||
- name: "High Disk IO"
|
||||
rule_uid: "bdkmtaru7ru2od"
|
||||
description: "Mean merged_reads/writes per second converted to GB/s exceeds 10."
|
||||
signal:
|
||||
metric: "diskio.merged_reads + merged_writes"
|
||||
condition: "mean over 10m > 10 GB/s"
|
||||
impact: "Storage controller saturated; latency spikes, possible backlog."
|
||||
evidence_to_collect:
|
||||
- "iostat extended output"
|
||||
- "Process level IO (pidstat/nethogs equivalent)"
|
||||
- "ZFS/MDADM status for relevant pools"
|
||||
triage:
|
||||
- summary: "Inspect device queues"
|
||||
linux: "iostat -xzd 5 3"
|
||||
windows: "Get-WmiObject -Class Win32_PerfFormattedData_PerfDisk_LogicalDisk | Format-Table Name,DiskWritesPersec,DiskReadsPersec,AvgDisksecPerTransfer"
|
||||
- summary: "Correlate to filesystem / VM"
|
||||
linux: "sudo lsof +D /mnt/critical -u {{ user }}"
|
||||
- summary: "Check backup or replication windows"
|
||||
linux: "journalctl -u pvebackup -n 50"
|
||||
remediation:
|
||||
- "Pause heavy jobs, move backups off-peak, evaluate faster storage tiers."
|
||||
llm_prompt: >
|
||||
High Disk IO on {{ host }}. With iostat/pidstat output provided, decide whether activity is expected (backup, scrub) or abnormal and list mitigations.
|
||||
|
||||
- name: "Low Uptime"
|
||||
rule_uid: "ddkmuadxvkm4ge"
|
||||
description: "System uptime converted to minutes is below 10 -> host rebooted recently."
|
||||
signal:
|
||||
metric: "system.uptime"
|
||||
condition: "last uptime_minutes < 10"
|
||||
impact: "Unexpected reboot or crash; may need RCA."
|
||||
evidence_to_collect:
|
||||
- "Boot reason logs"
|
||||
- "Last patch/maintenance window from Ansible inventory"
|
||||
- "Smart log excerpt for power events"
|
||||
triage:
|
||||
- summary: "Confirm uptime and reason"
|
||||
linux: "uptime && last -x | head"
|
||||
windows: "Get-WinEvent -LogName System -MaxEvents 50 | ? { $_.Id -in 41,6006,6008 }"
|
||||
- summary: "Check kernel panic or watchdog traces"
|
||||
linux: "sudo journalctl -k -b -1 | tail -n 200"
|
||||
- summary: "Validate patch automation logs"
|
||||
linux: "sudo tail -n 100 /var/log/ansible-pull.log"
|
||||
remediation:
|
||||
- "Schedule deeper diagnostics if crash; reschedule workloads once stable."
|
||||
llm_prompt: >
|
||||
Low Uptime alert: host restarted within 10 minutes. Inspect boot reason logs and recommend whether this is maintenance or a fault needing follow-up.
|
||||
|
||||
- name: "High Load"
|
||||
rule_uid: "ddkmul9x8gcn4d"
|
||||
description: "system.load5 > 6 for 5 minutes."
|
||||
signal:
|
||||
metric: "system.load5"
|
||||
condition: "last value > 6"
|
||||
impact: "Runnable queue more than CPU threads -> latency growth."
|
||||
evidence_to_collect:
|
||||
- "Load vs CPU count (`nproc`)"
|
||||
- "Process states (D/R blocked tasks)"
|
||||
- "IO wait percentage"
|
||||
triage:
|
||||
- summary: "Correlate load to CPU and IO"
|
||||
linux: "uptime && vmstat 1 5"
|
||||
- summary: "Identify stuck IO"
|
||||
linux: "sudo pidstat -d 1 5"
|
||||
- summary: "Check Proxmox scheduler for resource contention"
|
||||
linux: "pveperf && qm list"
|
||||
remediation:
|
||||
- "Reduce cron concurrency, add CPU, or fix IO bottleneck causing runnable queue growth."
|
||||
llm_prompt: >
|
||||
High Load alert on {{ host }}. Based on vmstat/pidstat output, explain whether CPU saturation, IO wait, or runnable pile-up is at fault and propose actions.
|
||||
|
||||
- name: "High Network Traffic (Download)"
|
||||
rule_uid: "cdkpct82a7g8wd"
|
||||
description: "Derivative of bytes_recv > 50 MB/s on any interface over last hour."
|
||||
signal:
|
||||
metric: "net.bytes_recv"
|
||||
condition: "mean download throughput > 50 MB/s"
|
||||
impact: "Link saturation, potential DDoS or backup window."
|
||||
evidence_to_collect:
|
||||
- "Interface counters (Linux: `ip -s link show {{ iface }}`; Windows: `Get-NetAdapterStatistics`)"
|
||||
- "Top talkers (Linux: `sudo nethogs {{ iface }}` or `iftop -i {{ iface }}`)"
|
||||
- "Firewall/IDS logs"
|
||||
triage:
|
||||
- summary: "Confirm interface experiencing spike"
|
||||
linux: "sar -n DEV 1 5 | grep {{ iface }}"
|
||||
windows: "Get-Counter -Counter '\\Network Interface({{ iface }})\\Bytes Received/sec' -Continuous -SampleInterval 1 -MaxSamples 5"
|
||||
- summary: "Identify process or remote peer"
|
||||
linux: "sudo ss -ntu state established | sort -k4"
|
||||
windows: "Get-NetTCPConnection | Sort-Object -Property LocalPort"
|
||||
remediation:
|
||||
- "Throttle offending transfers, move backup replication, verify no compromised service."
|
||||
llm_prompt: >
|
||||
High download throughput on {{ host }} interface {{ iface }}. Review interface counters and connection list to determine if traffic is expected and advise throttling or blocking steps.
|
||||
|
||||
- name: "High Network Traffic (Upload)"
|
||||
rule_uid: "aec650pbtvzswa"
|
||||
description: "Derivative of bytes_sent > 30 MB/s for an interface."
|
||||
signal:
|
||||
metric: "net.bytes_sent"
|
||||
condition: "mean upload throughput > 30 MB/s"
|
||||
impact: "Excess upstream usage; may saturate ISP uplink."
|
||||
evidence_to_collect:
|
||||
- "Interface statistics"
|
||||
- "NetFlow sample if available (`/var/log/telegraf/netflow.log`)"
|
||||
- "List of active transfers"
|
||||
triage:
|
||||
- summary: "Measure upload curve"
|
||||
linux: "bmon -p {{ iface }} -o ascii"
|
||||
windows: "Get-Counter '\\Network Interface({{ iface }})\\Bytes Sent/sec' -Continuous -SampleInterval 1 -MaxSamples 5"
|
||||
- summary: "Find process generating traffic"
|
||||
linux: "sudo iftop -i {{ iface }} -t -s 30"
|
||||
windows: "Get-NetAdapterStatistics -Name {{ iface }}"
|
||||
remediation:
|
||||
- "Pause replication jobs, confirm backups not stuck, search for data exfiltration."
|
||||
llm_prompt: >
|
||||
High upload alert for {{ host }} interface {{ iface }}. Using captured traffic samples, determine whether replication/backup explains the pattern or if anomalous traffic needs blocking.
|
||||
|
||||
- name: "High Disk Usage"
|
||||
rule_uid: "cdma6i5k2gem8d"
|
||||
description: "Disk used_percent >= 95% for Linux devices (filters out unwanted devices)."
|
||||
signal:
|
||||
metric: "disk.used_percent"
|
||||
condition: "last value > 95%"
|
||||
impact: "Filesystem full -> service crashes or write failures."
|
||||
evidence_to_collect:
|
||||
- "`df -h` or `Get-Volume` output for device"
|
||||
- "Largest directories snapshot (Linux: `sudo du -xhd1 /path`; Windows: `Get-ChildItem | Sort Length`)"
|
||||
- "Recent deploy or backup expansion logs"
|
||||
triage:
|
||||
- summary: "Validate usage"
|
||||
linux: "df -h {{ mountpoint }}"
|
||||
windows: "Get-Volume -FileSystemLabel {{ volume }}"
|
||||
- summary: "Identify growth trend"
|
||||
linux: "sudo journalctl -u telegraf -g 'disk usage' -n 20"
|
||||
- summary: "Check for stale docker volumes"
|
||||
linux: "docker system df && docker volume ls"
|
||||
remediation:
|
||||
- "Prune temp artifacts, expand disk/VM, move logs to remote storage."
|
||||
llm_prompt: >
|
||||
High Disk Usage alert on {{ host }} device {{ device }}. Summarize what consumed the space and recommend reclaim or expansion actions with priority.
|
||||
|
||||
- name: "CPU Heartbeat"
|
||||
rule_uid: "eec62gqn3oetcf"
|
||||
description: "Counts cpu.usage_system samples per host; fires if <1 sample arrives within window."
|
||||
signal:
|
||||
metric: "cpu.usage_system"
|
||||
condition: "sample count within 10m < 1"
|
||||
impact: "Indicates host stopped reporting metrics entirely (telemetry silent)."
|
||||
evidence_to_collect:
|
||||
- "Influx query for recent cpu samples"
|
||||
- "Telegraf service and logs"
|
||||
- "Network reachability from host to casper"
|
||||
triage:
|
||||
- summary: "Check host alive and reachable"
|
||||
linux: "ping -c 3 {{ host }} && ssh {{ host }} uptime"
|
||||
windows: "Test-Connection {{ host }} -Count 3"
|
||||
- summary: "Inspect Telegraf state"
|
||||
linux: "sudo systemctl status telegraf && sudo tail -n 100 /var/log/telegraf/telegraf.log"
|
||||
windows: "Get-Service telegraf; Get-EventLog -LogName Application -Newest 50 | ? { $_.Source -match 'Telegraf' }"
|
||||
- summary: "Validate API key / Influx auth"
|
||||
linux: "sudo grep -n 'outputs.influxdb' -n /etc/telegraf/telegraf.conf"
|
||||
remediation:
|
||||
- "Re-issue Telegraf credentials, run `ansible-playbook telemetry.yml -l {{ host }}`."
|
||||
- "If host intentionally offline, silence alert via Grafana maintenance window."
|
||||
llm_prompt: >
|
||||
CPU Heartbeat for {{ host }} indicates telemetry silent. Use connectivity tests and Telegraf logs to determine if host is down or just metrics disabled; propose fixes.
|
||||
14
stacks/mllogwatcher/docker-compose.yml
Normal file
14
stacks/mllogwatcher/docker-compose.yml
Normal file
@ -0,0 +1,14 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
grafana-alert-webhook:
|
||||
build: .
|
||||
env_file:
|
||||
- .env
|
||||
ports:
|
||||
- "8081:8081"
|
||||
volumes:
|
||||
- ./alert_runbook.yaml:/var/core/mlLogWatcher/alert_runbook.yaml:ro
|
||||
- /etc/ansible/hosts:/etc/ansible/hosts:ro
|
||||
- ./.ssh:/var/core/mlLogWatcher/.ssh:ro
|
||||
restart: unless-stopped
|
||||
5
stacks/mllogwatcher/requirements.txt
Normal file
5
stacks/mllogwatcher/requirements.txt
Normal file
@ -0,0 +1,5 @@
|
||||
fastapi==0.115.5
|
||||
uvicorn[standard]==0.32.0
|
||||
pyyaml==6.0.2
|
||||
requests==2.32.3
|
||||
langchain==0.2.15
|
||||
988
stacks/mllogwatcher/scripts/grafana_alert_webhook.py
Executable file
988
stacks/mllogwatcher/scripts/grafana_alert_webhook.py
Executable file
@ -0,0 +1,988 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Minimal FastAPI web server that accepts Grafana alert webhooks, looks up the
|
||||
matching runbook entry, builds an LLM prompt, and calls OpenRouter to return a
|
||||
triage summary.
|
||||
|
||||
Run with:
|
||||
uvicorn scripts.grafana_alert_webhook:app --host 0.0.0.0 --port 8081
|
||||
|
||||
Environment variables:
|
||||
RUNBOOK_PATH Path to alert_runbook.yaml (default: ./alert_runbook.yaml)
|
||||
OPENROUTER_API_KEY Required; API token for https://openrouter.ai
|
||||
OPENROUTER_MODEL Optional; default openai/gpt-4o-mini
|
||||
OPENROUTER_REFERER Optional referer header
|
||||
OPENROUTER_TITLE Optional title header (default: Grafana Alert Webhook)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
import json
|
||||
import shlex
|
||||
import subprocess
|
||||
from textwrap import indent
|
||||
import smtplib
|
||||
from email.message import EmailMessage
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from langchain.llms.base import LLM
|
||||
|
||||
LOGGER = logging.getLogger("grafana_webhook")
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
|
||||
RUNBOOK_PATH = Path(os.environ.get("RUNBOOK_PATH", "alert_runbook.yaml"))
|
||||
ANSIBLE_HOSTS_PATH = Path(os.environ.get("ANSIBLE_HOSTS_PATH", "/etc/ansible/hosts"))
|
||||
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
|
||||
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini")
|
||||
OPENROUTER_REFERER = os.environ.get("OPENROUTER_REFERER")
|
||||
OPENROUTER_TITLE = os.environ.get("OPENROUTER_TITLE", "Grafana Alert Webhook")
|
||||
|
||||
TRIAGE_ENABLE_COMMANDS = os.environ.get("TRIAGE_ENABLE_COMMANDS", "0").lower() in {"1", "true", "yes", "on"}
|
||||
TRIAGE_COMMAND_RUNNER = os.environ.get("TRIAGE_COMMAND_RUNNER", "ssh").lower()
|
||||
TRIAGE_SSH_USER = os.environ.get("TRIAGE_SSH_USER", "root")
|
||||
TRIAGE_SSH_OPTIONS = shlex.split(
|
||||
os.environ.get("TRIAGE_SSH_OPTIONS", "-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5")
|
||||
)
|
||||
TRIAGE_COMMAND_TIMEOUT = int(os.environ.get("TRIAGE_COMMAND_TIMEOUT", "60"))
|
||||
TRIAGE_DEFAULT_OS = os.environ.get("TRIAGE_DEFAULT_OS", "linux").lower()
|
||||
TRIAGE_MAX_COMMANDS = int(os.environ.get("TRIAGE_MAX_COMMANDS", "3"))
|
||||
TRIAGE_OUTPUT_LIMIT = int(os.environ.get("TRIAGE_OUTPUT_LIMIT", "1200"))
|
||||
# LangChain-driven investigation loop
|
||||
TRIAGE_MAX_ITERATIONS = int(os.environ.get("TRIAGE_MAX_ITERATIONS", "3"))
|
||||
TRIAGE_FOLLOWUP_MAX_COMMANDS = int(os.environ.get("TRIAGE_FOLLOWUP_MAX_COMMANDS", "4"))
|
||||
TRIAGE_SYSTEM_PROMPT = os.environ.get(
|
||||
"TRIAGE_SYSTEM_PROMPT",
|
||||
(
|
||||
"You are assisting with on-call investigations. Always reply with JSON containing:\n"
|
||||
"analysis: your findings and next steps.\n"
|
||||
"followup_commands: list of command specs (summary, command, optional runner/os) to gather more data.\n"
|
||||
"complete: true when sufficient information is gathered.\n"
|
||||
"Request commands only when more evidence is required."
|
||||
),
|
||||
)
|
||||
TRIAGE_VERBOSE_LOGS = os.environ.get("TRIAGE_VERBOSE_LOGS", "0").lower() in {"1", "true", "yes", "on"}
|
||||
TRIAGE_EMAIL_ENABLED = os.environ.get("TRIAGE_EMAIL_ENABLED", "0").lower() in {"1", "true", "yes", "on"}
|
||||
TRIAGE_EMAIL_FROM = os.environ.get("TRIAGE_EMAIL_FROM")
|
||||
TRIAGE_EMAIL_TO = [addr.strip() for addr in os.environ.get("TRIAGE_EMAIL_TO", "").split(",") if addr.strip()]
|
||||
TRIAGE_SMTP_HOST = os.environ.get("TRIAGE_SMTP_HOST")
|
||||
TRIAGE_SMTP_PORT = int(os.environ.get("TRIAGE_SMTP_PORT", "587"))
|
||||
TRIAGE_SMTP_USER = os.environ.get("TRIAGE_SMTP_USER")
|
||||
TRIAGE_SMTP_PASSWORD = os.environ.get("TRIAGE_SMTP_PASSWORD")
|
||||
TRIAGE_SMTP_STARTTLS = os.environ.get("TRIAGE_SMTP_STARTTLS", "1").lower() in {"1", "true", "yes", "on"}
|
||||
TRIAGE_SMTP_SSL = os.environ.get("TRIAGE_SMTP_SSL", "0").lower() in {"1", "true", "yes", "on"}
|
||||
TRIAGE_SMTP_TIMEOUT = int(os.environ.get("TRIAGE_SMTP_TIMEOUT", "20"))
|
||||
|
||||
|
||||
def log_verbose(title: str, content: Any) -> None:
|
||||
"""Emit structured verbose logs when TRIAGE_VERBOSE_LOGS is enabled."""
|
||||
if not TRIAGE_VERBOSE_LOGS:
|
||||
return
|
||||
if isinstance(content, (dict, list)):
|
||||
text = json.dumps(content, indent=2, sort_keys=True)
|
||||
else:
|
||||
text = str(content)
|
||||
LOGGER.info("%s:\n%s", title, text)
|
||||
|
||||
|
||||
def email_notifications_configured() -> bool:
|
||||
if not TRIAGE_EMAIL_ENABLED:
|
||||
return False
|
||||
if not (TRIAGE_SMTP_HOST and TRIAGE_EMAIL_FROM and TRIAGE_EMAIL_TO):
|
||||
LOGGER.warning(
|
||||
"Email notifications requested but TRIAGE_SMTP_HOST/TRIAGE_EMAIL_FROM/TRIAGE_EMAIL_TO are incomplete."
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def format_command_results_for_email(results: List[Dict[str, Any]]) -> str:
|
||||
if not results:
|
||||
return "No automation commands were executed."
|
||||
lines: List[str] = []
|
||||
for result in results:
|
||||
lines.append(f"- {result.get('summary')} [{result.get('status')}] {result.get('command')}")
|
||||
stdout = result.get("stdout")
|
||||
stderr = result.get("stderr")
|
||||
error = result.get("error")
|
||||
if stdout:
|
||||
lines.append(indent(truncate_text(stdout, 800), " stdout: "))
|
||||
if stderr:
|
||||
lines.append(indent(truncate_text(stderr, 800), " stderr: "))
|
||||
if error and result.get("status") != "ok":
|
||||
lines.append(f" error: {error}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def build_email_body(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> str:
|
||||
lines = [
|
||||
f"Alert: {result.get('alertname')} ({result.get('rule_uid')})",
|
||||
f"Host: {result.get('host') or context.get('host')}",
|
||||
f"Status: {alert.get('status')}",
|
||||
f"Value: {alert.get('value') or alert.get('annotations', {}).get('value')}",
|
||||
f"Grafana Rule: {context.get('rule_url')}",
|
||||
"",
|
||||
"LLM Summary:",
|
||||
result.get("llm_summary") or "(no summary returned)",
|
||||
"",
|
||||
"Command Results:",
|
||||
format_command_results_for_email(result.get("command_results") or []),
|
||||
]
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def send_summary_email(alert: Dict[str, Any], result: Dict[str, Any], context: Dict[str, Any]) -> None:
|
||||
if not email_notifications_configured():
|
||||
return
|
||||
subject_host = result.get("host") or context.get("host") or "(unknown host)"
|
||||
subject = f"[Grafana] {result.get('alertname')} - {subject_host}"
|
||||
body = build_email_body(alert, result, context)
|
||||
message = EmailMessage()
|
||||
message["Subject"] = subject
|
||||
message["From"] = TRIAGE_EMAIL_FROM
|
||||
message["To"] = ", ".join(TRIAGE_EMAIL_TO)
|
||||
message.set_content(body)
|
||||
try:
|
||||
smtp_class = smtplib.SMTP_SSL if TRIAGE_SMTP_SSL else smtplib.SMTP
|
||||
with smtp_class(TRIAGE_SMTP_HOST, TRIAGE_SMTP_PORT, timeout=TRIAGE_SMTP_TIMEOUT) as client:
|
||||
if TRIAGE_SMTP_STARTTLS and not TRIAGE_SMTP_SSL:
|
||||
client.starttls()
|
||||
if TRIAGE_SMTP_USER:
|
||||
client.login(TRIAGE_SMTP_USER, TRIAGE_SMTP_PASSWORD or "")
|
||||
client.send_message(message)
|
||||
LOGGER.info("Sent summary email to %s for host %s", ", ".join(TRIAGE_EMAIL_TO), subject_host)
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
LOGGER.exception("Failed to send summary email: %s", exc)
|
||||
|
||||
app = FastAPI(title="Grafana Alert Webhook", version="1.0.0")
|
||||
|
||||
_RUNBOOK_INDEX: Dict[str, Dict[str, Any]] = {}
|
||||
_INVENTORY_INDEX: Dict[str, Dict[str, Any]] = {}
|
||||
_INVENTORY_GROUP_VARS: Dict[str, Dict[str, str]] = {}
|
||||
_TEMPLATE_PATTERN = re.compile(r"{{\s*([a-zA-Z0-9_]+)\s*}}")
|
||||
|
||||
|
||||
DEFAULT_SYSTEM_PROMPT = TRIAGE_SYSTEM_PROMPT
|
||||
|
||||
|
||||
class OpenRouterLLM(LLM):
|
||||
"""LangChain-compatible LLM that calls OpenRouter chat completions."""
|
||||
|
||||
api_key: str
|
||||
model_name: str
|
||||
|
||||
def __init__(self, api_key: str, model_name: str, **kwargs: Any) -> None:
|
||||
super().__init__(api_key=api_key, model_name=model_name, **kwargs)
|
||||
|
||||
@property
|
||||
def _llm_type(self) -> str:
|
||||
return "openrouter"
|
||||
|
||||
def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
|
||||
return self._call(prompt, stop=stop)
|
||||
|
||||
def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
|
||||
payload = {
|
||||
"model": self.model_name,
|
||||
"messages": [
|
||||
{"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
}
|
||||
log_verbose("OpenRouter request payload", payload)
|
||||
if stop:
|
||||
payload["stop"] = stop
|
||||
LOGGER.info("Posting to OpenRouter model=%s via LangChain", self.model_name)
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
if OPENROUTER_REFERER:
|
||||
headers["HTTP-Referer"] = OPENROUTER_REFERER
|
||||
if OPENROUTER_TITLE:
|
||||
headers["X-Title"] = OPENROUTER_TITLE
|
||||
response = requests.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers, timeout=90)
|
||||
if response.status_code >= 400:
|
||||
try:
|
||||
detail = response.json()
|
||||
except ValueError:
|
||||
detail = response.text
|
||||
raise RuntimeError(f"OpenRouter error {response.status_code}: {detail}")
|
||||
data = response.json()
|
||||
log_verbose("OpenRouter raw response", data)
|
||||
choices = data.get("choices")
|
||||
if not choices:
|
||||
raise RuntimeError("OpenRouter returned no choices")
|
||||
return choices[0]["message"]["content"].strip()
|
||||
|
||||
|
||||
def load_runbook() -> Dict[str, Dict[str, Any]]:
|
||||
"""Load runbook YAML into a dict keyed by rule_uid."""
|
||||
if not RUNBOOK_PATH.exists():
|
||||
raise FileNotFoundError(f"Runbook file not found: {RUNBOOK_PATH}")
|
||||
with RUNBOOK_PATH.open("r", encoding="utf-8") as handle:
|
||||
data = yaml.safe_load(handle) or {}
|
||||
alerts = data.get("alerts", [])
|
||||
index: Dict[str, Dict[str, Any]] = {}
|
||||
for entry in alerts:
|
||||
uid = entry.get("rule_uid")
|
||||
if uid:
|
||||
index[str(uid)] = entry
|
||||
LOGGER.info("Loaded %d runbook entries from %s", len(index), RUNBOOK_PATH)
|
||||
return index
|
||||
|
||||
|
||||
def _normalize_host_key(host: str) -> str:
|
||||
return host.strip().lower()
|
||||
|
||||
|
||||
def _parse_key_value_tokens(tokens: List[str]) -> Dict[str, str]:
|
||||
data: Dict[str, str] = {}
|
||||
for token in tokens:
|
||||
if "=" not in token:
|
||||
continue
|
||||
key, value = token.split("=", 1)
|
||||
data[key] = value
|
||||
return data
|
||||
|
||||
|
||||
def load_ansible_inventory() -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, str]]]:
|
||||
"""Parse a simple INI-style Ansible hosts file into host/group maps."""
|
||||
if not ANSIBLE_HOSTS_PATH.exists():
|
||||
LOGGER.warning("Ansible inventory not found at %s", ANSIBLE_HOSTS_PATH)
|
||||
return {}, {}
|
||||
hosts: Dict[str, Dict[str, Any]] = {}
|
||||
group_vars: Dict[str, Dict[str, str]] = {}
|
||||
current_group: Optional[str] = None
|
||||
current_section: str = "hosts"
|
||||
|
||||
with ANSIBLE_HOSTS_PATH.open("r", encoding="utf-8") as handle:
|
||||
for raw_line in handle:
|
||||
line = raw_line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
if line.startswith("[") and line.endswith("]"):
|
||||
header = line[1:-1].strip()
|
||||
if ":" in header:
|
||||
group_name, suffix = header.split(":", 1)
|
||||
current_group = group_name
|
||||
current_section = suffix
|
||||
else:
|
||||
current_group = header
|
||||
current_section = "hosts"
|
||||
group_vars.setdefault(current_group, {})
|
||||
continue
|
||||
cleaned = line.split("#", 1)[0].strip()
|
||||
if not cleaned:
|
||||
continue
|
||||
tokens = shlex.split(cleaned)
|
||||
if not tokens:
|
||||
continue
|
||||
if current_section == "vars":
|
||||
vars_dict = _parse_key_value_tokens(tokens)
|
||||
group_vars.setdefault(current_group or "all", {}).update(vars_dict)
|
||||
continue
|
||||
host_token = tokens[0]
|
||||
host_key = _normalize_host_key(host_token)
|
||||
entry = hosts.setdefault(host_key, {"name": host_token, "definitions": [], "groups": set()})
|
||||
vars_dict = _parse_key_value_tokens(tokens[1:])
|
||||
entry["definitions"].append({"group": current_group, "vars": vars_dict})
|
||||
if current_group:
|
||||
entry["groups"].add(current_group)
|
||||
|
||||
LOGGER.info("Loaded %d Ansible inventory hosts from %s", len(hosts), ANSIBLE_HOSTS_PATH)
|
||||
return hosts, group_vars
|
||||
|
||||
|
||||
def _lookup_inventory(host: Optional[str]) -> Optional[Dict[str, Any]]:
|
||||
if not host:
|
||||
return None
|
||||
key = _normalize_host_key(host)
|
||||
entry = _INVENTORY_INDEX.get(key)
|
||||
if entry:
|
||||
return entry
|
||||
# try stripping domain suffix
|
||||
short = key.split(".", 1)[0]
|
||||
if short != key:
|
||||
return _INVENTORY_INDEX.get(short)
|
||||
return None
|
||||
|
||||
|
||||
def _merge_group_vars(groups: List[str], host_os: Optional[str]) -> Dict[str, str]:
|
||||
merged: Dict[str, str] = {}
|
||||
global_vars = _INVENTORY_GROUP_VARS.get("all")
|
||||
if global_vars:
|
||||
merged.update(global_vars)
|
||||
normalized_os = (host_os or "").lower()
|
||||
for group in groups:
|
||||
vars_dict = _INVENTORY_GROUP_VARS.get(group)
|
||||
if not vars_dict:
|
||||
continue
|
||||
connection = (vars_dict.get("ansible_connection") or "").lower()
|
||||
if connection == "winrm" and normalized_os == "linux":
|
||||
continue
|
||||
merged.update(vars_dict)
|
||||
return merged
|
||||
|
||||
|
||||
def _should_include_definition(group: Optional[str], vars_dict: Dict[str, str], host_os: Optional[str]) -> bool:
|
||||
if not vars_dict:
|
||||
return False
|
||||
normalized_os = (host_os or "").lower()
|
||||
connection = (vars_dict.get("ansible_connection") or "").lower()
|
||||
if connection == "winrm" and normalized_os != "windows":
|
||||
return False
|
||||
if connection == "local":
|
||||
return True
|
||||
if group and "windows" in group.lower() and normalized_os == "linux" and not connection:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def apply_inventory_context(context: Dict[str, Any]) -> None:
|
||||
"""Augment the alert context with SSH metadata from the Ansible inventory."""
|
||||
host = context.get("host")
|
||||
entry = _lookup_inventory(host)
|
||||
if not entry:
|
||||
return
|
||||
merged_vars = _merge_group_vars(list(entry.get("groups", [])), context.get("host_os"))
|
||||
for definition in entry.get("definitions", []):
|
||||
group_name = definition.get("group")
|
||||
vars_dict = definition.get("vars", {})
|
||||
if _should_include_definition(group_name, vars_dict, context.get("host_os")):
|
||||
merged_vars.update(vars_dict)
|
||||
ansible_host = merged_vars.get("ansible_host") or entry.get("name")
|
||||
ansible_user = merged_vars.get("ansible_user")
|
||||
ansible_port = merged_vars.get("ansible_port")
|
||||
ssh_common_args = merged_vars.get("ansible_ssh_common_args")
|
||||
ssh_key = merged_vars.get("ansible_ssh_private_key_file")
|
||||
connection = (merged_vars.get("ansible_connection") or "").lower()
|
||||
host_os = (context.get("host_os") or "").lower()
|
||||
if connection == "winrm" and host_os != "windows":
|
||||
for key in (
|
||||
"ansible_connection",
|
||||
"ansible_port",
|
||||
"ansible_password",
|
||||
"ansible_winrm_server_cert_validation",
|
||||
"ansible_winrm_scheme",
|
||||
):
|
||||
merged_vars.pop(key, None)
|
||||
connection = ""
|
||||
|
||||
context.setdefault("ssh_host", ansible_host or host)
|
||||
if ansible_user:
|
||||
context["ssh_user"] = ansible_user
|
||||
if ansible_port:
|
||||
context["ssh_port"] = ansible_port
|
||||
if ssh_common_args:
|
||||
context["ssh_common_args"] = ssh_common_args
|
||||
if ssh_key:
|
||||
context["ssh_identity_file"] = ssh_key
|
||||
context.setdefault("inventory_groups", list(entry.get("groups", [])))
|
||||
if connection == "local":
|
||||
context.setdefault("preferred_runner", "local")
|
||||
elif connection in {"", "ssh", "smart"}:
|
||||
context.setdefault("preferred_runner", "ssh")
|
||||
context.setdefault("inventory_groups", list(entry.get("groups", [])))
|
||||
|
||||
|
||||
def render_template(template: str, context: Dict[str, Any]) -> str:
|
||||
"""Very small mustache-style renderer for {{ var }} placeholders."""
|
||||
def replace(match: re.Match[str]) -> str:
|
||||
key = match.group(1)
|
||||
return str(context.get(key, match.group(0)))
|
||||
|
||||
return _TEMPLATE_PATTERN.sub(replace, template)
|
||||
|
||||
|
||||
def extract_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Optional[str]:
|
||||
"""Grafana webhooks may include rule UID in different fields."""
|
||||
candidates: List[Any] = [
|
||||
alert.get("ruleUid"),
|
||||
alert.get("rule_uid"),
|
||||
alert.get("ruleId"),
|
||||
alert.get("uid"),
|
||||
alert.get("labels", {}).get("rule_uid"),
|
||||
alert.get("labels", {}).get("ruleUid"),
|
||||
parent_payload.get("ruleUid"),
|
||||
parent_payload.get("rule_uid"),
|
||||
parent_payload.get("ruleId"),
|
||||
]
|
||||
for candidate in candidates:
|
||||
if candidate:
|
||||
return str(candidate)
|
||||
# Fall back to Grafana URL parsing if present
|
||||
url = (
|
||||
alert.get("ruleUrl")
|
||||
or parent_payload.get("ruleUrl")
|
||||
or alert.get("generatorURL")
|
||||
or parent_payload.get("generatorURL")
|
||||
)
|
||||
if url and "/alerting/" in url:
|
||||
return url.rstrip("/").split("/")[-2]
|
||||
return None
|
||||
|
||||
|
||||
def derive_fallback_rule_uid(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> str:
|
||||
"""Construct a deterministic identifier when Grafana omits rule UIDs."""
|
||||
labels = alert.get("labels", {})
|
||||
candidates = [
|
||||
alert.get("fingerprint"),
|
||||
labels.get("alertname"),
|
||||
labels.get("host"),
|
||||
labels.get("instance"),
|
||||
parent_payload.get("groupKey"),
|
||||
parent_payload.get("title"),
|
||||
]
|
||||
for candidate in candidates:
|
||||
if candidate:
|
||||
return str(candidate)
|
||||
return "unknown-alert"
|
||||
|
||||
|
||||
def build_fallback_runbook_entry(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Return a generic runbook entry so every alert can be processed."""
|
||||
labels = alert.get("labels", {})
|
||||
alertname = labels.get("alertname") or parent_payload.get("title") or "Grafana Alert"
|
||||
host = labels.get("host") or labels.get("instance") or "(unknown host)"
|
||||
return {
|
||||
"name": f"{alertname} (auto)",
|
||||
"llm_prompt": (
|
||||
"Grafana alert {{ alertname }} fired for {{ host }}.\n"
|
||||
"No dedicated runbook entry exists. Use the payload details, command outputs, "
|
||||
"and your own reasoning to propose likely causes, evidence to gather, and remediation steps."
|
||||
),
|
||||
"triage": [],
|
||||
"evidence_to_collect": [],
|
||||
"remediation": [],
|
||||
"metadata": {"host": host},
|
||||
}
|
||||
|
||||
|
||||
def summarize_dict(prefix: str, data: Optional[Dict[str, Any]]) -> str:
|
||||
if not data:
|
||||
return f"{prefix}: (none)"
|
||||
parts = ", ".join(f"{key}={value}" for key, value in sorted(data.items()))
|
||||
return f"{prefix}: {parts}"
|
||||
|
||||
|
||||
def determine_host_os(alert: Dict[str, Any]) -> str:
|
||||
"""Infer host operating system from labels or defaults."""
|
||||
labels = alert.get("labels", {})
|
||||
candidates = [
|
||||
labels.get("os"),
|
||||
labels.get("platform"),
|
||||
labels.get("system"),
|
||||
alert.get("os"),
|
||||
]
|
||||
for candidate in candidates:
|
||||
if candidate:
|
||||
value = str(candidate).lower()
|
||||
if "win" in value:
|
||||
return "windows"
|
||||
if any(token in value for token in ("linux", "unix", "darwin")):
|
||||
return "linux"
|
||||
host = (labels.get("host") or labels.get("instance") or "").lower()
|
||||
if host.startswith("win") or host.endswith(".localdomain") and "win" in host:
|
||||
return "windows"
|
||||
inventory_os = infer_os_from_inventory(labels.get("host") or labels.get("instance"))
|
||||
if inventory_os:
|
||||
return inventory_os
|
||||
return TRIAGE_DEFAULT_OS
|
||||
|
||||
|
||||
def infer_os_from_inventory(host: Optional[str]) -> Optional[str]:
|
||||
if not host:
|
||||
return None
|
||||
entry = _lookup_inventory(host)
|
||||
if not entry:
|
||||
return None
|
||||
for definition in entry.get("definitions", []):
|
||||
vars_dict = definition.get("vars", {}) or {}
|
||||
connection = (vars_dict.get("ansible_connection") or "").lower()
|
||||
if connection == "winrm":
|
||||
return "windows"
|
||||
for group in entry.get("groups", []):
|
||||
if "windows" in (group or "").lower():
|
||||
return "windows"
|
||||
return None
|
||||
|
||||
|
||||
def truncate_text(text: str, limit: int = TRIAGE_OUTPUT_LIMIT) -> str:
|
||||
"""Trim long outputs to keep prompts manageable."""
|
||||
if not text:
|
||||
return ""
|
||||
cleaned = text.strip()
|
||||
if len(cleaned) <= limit:
|
||||
return cleaned
|
||||
return cleaned[:limit] + "... [truncated]"
|
||||
|
||||
|
||||
def gather_command_specs(entry: Dict[str, Any], host_os: str) -> List[Dict[str, Any]]:
|
||||
"""Collect command specs from triage steps and optional automation sections."""
|
||||
specs: List[Dict[str, Any]] = []
|
||||
for step in entry.get("triage", []):
|
||||
cmd = step.get(host_os)
|
||||
if not cmd:
|
||||
continue
|
||||
specs.append(
|
||||
{
|
||||
"summary": step.get("summary") or entry.get("name") or "triage",
|
||||
"shell": cmd,
|
||||
"runner": step.get("runner"),
|
||||
"os": host_os,
|
||||
}
|
||||
)
|
||||
for item in entry.get("automation_commands", []):
|
||||
target_os = item.get("os", host_os)
|
||||
if target_os and target_os.lower() != host_os:
|
||||
continue
|
||||
specs.append(item)
|
||||
if TRIAGE_MAX_COMMANDS > 0:
|
||||
return specs[:TRIAGE_MAX_COMMANDS]
|
||||
return specs
|
||||
|
||||
|
||||
def build_runner_command(
|
||||
rendered_command: str,
|
||||
runner: str,
|
||||
context: Dict[str, Any],
|
||||
spec: Dict[str, Any],
|
||||
) -> Tuple[Any, str, bool, str]:
|
||||
"""Return the subprocess args, display string, shell flag, and runner label."""
|
||||
runner = runner or TRIAGE_COMMAND_RUNNER
|
||||
runner = runner.lower()
|
||||
if runner == "ssh":
|
||||
host = spec.get("host") or context.get("ssh_host") or context.get("host")
|
||||
if not host:
|
||||
raise RuntimeError("Host not provided for ssh runner.")
|
||||
ssh_user = spec.get("ssh_user") or context.get("ssh_user") or TRIAGE_SSH_USER
|
||||
ssh_target = spec.get("ssh_target") or f"{ssh_user}@{host}"
|
||||
ssh_options = list(TRIAGE_SSH_OPTIONS)
|
||||
common_args = spec.get("ssh_common_args") or context.get("ssh_common_args")
|
||||
if common_args:
|
||||
ssh_options.extend(shlex.split(common_args))
|
||||
ssh_port = spec.get("ssh_port") or context.get("ssh_port")
|
||||
if ssh_port:
|
||||
ssh_options.extend(["-p", str(ssh_port)])
|
||||
identity_file = spec.get("ssh_identity_file") or context.get("ssh_identity_file")
|
||||
if identity_file:
|
||||
ssh_options.extend(["-i", identity_file])
|
||||
command_list = ["ssh", *ssh_options, ssh_target, rendered_command]
|
||||
display = " ".join(shlex.quote(part) for part in command_list)
|
||||
return command_list, display, False, "ssh"
|
||||
# default to local shell execution
|
||||
display = rendered_command
|
||||
return rendered_command, display, True, "local"
|
||||
|
||||
|
||||
def run_subprocess_command(
|
||||
command: Any,
|
||||
display: str,
|
||||
summary: str,
|
||||
use_shell: bool,
|
||||
runner_label: str,
|
||||
) -> Dict[str, Any]:
|
||||
"""Execute subprocess command and capture results."""
|
||||
LOGGER.info("Executing command (%s) via %s: %s", summary, runner_label, display)
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=TRIAGE_COMMAND_TIMEOUT,
|
||||
shell=use_shell,
|
||||
check=False,
|
||||
)
|
||||
result = {
|
||||
"summary": summary,
|
||||
"command": display,
|
||||
"runner": runner_label,
|
||||
"exit_code": completed.returncode,
|
||||
"stdout": (completed.stdout or "").strip(),
|
||||
"stderr": (completed.stderr or "").strip(),
|
||||
"status": "ok" if completed.returncode == 0 else "failed",
|
||||
}
|
||||
log_verbose(f"Command result ({summary})", result)
|
||||
return result
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
result = {
|
||||
"summary": summary,
|
||||
"command": display,
|
||||
"runner": runner_label,
|
||||
"exit_code": None,
|
||||
"stdout": truncate_text((exc.stdout or "").strip()),
|
||||
"stderr": truncate_text((exc.stderr or "").strip()),
|
||||
"status": "timeout",
|
||||
"error": f"Command timed out after {TRIAGE_COMMAND_TIMEOUT}s",
|
||||
}
|
||||
log_verbose(f"Command timeout ({summary})", result)
|
||||
return result
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
LOGGER.exception("Command execution failed (%s): %s", summary, exc)
|
||||
result = {
|
||||
"summary": summary,
|
||||
"command": display,
|
||||
"runner": runner_label,
|
||||
"exit_code": None,
|
||||
"stdout": "",
|
||||
"stderr": "",
|
||||
"status": "error",
|
||||
"error": str(exc),
|
||||
}
|
||||
log_verbose(f"Command error ({summary})", result)
|
||||
return result
|
||||
|
||||
|
||||
def run_command_spec(spec: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
summary = spec.get("summary") or spec.get("name") or "command"
|
||||
shell_cmd = spec.get("shell")
|
||||
if not shell_cmd:
|
||||
return {"summary": summary, "status": "skipped", "error": "No shell command provided."}
|
||||
rendered = render_template(shell_cmd, context)
|
||||
preferred_runner = context.get("preferred_runner")
|
||||
runner_choice = (spec.get("runner") or preferred_runner or TRIAGE_COMMAND_RUNNER).lower()
|
||||
try:
|
||||
command, display, use_shell, runner_label = build_runner_command(rendered, runner_choice, context, spec)
|
||||
except RuntimeError as exc:
|
||||
LOGGER.warning("Skipping command '%s': %s", summary, exc)
|
||||
return {"summary": summary, "status": "skipped", "error": str(exc), "command": rendered}
|
||||
return run_subprocess_command(command, display, summary, use_shell, runner_label)
|
||||
|
||||
|
||||
def execute_triage_commands(entry: Dict[str, Any], alert: Dict[str, Any], context: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
host_os = context.get("host_os") or determine_host_os(alert)
|
||||
context["host_os"] = host_os
|
||||
specs = gather_command_specs(entry, host_os)
|
||||
if not specs:
|
||||
LOGGER.info("No triage commands defined for host_os=%s", host_os)
|
||||
return []
|
||||
if not TRIAGE_ENABLE_COMMANDS:
|
||||
LOGGER.info("Command execution disabled; %d commands queued but skipped.", len(specs))
|
||||
return []
|
||||
LOGGER.info("Executing up to %d triage commands for host_os=%s", len(specs), host_os)
|
||||
results = []
|
||||
for spec in specs:
|
||||
results.append(run_command_spec(spec, context))
|
||||
return results
|
||||
|
||||
|
||||
def format_command_results_for_llm(results: List[Dict[str, Any]]) -> str:
|
||||
lines: List[str] = []
|
||||
for idx, result in enumerate(results, start=1):
|
||||
lines.append(f"{idx}. {result.get('summary')} [{result.get('status')}] {result.get('command')}")
|
||||
stdout = result.get("stdout")
|
||||
stderr = result.get("stderr")
|
||||
error = result.get("error")
|
||||
if stdout:
|
||||
lines.append(" stdout:")
|
||||
lines.append(indent(truncate_text(stdout), " "))
|
||||
if stderr:
|
||||
lines.append(" stderr:")
|
||||
lines.append(indent(truncate_text(stderr), " "))
|
||||
if error and result.get("status") != "ok":
|
||||
lines.append(f" error: {error}")
|
||||
if not lines:
|
||||
return "No command results were available."
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def parse_structured_response(text: str) -> Optional[Dict[str, Any]]:
|
||||
cleaned = text.strip()
|
||||
try:
|
||||
return json.loads(cleaned)
|
||||
except json.JSONDecodeError:
|
||||
start = cleaned.find("{")
|
||||
end = cleaned.rfind("}")
|
||||
if start != -1 and end != -1 and end > start:
|
||||
snippet = cleaned[start : end + 1]
|
||||
try:
|
||||
return json.loads(snippet)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def normalize_followup_command(item: Dict[str, Any]) -> Dict[str, Any]:
|
||||
return {
|
||||
"summary": item.get("summary") or item.get("name") or "Follow-up command",
|
||||
"shell": item.get("command") or item.get("shell"),
|
||||
"runner": item.get("runner"),
|
||||
"host": item.get("host") or item.get("target"),
|
||||
"ssh_user": item.get("ssh_user"),
|
||||
"os": (item.get("os") or item.get("platform") or "").lower() or None,
|
||||
}
|
||||
|
||||
|
||||
def investigate_with_langchain(
|
||||
entry: Dict[str, Any],
|
||||
alert: Dict[str, Any],
|
||||
parent_payload: Dict[str, Any],
|
||||
context: Dict[str, Any],
|
||||
initial_outputs: List[Dict[str, Any]],
|
||||
) -> Tuple[str, List[Dict[str, Any]]]:
|
||||
command_outputs = list(initial_outputs)
|
||||
prompt = build_prompt(entry, alert, parent_payload, context, command_outputs)
|
||||
log_verbose("Initial investigation prompt", prompt)
|
||||
if not OPENROUTER_API_KEY:
|
||||
return "OPENROUTER_API_KEY is not configured; unable to analyze alert.", command_outputs
|
||||
|
||||
llm = OpenRouterLLM(api_key=OPENROUTER_API_KEY, model_name=OPENROUTER_MODEL)
|
||||
dialogue = (
|
||||
prompt
|
||||
+ "\n\nRespond with JSON containing fields analysis, followup_commands, and complete. "
|
||||
"Request commands only when more evidence is required."
|
||||
)
|
||||
total_followup = 0
|
||||
final_summary = ""
|
||||
for iteration in range(TRIAGE_MAX_ITERATIONS):
|
||||
log_verbose(f"LLM dialogue iteration {iteration + 1}", dialogue)
|
||||
llm_text = llm(dialogue)
|
||||
log_verbose(f"LLM iteration {iteration + 1} output", llm_text)
|
||||
dialogue += f"\nAssistant:\n{llm_text}\n"
|
||||
parsed = parse_structured_response(llm_text)
|
||||
if parsed:
|
||||
log_verbose(f"LLM iteration {iteration + 1} parsed response", parsed)
|
||||
if not parsed:
|
||||
final_summary = llm_text
|
||||
break
|
||||
|
||||
analysis = parsed.get("analysis") or ""
|
||||
followups = parsed.get("followup_commands") or parsed.get("commands") or []
|
||||
final_summary = analysis
|
||||
complete_flag = bool(parsed.get("complete"))
|
||||
|
||||
if complete_flag or not followups:
|
||||
break
|
||||
|
||||
log_verbose(f"LLM iteration {iteration + 1} requested follow-ups", followups)
|
||||
allowed = max(0, TRIAGE_FOLLOWUP_MAX_COMMANDS - total_followup)
|
||||
if not TRIAGE_ENABLE_COMMANDS or allowed <= 0:
|
||||
dialogue += (
|
||||
"\nUser:\nCommand execution is disabled or budget exhausted. Provide final analysis with JSON format.\n"
|
||||
)
|
||||
continue
|
||||
|
||||
normalized_cmds: List[Dict[str, Any]] = []
|
||||
for raw in followups:
|
||||
if not isinstance(raw, dict):
|
||||
continue
|
||||
normalized = normalize_followup_command(raw)
|
||||
if not normalized.get("shell"):
|
||||
continue
|
||||
cmd_os = normalized.get("os")
|
||||
if cmd_os and cmd_os != context.get("host_os"):
|
||||
continue
|
||||
normalized_cmds.append(normalized)
|
||||
|
||||
log_verbose(f"Normalized follow-up commands (iteration {iteration + 1})", normalized_cmds)
|
||||
if not normalized_cmds:
|
||||
dialogue += "\nUser:\nNo valid commands to run. Finalize analysis in JSON format.\n"
|
||||
continue
|
||||
|
||||
normalized_cmds = normalized_cmds[:allowed]
|
||||
executed_batch: List[Dict[str, Any]] = []
|
||||
for spec in normalized_cmds:
|
||||
executed = run_command_spec(spec, context)
|
||||
command_outputs.append(executed)
|
||||
executed_batch.append(executed)
|
||||
total_followup += 1
|
||||
|
||||
result_text = "Follow-up command results:\n" + format_command_results_for_llm(executed_batch)
|
||||
dialogue += (
|
||||
"\nUser:\n"
|
||||
+ result_text
|
||||
+ "\nUpdate your analysis and respond with JSON (analysis, followup_commands, complete).\n"
|
||||
)
|
||||
log_verbose("Executed follow-up commands", result_text)
|
||||
else:
|
||||
final_summary = final_summary or "Reached maximum iterations without a conclusive response."
|
||||
|
||||
if not final_summary:
|
||||
final_summary = "LLM did not return a valid analysis."
|
||||
|
||||
log_verbose("Final LLM summary", final_summary)
|
||||
return final_summary, command_outputs
|
||||
|
||||
|
||||
def build_context(alert: Dict[str, Any], parent_payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
labels = alert.get("labels", {})
|
||||
annotations = alert.get("annotations", {})
|
||||
context = {
|
||||
"alertname": labels.get("alertname") or alert.get("title") or parent_payload.get("title") or parent_payload.get("ruleName"),
|
||||
"host": labels.get("host") or labels.get("instance"),
|
||||
"iface": labels.get("interface"),
|
||||
"device": labels.get("device"),
|
||||
"vmid": labels.get("vmid"),
|
||||
"status": alert.get("status") or parent_payload.get("status"),
|
||||
"value": alert.get("value") or annotations.get("value"),
|
||||
"rule_url": alert.get("ruleUrl") or parent_payload.get("ruleUrl"),
|
||||
}
|
||||
context.setdefault("ssh_user", TRIAGE_SSH_USER)
|
||||
return context
|
||||
|
||||
|
||||
def build_prompt(
|
||||
entry: Dict[str, Any],
|
||||
alert: Dict[str, Any],
|
||||
parent_payload: Dict[str, Any],
|
||||
context: Dict[str, Any],
|
||||
command_outputs: Optional[List[Dict[str, Any]]] = None,
|
||||
) -> str:
|
||||
template = entry.get("llm_prompt", "Alert {{ alertname }} fired for {{ host }}.")
|
||||
rendered_template = render_template(template, {k: v or "" for k, v in context.items()})
|
||||
|
||||
evidence = entry.get("evidence_to_collect", [])
|
||||
triage_steps = entry.get("triage", [])
|
||||
remediation = entry.get("remediation", [])
|
||||
|
||||
lines = [
|
||||
rendered_template.strip(),
|
||||
"",
|
||||
"Alert payload summary:",
|
||||
f"- Status: {context.get('status') or alert.get('status')}",
|
||||
f"- Host: {context.get('host')}",
|
||||
f"- Value: {context.get('value')}",
|
||||
f"- StartsAt: {alert.get('startsAt')}",
|
||||
f"- EndsAt: {alert.get('endsAt')}",
|
||||
f"- RuleURL: {context.get('rule_url')}",
|
||||
f"- Host OS (inferred): {context.get('host_os')}",
|
||||
"- Note: All timestamps are UTC/RFC3339 as provided by Grafana.",
|
||||
summarize_dict("- Labels", alert.get("labels")),
|
||||
summarize_dict("- Annotations", alert.get("annotations")),
|
||||
]
|
||||
|
||||
if evidence:
|
||||
lines.append("")
|
||||
lines.append("Evidence to gather (for automation reference):")
|
||||
for item in evidence:
|
||||
lines.append(f"- {item}")
|
||||
|
||||
if triage_steps:
|
||||
lines.append("")
|
||||
lines.append("Suggested manual checks:")
|
||||
for step in triage_steps:
|
||||
summary = step.get("summary")
|
||||
linux = step.get("linux")
|
||||
windows = step.get("windows")
|
||||
lines.append(f"- {summary}")
|
||||
if linux:
|
||||
lines.append(f" Linux: {linux}")
|
||||
if windows:
|
||||
lines.append(f" Windows: {windows}")
|
||||
|
||||
if remediation:
|
||||
lines.append("")
|
||||
lines.append("Remediation ideas:")
|
||||
for item in remediation:
|
||||
lines.append(f"- {item}")
|
||||
|
||||
if command_outputs:
|
||||
lines.append("")
|
||||
lines.append("Command execution results:")
|
||||
for result in command_outputs:
|
||||
status = result.get("status", "unknown")
|
||||
cmd_display = result.get("command", "")
|
||||
lines.append(f"- {result.get('summary')} [{status}] {cmd_display}")
|
||||
stdout = result.get("stdout")
|
||||
stderr = result.get("stderr")
|
||||
error = result.get("error")
|
||||
if stdout:
|
||||
lines.append(" stdout:")
|
||||
lines.append(indent(truncate_text(stdout), " "))
|
||||
if stderr:
|
||||
lines.append(" stderr:")
|
||||
lines.append(indent(truncate_text(stderr), " "))
|
||||
if error and status != "ok":
|
||||
lines.append(f" error: {error}")
|
||||
|
||||
return "\n".join(lines).strip()
|
||||
|
||||
|
||||
def get_alerts(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
alerts = payload.get("alerts")
|
||||
if isinstance(alerts, list) and alerts:
|
||||
return alerts
|
||||
return [payload]
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
def startup_event() -> None:
|
||||
global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
|
||||
_RUNBOOK_INDEX = load_runbook()
|
||||
_INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory()
|
||||
LOGGER.info(
|
||||
"Alert webhook server ready with %d runbook entries and %d inventory hosts.",
|
||||
len(_RUNBOOK_INDEX),
|
||||
len(_INVENTORY_INDEX),
|
||||
)
|
||||
|
||||
|
||||
@app.post("/alerts")
|
||||
async def handle_alert(request: Request) -> Dict[str, Any]:
|
||||
payload = await request.json()
|
||||
LOGGER.info("Received Grafana payload: %s", json.dumps(payload, indent=2, sort_keys=True))
|
||||
results = []
|
||||
unmatched = []
|
||||
for alert in get_alerts(payload):
|
||||
LOGGER.info("Processing alert: %s", json.dumps(alert, indent=2, sort_keys=True))
|
||||
unmatched_reason: Optional[str] = None
|
||||
alert_status = str(alert.get("status") or payload.get("status") or "").lower()
|
||||
if alert_status and alert_status != "firing":
|
||||
details = {"reason": "non_firing_status", "status": alert_status, "alert": alert}
|
||||
unmatched.append(details)
|
||||
LOGGER.info("Skipping alert with status=%s (only 'firing' alerts are processed).", alert_status)
|
||||
continue
|
||||
rule_uid = extract_rule_uid(alert, payload)
|
||||
if not rule_uid:
|
||||
unmatched_reason = "missing_rule_uid"
|
||||
derived_uid = derive_fallback_rule_uid(alert, payload)
|
||||
details = {"reason": unmatched_reason, "derived_rule_uid": derived_uid, "alert": alert}
|
||||
unmatched.append(details)
|
||||
LOGGER.warning("Alert missing rule UID, using fallback identifier %s", derived_uid)
|
||||
rule_uid = derived_uid
|
||||
entry = _RUNBOOK_INDEX.get(rule_uid)
|
||||
runbook_matched = entry is not None
|
||||
if not entry:
|
||||
unmatched_reason = unmatched_reason or "no_runbook_entry"
|
||||
details = {"reason": unmatched_reason, "rule_uid": rule_uid, "alert": alert}
|
||||
unmatched.append(details)
|
||||
LOGGER.warning("No runbook entry for rule_uid=%s, using generic fallback.", rule_uid)
|
||||
entry = build_fallback_runbook_entry(alert, payload)
|
||||
context = build_context(alert, payload)
|
||||
context["host_os"] = determine_host_os(alert)
|
||||
context["rule_uid"] = rule_uid
|
||||
apply_inventory_context(context)
|
||||
initial_outputs = execute_triage_commands(entry, alert, context)
|
||||
try:
|
||||
llm_text, command_outputs = investigate_with_langchain(entry, alert, payload, context, initial_outputs)
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
LOGGER.exception("Investigation failed for rule_uid=%s: %s", rule_uid, exc)
|
||||
raise HTTPException(status_code=502, detail=f"LLM investigation error: {exc}") from exc
|
||||
result = {
|
||||
"rule_uid": rule_uid,
|
||||
"alertname": entry.get("name"),
|
||||
"host": alert.get("labels", {}).get("host"),
|
||||
"llm_summary": llm_text,
|
||||
"command_results": command_outputs,
|
||||
"runbook_matched": runbook_matched,
|
||||
}
|
||||
if not runbook_matched and unmatched_reason:
|
||||
result["fallback_reason"] = unmatched_reason
|
||||
results.append(result)
|
||||
send_summary_email(alert, result, context)
|
||||
return {"processed": len(results), "results": results, "unmatched": unmatched}
|
||||
|
||||
|
||||
@app.post("/reload-runbook")
|
||||
def reload_runbook() -> Dict[str, Any]:
|
||||
global _RUNBOOK_INDEX, _INVENTORY_INDEX, _INVENTORY_GROUP_VARS
|
||||
_RUNBOOK_INDEX = load_runbook()
|
||||
_INVENTORY_INDEX, _INVENTORY_GROUP_VARS = load_ansible_inventory()
|
||||
return {"entries": len(_RUNBOOK_INDEX), "inventory_hosts": len(_INVENTORY_INDEX)}
|
||||
178
stacks/mllogwatcher/scripts/log_monitor.py
Executable file
178
stacks/mllogwatcher/scripts/log_monitor.py
Executable file
@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Log anomaly checker that queries Elasticsearch and asks an OpenRouter-hosted LLM
|
||||
for a quick triage summary. Intended to be run on a schedule (cron/systemd).
|
||||
|
||||
Required environment variables:
|
||||
ELASTIC_HOST e.g. https://casper.localdomain:9200
|
||||
ELASTIC_API_KEY Base64 ApiKey used for Elasticsearch requests
|
||||
OPENROUTER_API_KEY Token for https://openrouter.ai/
|
||||
|
||||
Optional environment variables:
|
||||
OPENROUTER_MODEL Model identifier (default: openai/gpt-4o-mini)
|
||||
OPENROUTER_REFERER Passed through as HTTP-Referer header
|
||||
OPENROUTER_TITLE Passed through as X-Title header
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
import os
|
||||
import sys
|
||||
from typing import Any, Iterable
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def utc_iso(ts: dt.datetime) -> str:
|
||||
"""Return an ISO8601 string with Z suffix."""
|
||||
return ts.replace(microsecond=0).isoformat() + "Z"
|
||||
|
||||
|
||||
def query_elasticsearch(
|
||||
host: str,
|
||||
api_key: str,
|
||||
index_pattern: str,
|
||||
minutes: int,
|
||||
size: int,
|
||||
verify: bool,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Fetch recent logs from Elasticsearch."""
|
||||
end = dt.datetime.utcnow()
|
||||
start = end - dt.timedelta(minutes=minutes)
|
||||
url = f"{host.rstrip('/')}/{index_pattern}/_search"
|
||||
payload = {
|
||||
"size": size,
|
||||
"sort": [{"@timestamp": {"order": "desc"}}],
|
||||
"query": {
|
||||
"range": {
|
||||
"@timestamp": {
|
||||
"gte": utc_iso(start),
|
||||
"lte": utc_iso(end),
|
||||
}
|
||||
}
|
||||
},
|
||||
"_source": ["@timestamp", "message", "host.name", "container.image.name", "log.level"],
|
||||
}
|
||||
headers = {
|
||||
"Authorization": f"ApiKey {api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
response = requests.post(url, json=payload, headers=headers, timeout=30, verify=verify)
|
||||
response.raise_for_status()
|
||||
hits = response.json().get("hits", {}).get("hits", [])
|
||||
return hits
|
||||
|
||||
|
||||
def build_prompt(logs: Iterable[dict[str, Any]], limit_messages: int) -> str:
|
||||
"""Create the prompt that will be sent to the LLM."""
|
||||
selected = []
|
||||
for idx, hit in enumerate(logs):
|
||||
if idx >= limit_messages:
|
||||
break
|
||||
source = hit.get("_source", {})
|
||||
message = source.get("message") or source.get("event", {}).get("original") or ""
|
||||
timestamp = source.get("@timestamp", "unknown time")
|
||||
host = source.get("host", {}).get("name") or source.get("host", {}).get("hostname") or "unknown-host"
|
||||
container = source.get("container", {}).get("image", {}).get("name") or ""
|
||||
level = source.get("log", {}).get("level") or source.get("log.level") or ""
|
||||
selected.append(
|
||||
f"[{timestamp}] host={host} level={level} container={container}\n{message}".strip()
|
||||
)
|
||||
|
||||
if not selected:
|
||||
return "No logs were returned from Elasticsearch in the requested window."
|
||||
|
||||
prompt = (
|
||||
"You are assisting with HomeLab observability. Review the following log entries collected from "
|
||||
"Elasticsearch and highlight any notable anomalies, errors, or emerging issues. "
|
||||
"Explain the impact and suggest next steps when applicable. "
|
||||
"Use concise bullet points. Logs:\n\n"
|
||||
+ "\n\n".join(selected)
|
||||
)
|
||||
return prompt
|
||||
|
||||
|
||||
def call_openrouter(prompt: str, model: str, api_key: str, referer: str | None, title: str | None) -> str:
|
||||
"""Send prompt to OpenRouter and return the model response text."""
|
||||
url = "https://openrouter.ai/api/v1/chat/completions"
|
||||
headers = {
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
if referer:
|
||||
headers["HTTP-Referer"] = referer
|
||||
if title:
|
||||
headers["X-Title"] = title
|
||||
|
||||
body = {
|
||||
"model": model,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a senior SRE helping analyze log anomalies."},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
}
|
||||
|
||||
response = requests.post(url, json=body, headers=headers, timeout=60)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
choices = data.get("choices", [])
|
||||
if not choices:
|
||||
raise RuntimeError("OpenRouter response did not include choices")
|
||||
return choices[0]["message"]["content"]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Query Elasticsearch and summarize logs with OpenRouter.")
|
||||
parser.add_argument("--host", default=os.environ.get("ELASTIC_HOST"), help="Elasticsearch host URL")
|
||||
parser.add_argument("--api-key", default=os.environ.get("ELASTIC_API_KEY"), help="Elasticsearch ApiKey")
|
||||
parser.add_argument("--index", default="log*", help="Index pattern (default: log*)")
|
||||
parser.add_argument("--minutes", type=int, default=60, help="Lookback window in minutes (default: 60)")
|
||||
parser.add_argument("--size", type=int, default=200, help="Max number of logs to fetch (default: 200)")
|
||||
parser.add_argument("--message-limit", type=int, default=50, help="Max log lines sent to LLM (default: 50)")
|
||||
parser.add_argument("--openrouter-model", default=os.environ.get("OPENROUTER_MODEL", "openai/gpt-4o-mini"))
|
||||
parser.add_argument("--insecure", action="store_true", help="Disable TLS verification for Elasticsearch")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
if not args.host or not args.api_key:
|
||||
print("ELASTIC_HOST and ELASTIC_API_KEY must be provided via environment or CLI", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
logs = query_elasticsearch(
|
||||
host=args.host,
|
||||
api_key=args.api_key,
|
||||
index_pattern=args.index,
|
||||
minutes=args.minutes,
|
||||
size=args.size,
|
||||
verify=not args.insecure,
|
||||
)
|
||||
|
||||
prompt = build_prompt(logs, limit_messages=args.message_limit)
|
||||
if not prompt.strip() or prompt.startswith("No logs"):
|
||||
print(prompt)
|
||||
return 0
|
||||
|
||||
openrouter_key = os.environ.get("OPENROUTER_API_KEY")
|
||||
if not openrouter_key:
|
||||
print("OPENROUTER_API_KEY is required to summarize logs", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
referer = os.environ.get("OPENROUTER_REFERER")
|
||||
title = os.environ.get("OPENROUTER_TITLE", "Elastic Log Monitor")
|
||||
response_text = call_openrouter(
|
||||
prompt=prompt,
|
||||
model=args.openrouter_model,
|
||||
api_key=openrouter_key,
|
||||
referer=referer,
|
||||
title=title,
|
||||
)
|
||||
print(response_text.strip())
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
17
stacks/mllogwatcher/testing.py
Executable file
17
stacks/mllogwatcher/testing.py
Executable file
@ -0,0 +1,17 @@
|
||||
# pip install -qU langchain "langchain[anthropic]"
|
||||
from langchain.agents import create_agent
|
||||
|
||||
def get_weather(city: str) -> str:
|
||||
"""Get weather for a given city."""
|
||||
return f"It's always sunny in {city}!"
|
||||
|
||||
agent = create_agent(
|
||||
model="claude-sonnet-4-5-20250929",
|
||||
tools=[get_weather],
|
||||
system_prompt="You are a helpful assistant",
|
||||
)
|
||||
|
||||
# Run the agent
|
||||
agent.invoke(
|
||||
{"messages": [{"role": "user", "content": "what is the weather in sf"}]}
|
||||
)
|
||||
10
stacks/mllogwatcher/worklog-2025-12-29.txt
Normal file
10
stacks/mllogwatcher/worklog-2025-12-29.txt
Normal file
@ -0,0 +1,10 @@
|
||||
# Worklog – 2025-12-29
|
||||
|
||||
1. Added containerization assets for grafana_alert_webhook:
|
||||
- `Dockerfile`, `.dockerignore`, `docker-compose.yml`, `.env.example`, and consolidated `requirements.txt`.
|
||||
- Compose mounts the runbook, `/etc/ansible/hosts`, and `.ssh` so SSH automation works inside the container.
|
||||
- README now documents the compose workflow.
|
||||
2. Copied knight’s SSH key to `.ssh/webhook_id_rsa` and updated `jet-alone` inventory entry with `ansible_user` + `ansible_ssh_private_key_file` so remote commands can run non-interactively.
|
||||
3. Updated `OpenRouterLLM` to satisfy Pydantic’s field validation inside the container.
|
||||
4. Brought the webhook up under Docker Compose, tested alerts end-to-end, and reverted `OPENROUTER_MODEL` to the valid `openai/gpt-5.1-codex-max`.
|
||||
5. Created `/var/core/ansible/ops_baseline.yml` to install sysstat/iotop/smartmontools/hdparm and enforce synchronized Bash history (`/etc/profile.d/99-bash-history.sh`). Ran the playbook against the primary LAN hosts; noted remediation items for the few that failed (outdated mirrors, pending grub configuration, missing sudo password).
|
||||
26
stacks/network-mcp/.env.example
Normal file
26
stacks/network-mcp/.env.example
Normal file
@ -0,0 +1,26 @@
|
||||
ES_URL=http://elasticsearch:9200
|
||||
# Elasticsearch API Key authentication (preferred over user/pass)
|
||||
ES_API_ID=
|
||||
ES_API_KEY=
|
||||
|
||||
# Or, Elasticsearch Basic authentication (if no API key)
|
||||
# ES_USER=elastic
|
||||
# ES_PASS=changeme
|
||||
ES_VERIFY_SSL=false
|
||||
|
||||
OPNSENSE_URL=https://192.168.1.1
|
||||
OPNSENSE_API_KEY=your_key
|
||||
OPNSENSE_API_SECRET=your_secret
|
||||
|
||||
COLLECTOR_INTERVAL_SECONDS=60
|
||||
|
||||
NMAP_INTERVAL_SECONDS=300
|
||||
NMAP_PORT_RANGE=1-1024
|
||||
NMAP_BATCH_SIZE=10
|
||||
NMAP_DISCOVERY_ENABLED=true
|
||||
NMAP_DISCOVERY_INTERVAL_SECONDS=3600
|
||||
NMAP_DISCOVERY_VLANS=
|
||||
NMAP_DISCOVERY_EXTRA_ARGS="-sn -n"
|
||||
NMAP_QUICK_BATCH_SIZE=30
|
||||
NMAP_QUICK_EXTRA_ARGS="-sS --top-ports 100 -T4 --open -Pn"
|
||||
NMAP_FULL_INTERVAL_SECONDS=86400
|
||||
26
stacks/network-mcp/.env.template
Normal file
26
stacks/network-mcp/.env.template
Normal file
@ -0,0 +1,26 @@
|
||||
ES_URL=http://elasticsearch:9200
|
||||
# Elasticsearch API Key authentication (preferred over user/pass)
|
||||
ES_API_ID=
|
||||
ES_API_KEY=
|
||||
|
||||
# Or, Elasticsearch Basic authentication (if no API key)
|
||||
# ES_USER=elastic
|
||||
# ES_PASS=changeme
|
||||
ES_VERIFY_SSL=false
|
||||
|
||||
OPNSENSE_URL=https://192.168.1.1
|
||||
OPNSENSE_API_KEY=your_key
|
||||
OPNSENSE_API_SECRET=your_secret
|
||||
|
||||
COLLECTOR_INTERVAL_SECONDS=60
|
||||
|
||||
NMAP_INTERVAL_SECONDS=300
|
||||
NMAP_PORT_RANGE=1-1024
|
||||
NMAP_BATCH_SIZE=10
|
||||
NMAP_DISCOVERY_ENABLED=true
|
||||
NMAP_DISCOVERY_INTERVAL_SECONDS=3600
|
||||
NMAP_DISCOVERY_VLANS=
|
||||
NMAP_DISCOVERY_EXTRA_ARGS="-sn -n"
|
||||
NMAP_QUICK_BATCH_SIZE=30
|
||||
NMAP_QUICK_EXTRA_ARGS="-sS --top-ports 100 -T4 --open -Pn"
|
||||
NMAP_FULL_INTERVAL_SECONDS=86400
|
||||
11
stacks/network-mcp/.gitignore
vendored
Normal file
11
stacks/network-mcp/.gitignore
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
.env
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.DS_Store
|
||||
|
||||
# Local/infra
|
||||
node_modules/
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
76
stacks/network-mcp/PROJECT_SUMMARY.md
Normal file
76
stacks/network-mcp/PROJECT_SUMMARY.md
Normal file
@ -0,0 +1,76 @@
|
||||
# Network MCP - Project Summary
|
||||
|
||||
## Overview
|
||||
This project is a long-running Network MCP service that merges OPNsense discovery data, Nmap scans, and static inventory into Elasticsearch, then exposes both a minimal web UI and a full MCP JSON-RPC interface for LLM agents. It runs via Docker Compose and is now located at `/var/core/network-mcp`.
|
||||
|
||||
## What We Built
|
||||
- **Collectors**
|
||||
- OPNsense collector ingests DHCP/ARP/DNS and overlays inventory targets.
|
||||
- Nmap collector performs discovery and port scans.
|
||||
- Data lands in Elasticsearch: `network-hosts` (current state) and `network-events-*` (historical events).
|
||||
- **Inventory merge**
|
||||
- Inventory data from `inventory_targets.yml` is merged onto live hosts by IP when a MAC is known (so live MAC-based records carry inventory notes/expected ports).
|
||||
- **Frontend**
|
||||
- Flask UI + JSON API, containerized with Gunicorn and exposed on port `5001` for LAN access.
|
||||
- **MCP server**
|
||||
- JSON-RPC endpoint at `/.well-known/mcp.json` (and `/api/mcp`) supports:
|
||||
- `initialize`, `ping`, `tools/list`, `tools/call`
|
||||
- `resources/list`, `resources/read`, `resources/templates/list`
|
||||
- Tool schemas include titles, descriptions, input/output schemas, and annotations (read-only hints).
|
||||
- Resource templates provide snapshot + query access (e.g. `network://hosts?q=...`).
|
||||
- **Search behavior**
|
||||
- Host search is case-insensitive across name/hostname/IP/MAC.
|
||||
- **Tests**
|
||||
- Unit tests for REST and MCP search by hostname/IP/MAC, MCP resource reads, and MCP notifications.
|
||||
|
||||
## Key Endpoints
|
||||
- UI: `http://<host>:5001/`
|
||||
- REST:
|
||||
- `GET /api/hosts` (supports `q`, `source`, `limit`)
|
||||
- `GET /api/hosts/<host_id>`
|
||||
- `GET /api/events`
|
||||
- `GET /api/hosts/<host_id>/events`
|
||||
- `GET /api/map`
|
||||
- MCP JSON-RPC: `POST /.well-known/mcp.json`
|
||||
|
||||
## MCP Tools (JSON-RPC)
|
||||
- `list_hosts` (search by hostname/IP/MAC; case-insensitive)
|
||||
- `get_host` (optional events)
|
||||
- `list_events`
|
||||
- `host_events`
|
||||
- `network_map`
|
||||
|
||||
## MCP Resources
|
||||
- `resources/list` -> `network://hosts`, `network://map`, `network://events`
|
||||
- `resources/templates/list` -> query templates such as:
|
||||
- `network://hosts{?q,source,limit}`
|
||||
- `network://host/{host_id}{?include_events,events_limit}`
|
||||
- `network://events{?host_id,type,since,limit}`
|
||||
|
||||
## Docker & Repo State
|
||||
- Repo path: `/var/core/network-mcp`
|
||||
- `inventory_targets.yml` lives in the repo and is mounted via compose.
|
||||
- Services run via `docker-compose up -d`.
|
||||
- Git repo initialized and initial commit created.
|
||||
|
||||
## Gotchas / Pitfalls We Hit
|
||||
- **MCP handshake**: Codex sent `notifications/initialized` without `id` (notification). Returning a response caused the transport to close. Fixed by treating notifications as no-response.
|
||||
- **Case-sensitive search**: Elasticsearch wildcard on `.keyword` fields was case-sensitive, so `seele` didn’t match `SEELE`. Fixed via `case_insensitive: true` in wildcard queries (see the query sketch after this list).
|
||||
- **Inventory merge duplication**: Initial inventory-only docs were `ip:*` and live docs were `mac:*`, so both existed. Merge now attaches inventory to live MAC records by IP. Legacy `ip:*` docs may remain stale unless cleaned.
|
||||
- **MCP errors**: Tool errors are now returned as `CallToolResult` with `isError: true` (instead of JSON-RPC errors), so LLMs can see and correct issues.
|
||||
- **Service move**: Repo moved from `/var/core/ansible/network-mcp` to `/var/core/network-mcp`. Compose mount paths updated.
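A minimal sketch of the kind of wildcard clause that fixes the case-sensitivity gotcha, assuming a `host.name.keyword`-style field (the exact field name used by the frontend is not shown here):

```python
# Hypothetical field name; the real query lives in the frontend's search code.
wildcard_clause = {
    "wildcard": {
        "host.name.keyword": {
            "value": "*seele*",
            "case_insensitive": True,  # without this, "*seele*" will not match "SEELE"
        }
    }
}
```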
|
||||
|
||||
## Verification Performed
|
||||
- REST search works for hostname/IP/MAC.
|
||||
- MCP `initialize`, `tools/list`, `tools/call` work.
|
||||
- MCP resource list/templates/read work.
|
||||
- Services verified running via `docker-compose up -d`.
|
||||
|
||||
## Future Work Ideas
|
||||
- **Cleanup**: Add a cleanup job to remove stale `ip:*` docs after successful MAC merge.
|
||||
- **Resource subscriptions**: Implement `resources/subscribe` if clients need push updates.
|
||||
- **Auth**: Optional token on the MCP endpoint for shared LAN exposure.
|
||||
- **More UI**: Add filters/alerts for stale hosts or missing expected ports.
|
||||
- **Metrics**: Export collector stats to detect scan/ingest failures.
|
||||
- **Schema mapping**: Improve Elasticsearch mappings for search (e.g., lowercase normalizers for names/hostnames).
|
||||
|
||||
105
stacks/network-mcp/README.md
Normal file
105
stacks/network-mcp/README.md
Normal file
@ -0,0 +1,105 @@
|
||||
# Network MCP
|
||||
|
||||
A "source of truth" for network devices and ports, backed by Elasticsearch, OPNsense, and Nmap.
|
||||
|
||||
## Architecture
|
||||
|
||||
- **Elasticsearch**: Stores current state (`network-hosts`) and historical events (`network-events-*`).
|
||||
- **OPNsense Collector**: Fetches DHCP/ARP/DNS data to discover hosts.
|
||||
- **Nmap Collector**: Scans discovered hosts for open ports and OS info.
|
||||
|
||||
## Setup
|
||||
|
||||
1. **Environment Config**
|
||||
Copy `.env.example` to `.env` and fill in your details:
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# Edit .env
|
||||
```
|
||||
|
||||
2. **Bootstrap Elastic**
|
||||
Run the bootstrap script (requires `requests` installed locally, or you can run it inside a container):
|
||||
```bash
|
||||
python3 scripts/bootstrap_indices.py
|
||||
```
|
||||
*Note: Ensure you have connectivity to your Elasticsearch instance.*
|
||||
|
||||
3. **Start Services**
|
||||
```bash
|
||||
docker-compose up -d --build
|
||||
```
|
||||
This brings up the collectors and the lightweight frontend (reachable on port `5001`).
|
||||
|
||||
## Configuration
|
||||
|
||||
- **Static Metadata**: Edit `static/host_metadata.json` to add manual notes, roles, or tags to hosts (keyed by `mac:xx:xx...`).
|
||||
- **Intervals**: Adjust polling intervals in `.env`.
|
||||
- **VLAN Discovery (default on)**: Discovery sweeps (`nmap -sn`) run periodically across the OPNsense interfaces listed in `NMAP_DISCOVERY_VLANS`; leaving that list empty sweeps every interface OPNsense reports. Trim the list (or set the flag to `false`) if you only want targeted subnets.
|
||||
- **Quick vs Full Port Scans**: Each collector loop runs a fast, common-port sweep (`NMAP_QUICK_EXTRA_ARGS`, `NMAP_QUICK_BATCH_SIZE`) while a deeper service scan (`NMAP_PORT_RANGE`, `NMAP_BATCH_SIZE`) is triggered once per `NMAP_FULL_INTERVAL_SECONDS` (default daily). Tune these env vars to balance coverage vs. runtime.
|
||||
- **Inventory Overlay**: Entries in `./inventory_targets.yml` are mounted into the OPNsense collector and merged by IP—offline/static hosts from that file (names, notes, expected ports) now appear in `network-hosts` with `source: inventory`.
|
||||
|
||||
## Data Model
|
||||
|
||||
- **`network-hosts`**: Current state of every known host.
|
||||
- **`network-events-YYYY.MM.DD`**: Immutable log of scans and discovery events.
|
||||
|
||||
## Usage
|
||||
|
||||
Query `network-hosts` for the latest view of your network:
|
||||
```json
|
||||
GET network-hosts/_search
|
||||
{
|
||||
"query": {
|
||||
"match_all": {}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Quick Frontend
|
||||
|
||||
A minimal Flask frontend is bundled in docker-compose (service `frontend`) and is exposed on port `5001` so it can be reached from other machines:
|
||||
|
||||
```bash
|
||||
docker-compose up -d frontend
|
||||
```
|
||||
|
||||
Then visit `http://<host-ip>:5001/` to see the merged view (inventory entries are marked with `source: inventory`). If you prefer to run it without Docker for debugging, follow the steps below:
|
||||
|
||||
```bash
|
||||
cd network-mcp
|
||||
python3 -m venv .venv && source .venv/bin/activate
|
||||
pip install -r frontend/requirements.txt
|
||||
python frontend/app.py
|
||||
```
|
||||
|
||||
### MCP / API Endpoints
|
||||
|
||||
The frontend doubles as a Model Context Protocol server. It exposes the manifest at `/.well-known/mcp.json` (or `/api/mcp`) and supports the standard JSON-RPC handshake (`initialize`, `tools/list`, `tools/call`) on the same URL. Agents can either use the RPC tools below or hit the underlying REST endpoints directly.
|
||||
|
||||
- MCP Resources are also available (`resources/list`, `resources/read`, `resources/templates/list`) for clients that prefer resource-style access to snapshots and queries.
|
||||
|
||||
- `GET /api/hosts` – merged host list (supports `limit`, `source`, and repeated `q` params to fuzzy search names, hostnames, IPs, or MACs in a single call).
|
||||
- `GET /api/hosts/<host_id>` – single host document with optional `include_events=true`.
|
||||
- `GET /api/events` – recent scan/discovery events (`limit`, `host_id`, `type`, `since` filters).
|
||||
- `GET /api/hosts/<host_id>/events` – scoped events for a host.
|
||||
- `GET /api/map` – high-level “network map” grouping hosts by detected /24 (IPv4) or /64 (IPv6).
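For example, a plain REST lookup from Python could look like this sketch (the host is a placeholder for wherever the frontend container is published):

```python
import requests

resp = requests.get(
    "http://localhost:5001/api/hosts",  # placeholder host
    params={"q": "seele", "limit": 25},
    timeout=10,
)
resp.raise_for_status()
print(resp.json())
```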
|
||||
|
||||
RPC tool names (mirrored in the manifest) are:
|
||||
|
||||
- `list_hosts` – accepts `{limit, source, terms}` and returns the merged host list.
|
||||
- `network_map` – optional `{limit}` for building /24-/64 summaries.
|
||||
- `get_host` – requires `{host_id}` plus optional `include_events`, `events_limit`.
|
||||
- `list_events` – `{limit, host_id, type, since}`.
|
||||
- `host_events` – requires `{host_id}` plus optional `limit`, `type`, `since`.
|
||||
|
||||
Resource URI examples:
|
||||
|
||||
- `network://hosts?q=seele&limit=50`
|
||||
- `network://host/mac:dc:a6:32:67:55:dc?include_events=true&events_limit=50`
|
||||
- `network://events?type=discovery&limit=100`
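A `resources/read` call for one of these URIs is a thin JSON-RPC wrapper; a sketch in Python (host placeholder again):

```python
import requests

body = {
    "jsonrpc": "2.0",
    "id": 2,
    "method": "resources/read",
    "params": {"uri": "network://hosts?q=seele&limit=50"},
}
print(requests.post("http://localhost:5001/api/mcp", json=body, timeout=10).json())
```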
|
||||
|
||||
All RPC and REST calls share the Elasticsearch credentials from `.env`, so an agent only needs HTTP access to port `5001` to query hosts, notes, and event timelines. Registering the MCP with Codex looks like:
|
||||
|
||||
```bash
|
||||
codex mcp install network-mcp http://<host>:5001/.well-known/mcp.json
|
||||
```
|
||||
0
stacks/network-mcp/collectors/common/__init__.py
Normal file
0
stacks/network-mcp/collectors/common/__init__.py
Normal file
55
stacks/network-mcp/collectors/common/es_auth.py
Normal file
55
stacks/network-mcp/collectors/common/es_auth.py
Normal file
@ -0,0 +1,55 @@
|
||||
import base64
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
def _clean(value: Optional[str]) -> str:
|
||||
"""
|
||||
Normalize values coming from env files where quotes might be preserved.
|
||||
"""
|
||||
if not value:
|
||||
return ""
|
||||
return value.strip().strip('"').strip()
|
||||
|
||||
|
||||
def resolve_api_key(api_id: Optional[str], api_key: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""
|
||||
Accept various API key formats and return (api_id, api_key).
|
||||
Supported formats:
|
||||
- Explicit ES_API_ID and ES_API_KEY values.
|
||||
- ES_API_KEY that already contains \"id:key\".
|
||||
- ES_API_KEY that is the base64 encoding of \"id:key\".
|
||||
"""
|
||||
cleaned_id = _clean(api_id)
|
||||
cleaned_key = _clean(api_key)
|
||||
|
||||
if cleaned_id and cleaned_key:
|
||||
return cleaned_id, cleaned_key
|
||||
|
||||
if not cleaned_key:
|
||||
return None, None
|
||||
|
||||
# Raw "id:key" format
|
||||
if ":" in cleaned_key:
|
||||
potential_id, potential_key = cleaned_key.split(":", 1)
|
||||
if potential_id and potential_key:
|
||||
return potential_id, potential_key
|
||||
|
||||
# Base64 encoded "id:key" format
|
||||
try:
|
||||
decoded = base64.b64decode(cleaned_key, validate=True).decode()
|
||||
if ":" in decoded:
|
||||
potential_id, potential_key = decoded.split(":", 1)
|
||||
if potential_id and potential_key:
|
||||
return potential_id, potential_key
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def build_api_key_header(api_id: str, api_key: str) -> str:
|
||||
"""
|
||||
Return the value for the Authorization header using ApiKey auth.
|
||||
"""
|
||||
token = base64.b64encode(f"{api_id}:{api_key}".encode()).decode()
|
||||
return f"ApiKey {token}"
|
||||
85
stacks/network-mcp/collectors/common/es_client.py
Normal file
85
stacks/network-mcp/collectors/common/es_client.py
Normal file
@ -0,0 +1,85 @@
|
||||
import os
|
||||
import time
|
||||
import urllib3
|
||||
from elasticsearch import Elasticsearch, helpers
|
||||
from .es_auth import resolve_api_key
|
||||
from .logging_config import setup_logging
|
||||
|
||||
# Suppress insecure request warnings if SSL verification is disabled
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
logger = setup_logging("es_client")
|
||||
|
||||
class ESClient:
|
||||
def __init__(self):
|
||||
self.url = os.getenv("ES_URL", "http://localhost:9200")
|
||||
env_api_id = os.getenv("ES_API_ID")
|
||||
env_api_key = os.getenv("ES_API_KEY")
|
||||
self.api_id, self.api_key = resolve_api_key(env_api_id, env_api_key)
|
||||
self.user = os.getenv("ES_USER", "elastic")
|
||||
self.password = os.getenv("ES_PASS", "changeme")
|
||||
self.verify_ssl = os.getenv("ES_VERIFY_SSL", "true").lower() == "true"
|
||||
|
||||
if self.api_id and self.api_key:
|
||||
# Use API key authentication
|
||||
self.client = Elasticsearch(
|
||||
self.url,
|
||||
api_key=(self.api_id, self.api_key),
|
||||
verify_certs=self.verify_ssl,
|
||||
ssl_show_warn=False
|
||||
)
|
||||
logger.info("Using Elasticsearch API key authentication.")
|
||||
else:
|
||||
# Fallback to basic auth
|
||||
self.client = Elasticsearch(
|
||||
self.url,
|
||||
basic_auth=(self.user, self.password),
|
||||
verify_certs=self.verify_ssl,
|
||||
ssl_show_warn=False
|
||||
)
|
||||
logger.info("Using Elasticsearch basic authentication.")
|
||||
|
||||
def check_connection(self):
|
||||
try:
|
||||
return self.client.info()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to Elasticsearch: {e}")
|
||||
raise
|
||||
|
||||
def bulk_index(self, actions):
|
||||
"""
|
||||
Bulk index a list of actions.
|
||||
actions: list of dicts compatible with elasticsearch.helpers.bulk
|
||||
"""
|
||||
if not actions:
|
||||
return 0, []
|
||||
|
||||
try:
|
||||
success, failed = helpers.bulk(self.client, actions, stats_only=False, raise_on_error=False)
|
||||
if failed:
|
||||
logger.warning(f"Bulk index had failures: {len(failed)} items failed.")
|
||||
for item in failed[:5]: # Log first 5 failures
|
||||
logger.warning(f"Failure sample: {item}")
|
||||
else:
|
||||
logger.info(f"Bulk index successful: {success} items.")
|
||||
return success, failed
|
||||
except Exception as e:
|
||||
logger.error(f"Bulk index exception: {e}")
|
||||
raise
|
||||
|
||||
def search_hosts(self, index="network-hosts", query=None, size=1000):
|
||||
"""
|
||||
Search for hosts in network-hosts index.
|
||||
"""
|
||||
if query is None:
|
||||
query = {"match_all": {}}
|
||||
|
||||
try:
|
||||
resp = self.client.search(index=index, query=query, size=size)
|
||||
return [hit["_source"] for hit in resp["hits"]["hits"]]
|
||||
except Exception as e:
|
||||
logger.error(f"Search failed: {e}")
|
||||
return []
|
||||
|
||||
def get_es_client():
|
||||
return ESClient()
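# Example (illustrative):
#   es = get_es_client()
#   es.check_connection()
#   hosts = es.search_hosts(query={"match_all": {}}, size=100)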
|
||||
21
stacks/network-mcp/collectors/common/logging_config.py
Normal file
21
stacks/network-mcp/collectors/common/logging_config.py
Normal file
@ -0,0 +1,21 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
def setup_logging(name: str = "collector") -> logging.Logger:
|
||||
"""
|
||||
Sets up a structured logger.
|
||||
"""
|
||||
logger = logging.getLogger(name)
|
||||
level = os.getenv("LOG_LEVEL", "INFO").upper()
|
||||
logger.setLevel(level)
|
||||
|
||||
if not logger.handlers:
|
||||
handler = logging.StreamHandler(sys.stdout)
|
||||
formatter = logging.Formatter(
|
||||
'%(asctime)s [%(levelname)s] %(name)s: %(message)s'
|
||||
)
|
||||
handler.setFormatter(formatter)
|
||||
logger.addHandler(handler)
|
||||
|
||||
return logger
|
||||
131
stacks/network-mcp/collectors/common/nmap_parser.py
Normal file
131
stacks/network-mcp/collectors/common/nmap_parser.py
Normal file
@ -0,0 +1,131 @@
|
||||
import subprocess
|
||||
import xml.etree.ElementTree as ET
|
||||
import shutil
|
||||
from typing import List, Dict, Optional
|
||||
from .logging_config import setup_logging
|
||||
|
||||
logger = setup_logging("nmap_parser")
|
||||
|
||||
def run_nmap_scan(ips: List[str], extra_args: Optional[List[str]] = None) -> List[Dict]:
|
||||
"""
|
||||
Run nmap on the given IPs and return a list of parsed host dicts.
|
||||
"""
|
||||
if not ips:
|
||||
return []
|
||||
|
||||
if not shutil.which("nmap"):
|
||||
logger.error("nmap binary not found in PATH")
|
||||
return []
|
||||
|
||||
# Default args: -oX - (XML to stdout)
|
||||
cmd = ["nmap", "-oX", "-"]
|
||||
if extra_args:
|
||||
cmd.extend(extra_args)
|
||||
|
||||
# Append IPs
|
||||
cmd.extend(ips)
|
||||
|
||||
logger.info(f"Running nmap command: {' '.join(cmd)}")
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
||||
xml_output = result.stdout
|
||||
return parse_nmap_xml(xml_output)
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"Nmap failed: {e.stderr}")
|
||||
return []
|
||||
except Exception as e:
|
||||
logger.error(f"Error running nmap: {e}")
|
||||
return []
|
||||
|
||||
def parse_nmap_xml(xml_string: str) -> List[Dict]:
|
||||
"""
|
||||
Parse Nmap XML output into our internal host/port structure.
|
||||
"""
|
||||
try:
|
||||
root = ET.fromstring(xml_string)
|
||||
except ET.ParseError as e:
|
||||
logger.error(f"Failed to parse Nmap XML: {e}")
|
||||
return []
|
||||
|
||||
hosts = []
|
||||
|
||||
for host_node in root.findall("host"):
|
||||
# Helper to find basic info
|
||||
ip = None
|
||||
mac = None
|
||||
hostname = None
|
||||
vendor = None
|
||||
|
||||
# Addresses
|
||||
for addr in host_node.findall("address"):
|
||||
addr_type = addr.get("addrtype")
|
||||
if addr_type == "ipv4":
|
||||
ip = addr.get("addr")
|
||||
elif addr_type == "mac":
|
||||
mac = addr.get("addr")
|
||||
vendor = addr.get("vendor")
|
||||
|
||||
# Hostnames
|
||||
hostnames_node = host_node.find("hostnames")
|
||||
if hostnames_node is not None:
|
||||
# Pick first for now
|
||||
hn = hostnames_node.find("hostname")
|
||||
if hn is not None:
|
||||
hostname = hn.get("name")
|
||||
|
||||
# Ports
|
||||
ports = []
|
||||
ports_node = host_node.find("ports")
|
||||
if ports_node is not None:
|
||||
for port_node in ports_node.findall("port"):
|
||||
state_node = port_node.find("state")
|
||||
state = state_node.get("state") if state_node is not None else "unknown"
|
||||
|
||||
# Only keep open ports; closed/filtered ports are skipped.
|
||||
if state != "open":
|
||||
continue
|
||||
|
||||
port_id = int(port_node.get("portid"))
|
||||
protocol = port_node.get("protocol")
|
||||
|
||||
service_node = port_node.find("service")
|
||||
service_name = service_node.get("name") if service_node is not None else "unknown"
|
||||
product = service_node.get("product") if service_node is not None else None
|
||||
version = service_node.get("version") if service_node is not None else None
|
||||
|
||||
service_def = {
|
||||
"name": service_name,
|
||||
}
|
||||
if product: service_def["product"] = product
|
||||
if version: service_def["version"] = version
|
||||
|
||||
ports.append({
|
||||
"port": port_id,
|
||||
"proto": protocol,
|
||||
"state": state,
|
||||
"service": service_def
|
||||
})
|
||||
|
||||
# OS detection (basic)
|
||||
os_match = None
|
||||
os_node = host_node.find("os")
|
||||
if os_node is not None:
|
||||
os_match_node = os_node.find("osmatch")
|
||||
if os_match_node is not None:
|
||||
os_match = {
|
||||
"name": os_match_node.get("name"),
|
||||
"accuracy": os_match_node.get("accuracy")
|
||||
}
|
||||
|
||||
host_data = {
|
||||
"ip": ip,
|
||||
"mac": mac, # might be None if scanning remote segment
|
||||
"hostname": hostname,
|
||||
"vendor": vendor,
|
||||
"ports": ports,
|
||||
"os_match": os_match
|
||||
}
|
||||
hosts.append(host_data)
|
||||
|
||||
return hosts
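# Example (illustrative):
#   discovered = run_nmap_scan(["192.168.1.0/24"], ["-sn", "-n"])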
|
||||
105
stacks/network-mcp/collectors/common/opnsense_client.py
Normal file
105
stacks/network-mcp/collectors/common/opnsense_client.py
Normal file
@ -0,0 +1,105 @@
|
||||
import os
|
||||
import requests
|
||||
import json
|
||||
import ipaddress
|
||||
from .logging_config import setup_logging
|
||||
|
||||
logger = setup_logging("opnsense_client")
|
||||
|
||||
class OPNsenseClient:
|
||||
def __init__(self):
|
||||
self.base_url = os.getenv("OPNSENSE_URL", "https://192.168.1.1").rstrip('/')
|
||||
self.api_key = os.getenv("OPNSENSE_API_KEY")
|
||||
self.api_secret = os.getenv("OPNSENSE_API_SECRET")
|
||||
self.verify_ssl = os.getenv("ES_VERIFY_SSL", "true").lower() == "true" # Reusing verify flag or add explicit OPNSENSE_VERIFY_SSL
|
||||
|
||||
if not self.api_key or not self.api_secret:
|
||||
logger.warning("OPNSENSE_API_KEY or OPNSENSE_API_SECRET not set. API calls will fail.")
|
||||
|
||||
def _get(self, endpoint, params=None):
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
try:
|
||||
response = requests.get(
|
||||
url,
|
||||
auth=(self.api_key, self.api_secret),
|
||||
verify=self.verify_ssl,
|
||||
params=params,
|
||||
timeout=10
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch {url}: {e}")
|
||||
return {}
|
||||
|
||||
def get_dhcp_leases_v4(self):
|
||||
# Endpoint: /api/dhcpv4/leases/searchLease
# OPNsense "search" endpoints typically accept a plain GET and return {"rows": [...], "total": ...}.
data = self._get("/api/dhcpv4/leases/searchLease")
return data.get("rows", [])
|
||||
|
||||
def get_arp_table(self):
|
||||
# Endpoint: /api/diagnostics/interface/getArp returns the ARP table.
# Depending on the OPNsense version this comes back as a bare list or as {"rows": [...]},
# so handle both shapes.
data = self._get("/api/diagnostics/interface/getArp")
if isinstance(data, list):
return data
return data.get("rows", [])
|
||||
|
||||
def get_dns_overrides(self):
|
||||
# Endpoint: /api/unbound/settings/searchHostOverride
|
||||
data = self._get("/api/unbound/settings/searchHostOverride")
|
||||
return data.get("rows", [])
|
||||
|
||||
def get_vlan_networks(self):
|
||||
"""
|
||||
Build a list of IPv4 networks (CIDRs) from the routing table, grouped by interface description.
|
||||
"""
|
||||
routes = self._get("/api/diagnostics/interface/getRoutes")
|
||||
networks = []
|
||||
if not isinstance(routes, list):
|
||||
return networks
|
||||
|
||||
seen = set()
|
||||
for route in routes:
|
||||
if route.get("proto") != "ipv4":
|
||||
continue
|
||||
destination = route.get("destination")
|
||||
if not destination or "/" not in destination or destination == "default":
|
||||
continue
|
||||
desc = route.get("intf_description")
|
||||
if not desc:
|
||||
continue
|
||||
try:
|
||||
network = ipaddress.ip_network(destination, strict=False)
|
||||
except ValueError:
|
||||
continue
|
||||
# Skip host routes (/32) which are usually static peers
|
||||
if network.prefixlen == 32:
|
||||
continue
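# Also skip very broad routes (prefix shorter than /16).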
|
||||
if network.prefixlen < 16:
|
||||
continue
|
||||
|
||||
key = (desc, str(network))
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
networks.append({
|
||||
"key": desc,
|
||||
"name": desc,
|
||||
"cidr": str(network)
|
||||
})
|
||||
return networks
|
||||
|
||||
def get_opnsense_client():
|
||||
return OPNsenseClient()
|
||||
14
stacks/network-mcp/collectors/nmap_collector/Dockerfile
Normal file
14
stacks/network-mcp/collectors/nmap_collector/Dockerfile
Normal file
@ -0,0 +1,14 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
RUN apt-get update && apt-get install -y nmap && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY collectors/common /app/collectors/common
|
||||
COPY collectors/nmap_collector /app/collectors/nmap_collector
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
|
||||
RUN pip install requests elasticsearch==8.15.1
|
||||
|
||||
CMD ["python", "collectors/nmap_collector/main.py"]
|
||||
378
stacks/network-mcp/collectors/nmap_collector/main.py
Normal file
378
stacks/network-mcp/collectors/nmap_collector/main.py
Normal file
@ -0,0 +1,378 @@
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
import sys
|
||||
import json
|
||||
import shlex
|
||||
|
||||
# Ensure we can import from common
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
|
||||
from collectors.common.es_client import get_es_client
|
||||
from collectors.common.opnsense_client import get_opnsense_client
|
||||
from collectors.common.nmap_parser import run_nmap_scan
|
||||
from collectors.common.logging_config import setup_logging
|
||||
|
||||
logger = setup_logging("nmap_collector")
|
||||
|
||||
def get_now_iso():
|
||||
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
||||
|
||||
def chunk_list(lst, n):
|
||||
for i in range(0, len(lst), n):
|
||||
yield lst[i:i + n]
|
||||
|
||||
def should_scan_vlan(vlan, allowlist):
|
||||
if not allowlist:
|
||||
return True
|
||||
name = (vlan.get("name") or "").strip()
|
||||
key = (vlan.get("key") or "").strip()
|
||||
return name in allowlist or key in allowlist
|
||||
|
||||
def build_discovery_update_action(host_id, mac, ip, hostname, vendor, ts_iso):
|
||||
mac_norm = mac.lower() if mac else None
|
||||
upsert_host = {
|
||||
"host": {
|
||||
"id": host_id,
|
||||
"macs": [mac_norm] if mac_norm else [],
|
||||
"ips": [ip] if ip else [],
|
||||
"name": hostname,
|
||||
"hostnames": [hostname] if hostname else [],
|
||||
"vendor": vendor,
|
||||
"sources": ["nmap-discovery"],
|
||||
"last_seen": ts_iso,
|
||||
"first_seen": ts_iso
|
||||
}
|
||||
}
|
||||
|
||||
script_source = """
|
||||
if (ctx._source.host == null) { ctx._source.host = [:]; }
|
||||
if (ctx._source.host.macs == null) { ctx._source.host.macs = []; }
|
||||
if (ctx._source.host.ips == null) { ctx._source.host.ips = []; }
|
||||
if (ctx._source.host.hostnames == null) { ctx._source.host.hostnames = []; }
|
||||
if (ctx._source.host.sources == null) { ctx._source.host.sources = []; }
|
||||
|
||||
if (params.mac != null && !ctx._source.host.macs.contains(params.mac)) {
|
||||
ctx._source.host.macs.add(params.mac);
|
||||
}
|
||||
if (params.ip != null && !ctx._source.host.ips.contains(params.ip)) {
|
||||
ctx._source.host.ips.add(params.ip);
|
||||
}
|
||||
if (params.hostname != null && !ctx._source.host.hostnames.contains(params.hostname)) {
|
||||
ctx._source.host.hostnames.add(params.hostname);
|
||||
}
|
||||
if (!ctx._source.host.sources.contains(params.source_tag)) {
|
||||
ctx._source.host.sources.add(params.source_tag);
|
||||
}
|
||||
ctx._source.host.last_seen = params.ts;
|
||||
if (ctx._source.host.name == null && params.hostname != null) {
|
||||
ctx._source.host.name = params.hostname;
|
||||
}
|
||||
if (params.vendor != null && (ctx._source.host.vendor == null || ctx._source.host.vendor == \"\")) {
|
||||
ctx._source.host.vendor = params.vendor;
|
||||
}
|
||||
"""
|
||||
|
||||
return {
|
||||
"_index": "network-hosts",
|
||||
"_op_type": "update",
|
||||
"_id": host_id,
|
||||
"script": {
|
||||
"source": script_source,
|
||||
"lang": "painless",
|
||||
"params": {
|
||||
"mac": mac_norm,
|
||||
"ip": ip,
|
||||
"hostname": hostname,
|
||||
"vendor": vendor,
|
||||
"ts": ts_iso,
|
||||
"source_tag": "nmap-discovery"
|
||||
}
|
||||
},
|
||||
"upsert": upsert_host
|
||||
}
|
||||
|
||||
def run_vlan_discovery(es, opnsense_client, discovery_args, vlan_filter):
|
||||
networks = opnsense_client.get_vlan_networks()
|
||||
if not networks:
|
||||
logger.info("VLAN discovery skipped: OPNsense returned no interfaces.")
|
||||
return
|
||||
|
||||
scoped_networks = [n for n in networks if should_scan_vlan(n, vlan_filter)]
|
||||
if not scoped_networks:
|
||||
logger.info("VLAN discovery skipped: no interfaces matched NMAP_DISCOVERY_VLANS.")
|
||||
return
|
||||
|
||||
actions = []
|
||||
today = datetime.datetime.now().strftime("%Y.%m.%d")
|
||||
event_index = f"network-events-{today}"
|
||||
|
||||
for vlan in scoped_networks:
|
||||
cidr = vlan.get("cidr")
|
||||
if not cidr:
|
||||
continue
|
||||
logger.info(f"VLAN discovery scan for {vlan.get('name')} ({cidr})")
|
||||
scan_ts = get_now_iso()
|
||||
scan_id = f"nmap_discovery_{vlan.get('name')}_{scan_ts}"
|
||||
results = run_nmap_scan([cidr], discovery_args)
|
||||
|
||||
for res in results:
|
||||
ip = res.get("ip")
|
||||
if not ip:
|
||||
continue
|
||||
|
||||
mac = res.get("mac")
|
||||
hostname = res.get("hostname")
|
||||
vendor = res.get("vendor")
|
||||
host_id = f"mac:{mac.lower()}" if mac else None
|
||||
|
||||
event_doc = {
|
||||
"@timestamp": scan_ts,
|
||||
"source": "nmap-discovery",
|
||||
"scan_id": scan_id,
|
||||
"vlan": vlan.get("name"),
|
||||
"cidr": cidr,
|
||||
"host": {
|
||||
"id": host_id,
|
||||
"ip": ip,
|
||||
"mac": mac,
|
||||
"hostname": hostname,
|
||||
"vendor": vendor
|
||||
}
|
||||
}
|
||||
actions.append({
|
||||
"_index": event_index,
|
||||
"_op_type": "index",
|
||||
"_source": event_doc
|
||||
})
|
||||
|
||||
if host_id:
|
||||
actions.append(
|
||||
build_discovery_update_action(host_id, mac, ip, hostname, vendor, scan_ts)
|
||||
)
|
||||
|
||||
if actions:
|
||||
logger.info(f"VLAN discovery produced {len(actions)} Elasticsearch actions.")
|
||||
es.bulk_index(actions)
|
||||
else:
|
||||
logger.info("VLAN discovery finished with no hosts discovered.")
|
||||
|
||||
def main():
|
||||
es = get_es_client()
|
||||
opnsense_client = get_opnsense_client()
|
||||
|
||||
interval = int(os.getenv("NMAP_INTERVAL_SECONDS", "300"))
|
||||
full_batch_size = int(os.getenv("NMAP_BATCH_SIZE", "10"))
|
||||
quick_batch_size = int(os.getenv("NMAP_QUICK_BATCH_SIZE", "30"))
|
||||
port_range = os.getenv("NMAP_PORT_RANGE", "1-1024") # Full scan range
|
||||
discovery_enabled = os.getenv("NMAP_DISCOVERY_ENABLED", "false").lower() == "true"
|
||||
discovery_interval = int(os.getenv("NMAP_DISCOVERY_INTERVAL_SECONDS", "3600"))
|
||||
discovery_vlan_filter = [v.strip() for v in os.getenv("NMAP_DISCOVERY_VLANS", "").split(",") if v.strip()]
|
||||
discovery_extra_args = os.getenv("NMAP_DISCOVERY_EXTRA_ARGS", "-sn -n").strip()
|
||||
if discovery_extra_args:
|
||||
discovery_extra_args = shlex.split(discovery_extra_args)
|
||||
else:
|
||||
discovery_extra_args = ["-sn", "-n"]
|
||||
discovery_last_run = time.time() - discovery_interval if discovery_enabled else 0.0
|
||||
full_interval = int(os.getenv("NMAP_FULL_INTERVAL_SECONDS", "86400"))
|
||||
quick_extra_str = os.getenv("NMAP_QUICK_EXTRA_ARGS", "-sS --top-ports 100 -T4 --open -Pn").strip()
|
||||
quick_extra_args = shlex.split(quick_extra_str) if quick_extra_str else ["-sS", "--top-ports", "100", "-T4", "--open", "-Pn"]
|
||||
last_full_scan = time.time()
|
||||
|
||||
# Construct base nmap args
|
||||
# -sV enables service/version detection and --open limits output to open ports.
# (Adding -O for OS detection would also need root or raw-socket capabilities; the container normally runs as root.)
|
||||
extra_args = ["-sV", "--open"]
|
||||
|
||||
# Check if port_range looks like a range or specific ports
|
||||
if port_range:
|
||||
extra_args.extend(["-p", port_range])
|
||||
|
||||
# Add user provided extra args
|
||||
user_args = os.getenv("NMAP_EXTRA_ARGS", "")
|
||||
if user_args:
|
||||
extra_args.extend(user_args.split())
|
||||
|
||||
logger.info("Starting Nmap collector loop...")
|
||||
|
||||
while True:
|
||||
try:
|
||||
start_time = time.time()
|
||||
ts_iso = get_now_iso()
|
||||
now = time.time()
|
||||
use_full_scan = (now - last_full_scan) >= full_interval
|
||||
scan_type = "full" if use_full_scan else "quick"
|
||||
scan_id = f"nmap_{scan_type}_{ts_iso}"
|
||||
current_batch_size = full_batch_size if use_full_scan else quick_batch_size
|
||||
scan_args = extra_args if use_full_scan else quick_extra_args
|
||||
|
||||
if use_full_scan:
|
||||
last_full_scan = now
|
||||
logger.info("Running scheduled full service scan.")
|
||||
else:
|
||||
logger.info("Running quick common-port sweep.")
|
||||
|
||||
if discovery_enabled and (time.time() - discovery_last_run) >= discovery_interval:
|
||||
run_vlan_discovery(es, opnsense_client, discovery_extra_args, discovery_vlan_filter)
|
||||
discovery_last_run = time.time()
|
||||
|
||||
# 1. Get targets from ES
|
||||
# We only want hosts that have an IP.
|
||||
hosts = es.search_hosts(index="network-hosts", size=1000)
|
||||
|
||||
# Extract IPs to scan. Map IP -> Host ID to correlate back
|
||||
targets = []
|
||||
ip_to_host_id = {}
|
||||
|
||||
for h in hosts:
|
||||
# h is {"host": {...}, "ports": [...]}
|
||||
host_info = h.get("host", {})
|
||||
hid = host_info.get("id")
|
||||
ips = host_info.get("ips", [])
|
||||
|
||||
if not hid or not ips:
|
||||
continue
|
||||
|
||||
# Pick the "best" IP? Or scan all?
|
||||
# Scaning all might be duplicate work if they point to same box.
|
||||
# Let's pick the first one for now.
|
||||
target_ip = ips[0]
|
||||
targets.append(target_ip)
|
||||
ip_to_host_id[target_ip] = hid
|
||||
|
||||
logger.info(f"Found {len(targets)} targets to scan ({scan_type}).")
|
||||
|
||||
total_processed = 0
|
||||
logger.info(f"Scanning {scan_type} run with {len(targets)} targets.")
|
||||
scan_results = run_nmap_scan(targets, scan_args)
|
||||
actions = []
|
||||
today = datetime.datetime.now().strftime("%Y.%m.%d")
|
||||
event_index = f"network-events-{today}"
|
||||
|
||||
for res in scan_results:
|
||||
ip = res.get("ip")
|
||||
if not ip or ip not in ip_to_host_id:
|
||||
continue
|
||||
|
||||
hid = ip_to_host_id[ip]
|
||||
total_processed += 1
|
||||
|
||||
for p in res["ports"]:
|
||||
p["last_seen"] = ts_iso
|
||||
p["last_scan_id"] = scan_id
|
||||
|
||||
event_doc = {
|
||||
"@timestamp": ts_iso,
|
||||
"source": "nmap",
|
||||
"scan_id": scan_id,
|
||||
"host": {"id": hid, "ip": ip},
|
||||
"ports": res["ports"],
|
||||
"os": res.get("os_match")
|
||||
}
|
||||
actions.append({
|
||||
"_index": event_index,
|
||||
"_op_type": "index",
|
||||
"_source": event_doc
|
||||
})
|
||||
|
||||
script_source = """
|
||||
if (ctx._source.host == null) { ctx._source.host = [:]; }
|
||||
if (ctx._source.host.sources == null) { ctx._source.host.sources = []; }
|
||||
if (!ctx._source.host.sources.contains('nmap')) {
|
||||
ctx._source.host.sources.add('nmap');
|
||||
}
|
||||
ctx._source.host.last_seen = params.ts;
|
||||
|
||||
if (params.os != null) {
|
||||
ctx._source.host.os = params.os;
|
||||
}
|
||||
|
||||
if (ctx._source.ports == null) {
|
||||
ctx._source.ports = [];
|
||||
}
|
||||
|
||||
for (new_p in params.new_ports) {
|
||||
boolean found = false;
|
||||
for (old_p in ctx._source.ports) {
|
||||
if (old_p.port == new_p.port && old_p.proto == new_p.proto) {
|
||||
old_p.last_seen = params.ts;
|
||||
old_p.state = new_p.state;
|
||||
old_p.service = new_p.service;
|
||||
old_p.last_scan_id = params.scan_id;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
new_p.first_seen = params.ts;
|
||||
ctx._source.ports.add(new_p);
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
actions.append({
|
||||
"_index": "network-hosts",
|
||||
"_op_type": "update",
|
||||
"_id": hid,
|
||||
"script": {
|
||||
"source": script_source,
|
||||
"lang": "painless",
|
||||
"params": {
|
||||
"ts": ts_iso,
|
||||
"os": res.get("os_match"),
|
||||
"new_ports": res["ports"],
|
||||
"scan_id": scan_id
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
for p in res["ports"]:
|
||||
svc_id = f"{hid}:{p['proto']}:{p['port']}"
|
||||
svc_script = """
|
||||
ctx._source.last_seen = params.ts;
|
||||
ctx._source.state = params.state;
|
||||
ctx._source.service = params.service;
|
||||
if (ctx._source.first_seen == null) {
|
||||
ctx._source.first_seen = params.ts;
|
||||
}
|
||||
"""
|
||||
actions.append({
|
||||
"_index": "network-services",
|
||||
"_op_type": "update",
|
||||
"_id": svc_id,
|
||||
"script": {
|
||||
"source": svc_script,
|
||||
"lang": "painless",
|
||||
"params": {
|
||||
"ts": ts_iso,
|
||||
"state": p["state"],
|
||||
"service": p["service"]
|
||||
}
|
||||
},
|
||||
"upsert": {
|
||||
"host_id": hid,
|
||||
"host_ip": ip,
|
||||
"port": p["port"],
|
||||
"proto": p["proto"],
|
||||
"service": p["service"],
|
||||
"state": p["state"],
|
||||
"last_seen": ts_iso,
|
||||
"first_seen": ts_iso,
|
||||
"sources": ["nmap"]
|
||||
}
|
||||
})
|
||||
|
||||
if actions:
|
||||
es.bulk_index(actions)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
sleep_time = max(0, interval - elapsed)
|
||||
logger.info(f"Nmap {scan_type} cycle done. Scanned {total_processed} hosts in {elapsed:.2f}s. Sleeping {sleep_time:.2f}s")
|
||||
time.sleep(sleep_time)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Nmap loop: {e}")
|
||||
time.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
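Note: both collectors hand their bulk actions to es.bulk_index(...) from collectors/common/es_client.py, which is not part of this diff. As rough orientation only, a minimal wrapper along these lines would match the action format used above (index actions carrying _source, update actions carrying script/doc/upsert); the class name, constructor arguments, and return shape are assumptions, not the actual shared module.

import os
from elasticsearch import Elasticsearch, helpers

class ESClient:
    """Sketch only -- the real helper lives in collectors/common/es_client.py (not in this diff)."""

    def __init__(self, url: str, api_key: str | None = None):
        # verify_certs=False mirrors the self-hosted setup implied by ES_VERIFY_SSL elsewhere.
        self.client = Elasticsearch(url, api_key=api_key, verify_certs=False)

    def bulk_index(self, actions):
        # The actions built above already follow the helpers.bulk() convention:
        # _index/_op_type/_id plus _source for index ops, or script/doc/upsert for update ops.
        return helpers.bulk(self.client, actions, raise_on_error=False)

def get_es_client() -> "ESClient":
    return ESClient(os.getenv("ES_URL", "http://localhost:9200"), os.getenv("ES_API_KEY"))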
14
stacks/network-mcp/collectors/opnsense_collector/Dockerfile
Normal file
14
stacks/network-mcp/collectors/opnsense_collector/Dockerfile
Normal file
@ -0,0 +1,14 @@
FROM python:3.11-slim

WORKDIR /app

COPY collectors/common /app/collectors/common
COPY collectors/opnsense_collector /app/collectors/opnsense_collector

# We need to make sure the module path works.
# main.py does sys.path.append, but the cleanest approach is to set PYTHONPATH.
ENV PYTHONPATH=/app

RUN pip install requests elasticsearch==8.15.1 pyyaml

CMD ["python", "collectors/opnsense_collector/main.py"]
261
stacks/network-mcp/collectors/opnsense_collector/main.py
Normal file
261
stacks/network-mcp/collectors/opnsense_collector/main.py
Normal file
@ -0,0 +1,261 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import datetime
|
||||
import sys
|
||||
import yaml
|
||||
|
||||
# Ensure we can import from common
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
|
||||
from collectors.common.es_client import get_es_client
|
||||
from collectors.common.opnsense_client import get_opnsense_client
|
||||
from collectors.common.logging_config import setup_logging
|
||||
|
||||
logger = setup_logging("opnsense_collector")
|
||||
|
||||
def load_static_metadata(path="/app/static/host_metadata.json"):
|
||||
if not os.path.exists(path):
|
||||
logger.info(f"No static metadata found at {path}")
|
||||
return {}
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load static metadata: {e}")
|
||||
return {}
|
||||
|
||||
def load_inventory_targets(path=None):
|
||||
path = path or os.getenv("INVENTORY_FILE", "/app/inventory_targets.yml")
|
||||
if not os.path.exists(path):
|
||||
logger.info(f"No inventory targets found at {path}")
|
||||
return []
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
return data.get("inventory_targets", [])
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load inventory targets: {e}")
|
||||
return []
|
||||
|
||||
def normalize_mac(mac):
|
||||
if not mac:
|
||||
return None
|
||||
return mac.lower().replace("-", ":")
|
||||
|
||||
def get_now_iso():
|
||||
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
||||
|
||||
def main():
|
||||
es = get_es_client()
|
||||
opn = get_opnsense_client()
|
||||
|
||||
interval = int(os.getenv("COLLECTOR_INTERVAL_SECONDS", "60"))
|
||||
|
||||
logger.info("Starting OPNsense collector loop...")
|
||||
|
||||
while True:
|
||||
try:
|
||||
start_time = time.time()
|
||||
ts_iso = get_now_iso()
|
||||
|
||||
# 1. Fetch Data
|
||||
dhcp_v4 = opn.get_dhcp_leases_v4()
|
||||
arp_table = opn.get_arp_table()
|
||||
dns_overrides = opn.get_dns_overrides()
|
||||
|
||||
static_meta = load_static_metadata()
|
||||
inventory_entries = load_inventory_targets()
|
||||
|
||||
# 2. Process Data -> hosts map
|
||||
# Key: identifier (mac:xx... or ip:xxx)
|
||||
hosts_map = {}
|
||||
|
||||
def create_host_entry(identifier):
|
||||
return {
|
||||
"id": identifier,
|
||||
"macs": set(),
|
||||
"ips": set(),
|
||||
"hostnames": set(),
|
||||
"sources": set(),
|
||||
"preferred_name": None,
|
||||
"inventory_notes": None,
|
||||
"inventory_ports": None
|
||||
}
|
||||
|
||||
def get_or_create_host(mac):
|
||||
norm_mac = normalize_mac(mac)
|
||||
if not norm_mac:
|
||||
return None
|
||||
identifier = f"mac:{norm_mac}"
|
||||
host = hosts_map.setdefault(identifier, create_host_entry(identifier))
|
||||
host["macs"].add(norm_mac)
|
||||
return host
|
||||
|
||||
def get_or_create_host_by_ip(ip):
|
||||
if not ip:
|
||||
return None
|
||||
identifier = f"ip:{ip}"
|
||||
host = hosts_map.setdefault(identifier, create_host_entry(identifier))
|
||||
host["ips"].add(ip)
|
||||
return host
|
||||
|
||||
# Process DHCP
|
||||
for lease in dhcp_v4:
|
||||
# Structure depends on OPNsense version, but usually has 'mac', 'address', 'hostname'
|
||||
mac = lease.get('mac') or lease.get('hw_address')
|
||||
ip = lease.get('address') or lease.get('ip')
|
||||
hostname = lease.get('hostname')
|
||||
|
||||
host = get_or_create_host(mac)
|
||||
if host:
|
||||
if ip: host["ips"].add(ip)
|
||||
if hostname: host["hostnames"].add(hostname)
|
||||
host["sources"].add("opnsense-dhcp")
|
||||
|
||||
# Process ARP
|
||||
for entry in arp_table:
|
||||
# Structure: 'mac', 'ip', 'hostname' (sometimes)
|
||||
mac = entry.get('mac')
|
||||
ip = entry.get('ip')
|
||||
hostname = entry.get('hostname')
|
||||
|
||||
host = get_or_create_host(mac)
|
||||
if host:
|
||||
if ip: host["ips"].add(ip)
|
||||
if hostname and hostname != "?": host["hostnames"].add(hostname)
|
||||
host["sources"].add("opnsense-arp")
|
||||
|
||||
# Process DNS Overrides (mapped by IP when possible)
|
||||
ip_to_identifier = {}
|
||||
for identifier, h in hosts_map.items():
|
||||
for ip in h["ips"]:
|
||||
ip_to_identifier[ip] = identifier
|
||||
|
||||
for override in dns_overrides:
|
||||
ip = override.get('ip')
|
||||
domain = override.get('domain')
|
||||
hostname = override.get('hostname')
|
||||
full_fqdn = f"{hostname}.{domain}" if hostname and domain else hostname
|
||||
|
||||
if ip and ip in ip_to_identifier:
|
||||
identifier = ip_to_identifier[ip]
|
||||
if full_fqdn:
|
||||
hosts_map[identifier]["hostnames"].add(full_fqdn)
|
||||
hosts_map[identifier]["sources"].add("opnsense-dns")
|
||||
|
||||
# Process inventory targets (by IP)
|
||||
for entry in inventory_entries:
|
||||
ip = entry.get("ip")
|
||||
if not ip:
|
||||
continue
|
||||
identifier = ip_to_identifier.get(ip)
|
||||
if identifier:
|
||||
host = hosts_map.get(identifier)
|
||||
if host is None:
|
||||
host = get_or_create_host_by_ip(ip)
|
||||
ip_to_identifier[ip] = host["id"]
|
||||
else:
|
||||
host = get_or_create_host_by_ip(ip)
|
||||
if host:
|
||||
ip_to_identifier[ip] = host["id"]
|
||||
if not host:
|
||||
continue
|
||||
hostname = entry.get("hostname")
|
||||
name = entry.get("name")
|
||||
if hostname:
|
||||
host["hostnames"].add(hostname)
|
||||
if name:
|
||||
host["hostnames"].add(name)
|
||||
host["preferred_name"] = name
|
||||
host["sources"].add("inventory")
|
||||
notes = entry.get("notes")
|
||||
if notes:
|
||||
host["inventory_notes"] = notes
|
||||
ports = entry.get("ports")
|
||||
if ports:
|
||||
host["inventory_ports"] = ports
|
||||
|
||||
# 3. Build Actions
|
||||
actions = []
|
||||
today = datetime.datetime.now().strftime("%Y.%m.%d")
|
||||
event_index = f"network-events-{today}"
|
||||
|
||||
for _, h_data in hosts_map.items():
|
||||
name = h_data.get("preferred_name")
|
||||
if not name and h_data["hostnames"]:
|
||||
name = next(iter(h_data["hostnames"]))
|
||||
|
||||
final_host = {
|
||||
"host": {
|
||||
"id": h_data["id"],
|
||||
"macs": list(h_data["macs"]),
|
||||
"ips": list(h_data["ips"]),
|
||||
"name": name,
|
||||
"hostnames": list(h_data["hostnames"]),
|
||||
"last_seen": ts_iso,
|
||||
"sources": list(h_data["sources"])
|
||||
}
|
||||
}
|
||||
|
||||
if h_data.get("inventory_notes"):
|
||||
final_host["host"]["notes"] = h_data["inventory_notes"]
|
||||
if h_data.get("inventory_ports"):
|
||||
final_host["host"]["expected_ports"] = h_data["inventory_ports"]
|
||||
# Merge Static Metadata
|
||||
if h_data["id"] in static_meta:
|
||||
meta = static_meta[h_data["id"]]
|
||||
# Merge fields
|
||||
for k, v in meta.items():
|
||||
final_host["host"][k] = v
|
||||
|
||||
# 3a. Event Document
|
||||
event_doc = {
|
||||
"@timestamp": ts_iso,
|
||||
"source": "opnsense",
|
||||
"scan_id": f"opnsense_{ts_iso}",
|
||||
"host": final_host["host"]
|
||||
}
|
||||
actions.append({
|
||||
"_index": event_index,
|
||||
"_op_type": "index",
|
||||
"_source": event_doc
|
||||
})
|
||||
|
||||
# 3b. Host Upsert
|
||||
# We use a script upsert or doc_as_upsert.
|
||||
# doc_as_upsert is simpler but replaces lists.
|
||||
# Ideally we would merge the lists (ips, macs), but for now replacing them with the latest 'truth' from OPNsense + static metadata is acceptable.
|
||||
# However, we don't want to lose 'ports' info from Nmap.
|
||||
# So we must NOT overwrite 'ports'.
|
||||
|
||||
host_update_doc = {
|
||||
"host": final_host["host"]
|
||||
}
|
||||
|
||||
actions.append({
|
||||
"_index": "network-hosts",
|
||||
"_op_type": "update",
|
||||
"_id": h_data["id"],
|
||||
"doc": host_update_doc,
|
||||
"doc_as_upsert": True
|
||||
})
|
||||
|
||||
# 4. Send to ES
|
||||
if actions:
|
||||
logger.info(f"Sending {len(actions)} actions to Elasticsearch...")
|
||||
success, failed = es.bulk_index(actions)
|
||||
else:
|
||||
logger.info("No hosts found or no actions generated.")
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
sleep_time = max(0, interval - elapsed)
|
||||
logger.info(f"Cycle done in {elapsed:.2f}s. Sleeping for {sleep_time:.2f}s")
|
||||
time.sleep(sleep_time)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in main loop: {e}")
|
||||
time.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
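For reference, the update action built above produces host documents shaped like the following (values borrowed from the test fixtures further down; expected_ports is hypothetical since no inventory file is included in this diff). The nmap collector maintains the separate top-level ports array on the same document, which is why this doc_as_upsert deliberately leaves ports untouched.

example_action = {
    "_index": "network-hosts",
    "_op_type": "update",
    "_id": "mac:aa:bb:cc:dd:ee:ff",
    "doc": {
        "host": {
            "id": "mac:aa:bb:cc:dd:ee:ff",
            "macs": ["aa:bb:cc:dd:ee:ff"],
            "ips": ["192.168.5.34"],
            "name": "core",
            "hostnames": ["core.localdomain"],
            "last_seen": "2025-12-14T16:27:15+00:00",
            "sources": ["inventory", "opnsense-arp", "opnsense-dhcp"],
            "notes": "Production Docker host",
            "expected_ports": ["22", "443"],  # hypothetical inventory values
        }
    },
    "doc_as_upsert": True,
}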
43
stacks/network-mcp/docker-compose.yml
Normal file
43
stacks/network-mcp/docker-compose.yml
Normal file
@ -0,0 +1,43 @@
version: "3.9"

services:
  frontend:
    build:
      context: .
      dockerfile: frontend/Dockerfile
    restart: always
    env_file:
      - .env
    environment:
      FRONTEND_PORT: "5001"
    ports:
      - "5001:5001"

  opnsense_collector:
    build:
      context: .
      dockerfile: collectors/opnsense_collector/Dockerfile
    restart: always
    env_file:
      - .env
    volumes:
      - ./static:/app/static
      - ./inventory_targets.yml:/app/inventory_targets.yml:ro
    environment:
      COLLECTOR_INTERVAL_SECONDS: "60"
      INVENTORY_FILE: "/app/inventory_targets.yml"

  nmap_collector:
    build:
      context: .
      dockerfile: collectors/nmap_collector/Dockerfile
    restart: always
    cap_add:
      - NET_RAW
      - NET_ADMIN
    env_file:
      - .env
    environment:
      NMAP_INTERVAL_SECONDS: "300"
      NMAP_PORT_RANGE: "1-1024"
      NMAP_BATCH_SIZE: "10"
15
stacks/network-mcp/frontend/Dockerfile
Normal file
15
stacks/network-mcp/frontend/Dockerfile
Normal file
@ -0,0 +1,15 @@
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

COPY frontend/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt

COPY frontend/ /app/

EXPOSE 5001

CMD ["gunicorn", "--bind", "0.0.0.0:5001", "app:app"]
2
stacks/network-mcp/frontend/__init__.py
Normal file
2
stacks/network-mcp/frontend/__init__.py
Normal file
@ -0,0 +1,2 @@
"""Network MCP frontend package (used for local testing/imports)."""

934
stacks/network-mcp/frontend/app.py
Normal file
934
stacks/network-mcp/frontend/app.py
Normal file
@ -0,0 +1,934 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import parse_qs, unquote, urlparse
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
from flask import Flask, abort, jsonify, render_template, request
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
env_path = BASE_DIR / ".env"
|
||||
if env_path.exists():
|
||||
load_dotenv(env_path)
|
||||
|
||||
ES_URL = os.getenv("ES_URL", "http://localhost:9200").rstrip("/")
|
||||
ES_VERIFY_SSL = os.getenv("ES_VERIFY_SSL", "false").lower() == "true"
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
HOST_SEARCH_LIMIT = int(os.getenv("FRONTEND_HOST_LIMIT", "1000"))
|
||||
DEFAULT_EVENT_LIMIT = int(os.getenv("FRONTEND_EVENT_LIMIT", "200"))
|
||||
SERVER_VERSION = os.getenv("NETWORK_MCP_VERSION", "0.1.0")
|
||||
|
||||
REST_TOOLS = [
|
||||
{
|
||||
"name": "list_hosts",
|
||||
"description": "Return the merged view of every known device on the network (searchable by hostname, IP, or MAC).",
|
||||
"method": "GET",
|
||||
"path": "/api/hosts",
|
||||
},
|
||||
{
|
||||
"name": "network_map",
|
||||
"description": "Summarize hosts grouped by detected /24 (IPv4) or /64 (IPv6) networks.",
|
||||
"method": "GET",
|
||||
"path": "/api/map",
|
||||
},
|
||||
{
|
||||
"name": "get_host",
|
||||
"description": "Fetch a single host document by ID (e.g. ip:192.168.5.10).",
|
||||
"method": "GET",
|
||||
"path": "/api/hosts/{host_id}",
|
||||
},
|
||||
{
|
||||
"name": "list_events",
|
||||
"description": "List recent scan/discovery events with filters for host, type, or time range.",
|
||||
"method": "GET",
|
||||
"path": "/api/events",
|
||||
},
|
||||
{
|
||||
"name": "host_events",
|
||||
"description": "List the recent events associated with a specific host.",
|
||||
"method": "GET",
|
||||
"path": "/api/hosts/{host_id}/events",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def tool_schema(description: str, properties: Dict[str, Any], required: Optional[List[str]] = None, title: Optional[str] = None):
|
||||
schema: Dict[str, Any] = {
|
||||
"type": "object",
|
||||
"description": description,
|
||||
"properties": properties,
|
||||
"additionalProperties": False,
|
||||
}
|
||||
if required:
|
||||
schema["required"] = required
|
||||
if title:
|
||||
schema["title"] = title
|
||||
return schema
|
||||
|
||||
|
||||
PORT_SCHEMA = tool_schema(
|
||||
"Observed port entry.",
|
||||
{
|
||||
"port": {"type": "integer", "description": "Port number."},
|
||||
"state": {"type": "string", "description": "State reported by nmap (e.g. open, closed)."},
|
||||
"service": {"type": "string", "description": "Detected service name, if available."},
|
||||
},
|
||||
required=["port"],
|
||||
title="Port",
|
||||
)
|
||||
|
||||
HOST_SCHEMA = tool_schema(
|
||||
"Host summary merged from inventory, OPNsense, and nmap.",
|
||||
{
|
||||
"id": {"type": "string", "description": "Stable host identifier (ip:* or mac:*)."},
|
||||
"name": {"type": "string", "description": "Best-known display name."},
|
||||
"ips": {"type": "array", "items": {"type": "string"}, "description": "Associated IP addresses."},
|
||||
"macs": {"type": "array", "items": {"type": "string"}, "description": "Observed MAC addresses."},
|
||||
"hostnames": {"type": "array", "items": {"type": "string"}, "description": "DNS or hostnames discovered."},
|
||||
"sources": {"type": "array", "items": {"type": "string"}, "description": "Data sources contributing to this record."},
|
||||
"last_seen": {"type": "string", "description": "ISO timestamp of the most recent observation."},
|
||||
"notes": {"type": "string", "description": "Inventory notes/annotations, if present."},
|
||||
"expected_ports": {"type": "array", "items": {"type": "string"}, "description": "Ports expected per inventory targets."},
|
||||
"ports": {"type": "array", "items": PORT_SCHEMA, "description": "Latest observed open ports."},
|
||||
},
|
||||
required=["id"],
|
||||
title="Host",
|
||||
)
|
||||
|
||||
EVENT_SCHEMA = tool_schema(
|
||||
"Scan or discovery event emitted by collectors.",
|
||||
{
|
||||
"id": {"type": "string", "description": "Event document identifier."},
|
||||
"timestamp": {"type": "string", "description": "Observation timestamp (@timestamp)."},
|
||||
"source": {"type": "string", "description": "Collector that produced the event (nmap, opnsense, inventory)."},
|
||||
"event": {"type": "object", "description": "Event metadata (type, outcome)."},
|
||||
"host": HOST_SCHEMA,
|
||||
"ports": {"type": "array", "items": PORT_SCHEMA, "description": "Ports included with the event (if any)."},
|
||||
},
|
||||
required=["id", "timestamp"],
|
||||
title="Event",
|
||||
)
|
||||
|
||||
NETWORK_ENTRY_SCHEMA = tool_schema(
|
||||
"Network grouping entry showing hosts per /24 or /64.",
|
||||
{
|
||||
"cidr": {"type": "string", "description": "CIDR label (e.g. 192.168.5.0/24)."},
|
||||
"hosts": {"type": "array", "items": HOST_SCHEMA, "description": "Hosts that belong to this network."},
|
||||
},
|
||||
required=["cidr", "hosts"],
|
||||
title="NetworkEntry",
|
||||
)
|
||||
|
||||
|
||||
MCP_TOOL_DEFINITIONS = {
|
||||
"list_hosts": {
|
||||
"title": "List Hosts",
|
||||
"description": "Return the merged view of every known device on the network with optional filtering by source or identifier.",
|
||||
"annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False},
|
||||
"inputSchema": tool_schema(
|
||||
"Filter options when listing hosts.",
|
||||
{
|
||||
"limit": {"type": "integer", "minimum": 1, "maximum": 5000, "title": "Limit", "description": "Maximum number of hosts to return."},
|
||||
"source": {"type": "string", "title": "Source filter", "description": "Only include hosts that contain this source tag (e.g. inventory, nmap, opnsense-arp)."},
|
||||
"terms": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"title": "Search terms",
|
||||
"description": "Identifiers (names, hostnames, IPs, or MACs) to match. Equivalent to repeated q parameters in the REST API.",
|
||||
},
|
||||
},
|
||||
title="ListHostsInput",
|
||||
),
|
||||
"outputSchema": tool_schema(
|
||||
"Host list result payload.",
|
||||
{
|
||||
"total": {"type": "integer", "description": "Number of hosts returned."},
|
||||
"hosts": {"type": "array", "items": HOST_SCHEMA, "description": "Host entries sorted by last-seen time."},
|
||||
},
|
||||
required=["total", "hosts"],
|
||||
title="ListHostsResult",
|
||||
),
|
||||
},
|
||||
"network_map": {
|
||||
"title": "Network Map",
|
||||
"description": "Summarize hosts grouped by detected /24 (IPv4) or /64 (IPv6) ranges.",
|
||||
"annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False},
|
||||
"inputSchema": tool_schema(
|
||||
"Options when generating the network grouping.",
|
||||
{
|
||||
"limit": {"type": "integer", "minimum": 1, "maximum": 5000, "title": "Host limit", "description": "Maximum number of hosts to consider when building the map."},
|
||||
},
|
||||
title="NetworkMapInput",
|
||||
),
|
||||
"outputSchema": tool_schema(
|
||||
"Grouped view of networks and their hosts.",
|
||||
{
|
||||
"host_count": {"type": "integer", "description": "Number of hosts examined for this map."},
|
||||
"networks": {"type": "array", "items": NETWORK_ENTRY_SCHEMA, "description": "List of network segments and their hosts."},
|
||||
},
|
||||
required=["host_count", "networks"],
|
||||
title="NetworkMapResult",
|
||||
),
|
||||
},
|
||||
"get_host": {
|
||||
"title": "Get Host",
|
||||
"description": "Fetch a single host document by ID, optionally including recent events.",
|
||||
"annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False},
|
||||
"inputSchema": tool_schema(
|
||||
"Parameters for retrieving an individual host.",
|
||||
{
|
||||
"host_id": {"type": "string", "title": "Host ID", "description": "Host identifier (e.g. ip:192.168.5.10, mac:aa:bb:cc...)."},
|
||||
"include_events": {"type": "boolean", "title": "Include events", "description": "If true, include recent events for the host."},
|
||||
"events_limit": {"type": "integer", "minimum": 1, "maximum": 1000, "title": "Events limit", "description": "Number of events to include if requested."},
|
||||
},
|
||||
required=["host_id"],
|
||||
title="GetHostInput",
|
||||
),
|
||||
"outputSchema": tool_schema(
|
||||
"Host payload with optional embedded events.",
|
||||
{
|
||||
"host": HOST_SCHEMA,
|
||||
"events": {"type": "array", "items": EVENT_SCHEMA, "description": "Recent events when include_events=true."},
|
||||
},
|
||||
required=["host"],
|
||||
title="GetHostResult",
|
||||
),
|
||||
},
|
||||
"list_events": {
|
||||
"title": "List Events",
|
||||
"description": "List recent scan/discovery events with optional filters.",
|
||||
"annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False},
|
||||
"inputSchema": tool_schema(
|
||||
"Filters applied when querying events.",
|
||||
{
|
||||
"limit": {"type": "integer", "minimum": 1, "maximum": 1000, "title": "Limit", "description": "Maximum number of events to return."},
|
||||
"host_id": {"type": "string", "title": "Host filter", "description": "Only include events for this host identifier."},
|
||||
"type": {"type": "string", "title": "Event type", "description": "Restrict to a specific event type (e.g. scan, discovery)."},
|
||||
"since": {"type": "string", "title": "Since timestamp", "description": "ISO8601 timestamp used as a lower bound for @timestamp."},
|
||||
},
|
||||
title="ListEventsInput",
|
||||
),
|
||||
"outputSchema": tool_schema(
|
||||
"Event search result.",
|
||||
{
|
||||
"total": {"type": "integer", "description": "Number of events returned."},
|
||||
"events": {"type": "array", "items": EVENT_SCHEMA, "description": "Event documents sorted by timestamp."},
|
||||
},
|
||||
required=["total", "events"],
|
||||
title="ListEventsResult",
|
||||
),
|
||||
},
|
||||
"host_events": {
|
||||
"title": "Host Events",
|
||||
"description": "List recent events associated with a specific host.",
|
||||
"annotations": {"readOnlyHint": True, "destructiveHint": False, "openWorldHint": False},
|
||||
"inputSchema": tool_schema(
|
||||
"Parameters when retrieving events bound to a single host.",
|
||||
{
|
||||
"host_id": {"type": "string", "title": "Host ID", "description": "Host identifier to filter by."},
|
||||
"limit": {"type": "integer", "minimum": 1, "maximum": 1000, "title": "Limit", "description": "Maximum number of events to return."},
|
||||
"type": {"type": "string", "title": "Event type", "description": "Restrict to a specific event type (e.g. scan, discovery)."},
|
||||
"since": {"type": "string", "title": "Since timestamp", "description": "ISO8601 timestamp used as a lower bound for @timestamp."},
|
||||
},
|
||||
required=["host_id"],
|
||||
title="HostEventsInput",
|
||||
),
|
||||
"outputSchema": tool_schema(
|
||||
"Event list scoped to a host.",
|
||||
{
|
||||
"total": {"type": "integer", "description": "Number of events returned for the host."},
|
||||
"events": {"type": "array", "items": EVENT_SCHEMA, "description": "Host-specific event entries."},
|
||||
},
|
||||
required=["total", "events"],
|
||||
title="HostEventsResult",
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def resolve_api_key(api_id: str, api_key: str):
|
||||
if api_id and api_key:
|
||||
return api_id, api_key
|
||||
if not api_key:
|
||||
return None, None
|
||||
if ":" in api_key:
|
||||
possible_id, possible_key = api_key.split(":", 1)
|
||||
return possible_id, possible_key
|
||||
try:
|
||||
decoded = base64.b64decode(api_key).decode()
|
||||
if ":" in decoded:
|
||||
possible_id, possible_key = decoded.split(":", 1)
|
||||
return possible_id, possible_key
|
||||
except Exception:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
|
||||
def build_es_request():
|
||||
headers = {}
|
||||
auth = None
|
||||
|
||||
api_id = os.getenv("ES_API_ID")
|
||||
api_key = os.getenv("ES_API_KEY")
|
||||
api_id, api_key = resolve_api_key(api_id, api_key)
|
||||
if api_id and api_key:
|
||||
token = base64.b64encode(f"{api_id}:{api_key}".encode()).decode()
|
||||
headers["Authorization"] = f"ApiKey {token}"
|
||||
else:
|
||||
auth = (os.getenv("ES_USER", "elastic"), os.getenv("ES_PASS", "changeme"))
|
||||
return headers, auth
|
||||
|
||||
|
||||
def normalize_host(doc: Dict) -> Dict:
|
||||
host = doc.get("host", {})
|
||||
ports = doc.get("ports", [])
|
||||
return {
|
||||
"id": host.get("id"),
|
||||
"name": host.get("name") or host.get("id"),
|
||||
"ips": host.get("ips", []),
|
||||
"macs": host.get("macs", []),
|
||||
"hostnames": host.get("hostnames", []),
|
||||
"sources": host.get("sources", []),
|
||||
"last_seen": host.get("last_seen"),
|
||||
"notes": host.get("notes"),
|
||||
"expected_ports": host.get("expected_ports", []),
|
||||
"ports": [
|
||||
{
|
||||
"port": p.get("port"),
|
||||
"state": p.get("state"),
|
||||
"service": (p.get("service") or {}).get("name"),
|
||||
}
|
||||
for p in ports
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def parse_search_terms(raw_terms: List[str]) -> List[str]:
|
||||
terms: List[str] = []
|
||||
for raw in raw_terms:
|
||||
if not raw:
|
||||
continue
|
||||
cleaned = raw.replace(",", " ")
|
||||
for chunk in cleaned.split():
|
||||
chunk = chunk.strip()
|
||||
if chunk:
|
||||
terms.append(chunk)
|
||||
return terms
|
||||
|
||||
|
||||
def coerce_string_list(value: Any) -> List[str]:
|
||||
if value is None:
|
||||
return []
|
||||
if isinstance(value, str):
|
||||
return [value]
|
||||
if isinstance(value, (list, tuple)):
|
||||
return [str(item) for item in value if item is not None]
|
||||
return []
|
||||
|
||||
|
||||
def clamp_int(value: Any, default: int, min_value: int, max_value: int) -> int:
|
||||
try:
|
||||
if value is None:
|
||||
return default
|
||||
parsed = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
return max(min_value, min(max_value, parsed))
|
||||
|
||||
|
||||
def coerce_bool(value: Any, default: bool = False) -> bool:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return value.lower() in {"1", "true", "yes", "on"}
|
||||
return default
|
||||
|
||||
|
||||
def build_search_clause(term: str) -> Dict:
|
||||
wildcard = f"*{term}*"
|
||||
return {
|
||||
"bool": {
|
||||
"should": [
|
||||
{"wildcard": {"host.name.keyword": {"value": wildcard, "case_insensitive": True}}},
|
||||
{"wildcard": {"host.hostnames.keyword": {"value": wildcard, "case_insensitive": True}}},
|
||||
{"wildcard": {"host.id.keyword": {"value": wildcard, "case_insensitive": True}}},
|
||||
{"wildcard": {"host.ips": {"value": wildcard, "case_insensitive": True}}},
|
||||
{"wildcard": {"host.macs": {"value": wildcard, "case_insensitive": True}}},
|
||||
],
|
||||
"minimum_should_match": 1,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def fetch_hosts(limit: int = HOST_SEARCH_LIMIT, source: Optional[str] = None, search_terms: Optional[List[str]] = None):
|
||||
headers, auth = build_es_request()
|
||||
body = {
|
||||
"size": limit,
|
||||
"sort": [{"host.last_seen": {"order": "desc"}}],
|
||||
}
|
||||
filters: List[Dict] = []
|
||||
if source:
|
||||
filters.append({"term": {"host.sources.keyword": source}})
|
||||
if search_terms:
|
||||
should_clauses = [build_search_clause(term) for term in search_terms]
|
||||
filters.append({"bool": {"should": should_clauses, "minimum_should_match": 1}})
|
||||
if filters:
|
||||
body["query"] = {"bool": {"filter": filters}}
|
||||
resp = requests.get(
|
||||
f"{ES_URL}/network-hosts/_search",
|
||||
json=body,
|
||||
headers=headers,
|
||||
auth=auth,
|
||||
verify=ES_VERIFY_SSL,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return [normalize_host(hit.get("_source", {})) for hit in resp.json()["hits"]["hits"]]
|
||||
|
||||
|
||||
def fetch_host_by_id(host_id: str) -> Optional[Dict]:
|
||||
headers, auth = build_es_request()
|
||||
body = {"size": 1, "query": {"term": {"host.id.keyword": host_id}}}
|
||||
resp = requests.get(
|
||||
f"{ES_URL}/network-hosts/_search",
|
||||
json=body,
|
||||
headers=headers,
|
||||
auth=auth,
|
||||
verify=ES_VERIFY_SSL,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
hits = resp.json()["hits"]["hits"]
|
||||
if not hits:
|
||||
return None
|
||||
return normalize_host(hits[0].get("_source", {}))
|
||||
|
||||
|
||||
def fetch_events(host_id: Optional[str] = None, limit: int = DEFAULT_EVENT_LIMIT, event_type: Optional[str] = None, since: Optional[str] = None):
|
||||
headers, auth = build_es_request()
|
||||
filters: List[Dict] = []
|
||||
if host_id:
|
||||
filters.append({"term": {"host.id.keyword": host_id}})
|
||||
if event_type:
|
||||
filters.append({"term": {"event.type.keyword": event_type}})
|
||||
if since:
|
||||
filters.append({"range": {"@timestamp": {"gte": since}}})
|
||||
body: Dict = {
|
||||
"size": limit,
|
||||
"sort": [{"@timestamp": {"order": "desc"}}],
|
||||
}
|
||||
if filters:
|
||||
body["query"] = {"bool": {"filter": filters}}
|
||||
resp = requests.get(
|
||||
f"{ES_URL}/network-events-*/_search",
|
||||
json=body,
|
||||
headers=headers,
|
||||
auth=auth,
|
||||
verify=ES_VERIFY_SSL,
|
||||
)
|
||||
if resp.status_code == 404:
|
||||
return []
|
||||
resp.raise_for_status()
|
||||
events = []
|
||||
for hit in resp.json()["hits"]["hits"]:
|
||||
doc = hit.get("_source", {})
|
||||
events.append(
|
||||
{
|
||||
"id": hit.get("_id"),
|
||||
"timestamp": doc.get("@timestamp"),
|
||||
"event": doc.get("event", {}),
|
||||
"host": doc.get("host", {}),
|
||||
"observed": doc.get("observed"),
|
||||
"scan": doc.get("scan"),
|
||||
"ports": doc.get("ports", []),
|
||||
"source": doc.get("source"),
|
||||
}
|
||||
)
|
||||
return events
|
||||
|
||||
|
||||
def derive_network_label(ip: str) -> str:
|
||||
if not ip:
|
||||
return "unknown"
|
||||
if ":" in ip:
|
||||
parts = ip.split(":")
|
||||
prefix = ":".join(parts[:4])
|
||||
return f"{prefix}::/64"
|
||||
octets = ip.split(".")
|
||||
if len(octets) == 4:
|
||||
return f"{octets[0]}.{octets[1]}.{octets[2]}.0/24"
|
||||
return "unknown"
|
||||
|
||||
|
||||
def build_network_map(hosts: List[Dict]):
|
||||
networks: Dict[str, Dict] = {}
|
||||
for host in hosts:
|
||||
seen = set()
|
||||
for ip in host.get("ips", []):
|
||||
label = derive_network_label(ip)
|
||||
if label in seen:
|
||||
continue
|
||||
seen.add(label)
|
||||
entry = networks.setdefault(label, {"cidr": label, "hosts": []})
|
||||
entry["hosts"].append(
|
||||
{
|
||||
"id": host.get("id"),
|
||||
"name": host.get("name"),
|
||||
"ips": host.get("ips", []),
|
||||
"sources": host.get("sources", []),
|
||||
"last_seen": host.get("last_seen"),
|
||||
}
|
||||
)
|
||||
sorted_networks = sorted(networks.values(), key=lambda n: n["cidr"])
|
||||
for entry in sorted_networks:
|
||||
entry["hosts"].sort(key=lambda h: h.get("name") or h.get("id") or "")
|
||||
return sorted_networks
|
||||
|
||||
|
||||
def bool_arg(value: Optional[str], default: bool = False) -> bool:
|
||||
if value is None:
|
||||
return default
|
||||
return value.lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def build_manifest(base_url: str) -> Dict:
|
||||
base = base_url.rstrip("/")
|
||||
tools = []
|
||||
for tool in REST_TOOLS:
|
||||
tools.append(
|
||||
{
|
||||
"name": tool["name"],
|
||||
"description": tool["description"],
|
||||
"method": tool["method"],
|
||||
"path": tool["path"],
|
||||
"url": f"{base}{tool['path']}",
|
||||
}
|
||||
)
|
||||
return {
|
||||
"name": "network-mcp",
|
||||
"description": "Network discovery source-of-truth backed by Elasticsearch, Nmap, and OPNsense.",
|
||||
"schema": "1.0",
|
||||
"tools": tools,
|
||||
"auth": "env",
|
||||
}
|
||||
|
||||
|
||||
def tool_result(summary: str, data: Dict[str, Any]):
|
||||
return summary, data
|
||||
|
||||
|
||||
def handle_tool_list_hosts(arguments: Dict[str, Any]):
|
||||
limit = clamp_int(arguments.get("limit"), HOST_SEARCH_LIMIT, 1, 5000)
|
||||
raw_terms = coerce_string_list(arguments.get("terms"))
|
||||
search_terms = parse_search_terms(raw_terms)
|
||||
hosts = fetch_hosts(limit=limit, source=arguments.get("source"), search_terms=search_terms or None)
|
||||
return tool_result(f"Returned {len(hosts)} hosts.", {"hosts": hosts, "total": len(hosts)})
|
||||
|
||||
|
||||
def handle_tool_network_map(arguments: Dict[str, Any]):
|
||||
limit = clamp_int(arguments.get("limit"), HOST_SEARCH_LIMIT, 1, 5000)
|
||||
hosts = fetch_hosts(limit=limit)
|
||||
network_map = build_network_map(hosts)
|
||||
return tool_result(f"Computed {len(network_map)} networks.", {"networks": network_map, "host_count": len(hosts)})
|
||||
|
||||
|
||||
def handle_tool_get_host(arguments: Dict[str, Any]):
|
||||
host_id = arguments.get("host_id")
|
||||
if not host_id:
|
||||
raise ValueError("host_id is required")
|
||||
host = fetch_host_by_id(host_id)
|
||||
if not host:
|
||||
raise KeyError(f"Host {host_id} not found")
|
||||
include_events = coerce_bool(arguments.get("include_events"), default=False)
|
||||
result = {"host": host}
|
||||
if include_events:
|
||||
events_limit = clamp_int(arguments.get("events_limit"), DEFAULT_EVENT_LIMIT, 1, 1000)
|
||||
result["events"] = fetch_events(host_id=host_id, limit=events_limit)
|
||||
return tool_result(f"Fetched host {host_id}.", result)
|
||||
|
||||
|
||||
def handle_tool_list_events(arguments: Dict[str, Any]):
|
||||
limit = clamp_int(arguments.get("limit"), DEFAULT_EVENT_LIMIT, 1, 1000)
|
||||
events = fetch_events(
|
||||
host_id=arguments.get("host_id"),
|
||||
limit=limit,
|
||||
event_type=arguments.get("type"),
|
||||
since=arguments.get("since"),
|
||||
)
|
||||
return tool_result(f"Returned {len(events)} events.", {"events": events, "total": len(events)})
|
||||
|
||||
|
||||
def handle_tool_host_events(arguments: Dict[str, Any]):
|
||||
host_id = arguments.get("host_id")
|
||||
if not host_id:
|
||||
raise ValueError("host_id is required")
|
||||
limit = clamp_int(arguments.get("limit"), DEFAULT_EVENT_LIMIT, 1, 1000)
|
||||
events = fetch_events(host_id=host_id, limit=limit, event_type=arguments.get("type"), since=arguments.get("since"))
|
||||
return tool_result(f"Returned {len(events)} events for {host_id}.", {"events": events, "total": len(events)})
|
||||
|
||||
|
||||
TOOL_HANDLERS = {
|
||||
"list_hosts": handle_tool_list_hosts,
|
||||
"network_map": handle_tool_network_map,
|
||||
"get_host": handle_tool_get_host,
|
||||
"list_events": handle_tool_list_events,
|
||||
"host_events": handle_tool_host_events,
|
||||
}
|
||||
|
||||
|
||||
def list_mcp_tools():
|
||||
tools = []
|
||||
for name, meta in MCP_TOOL_DEFINITIONS.items():
|
||||
tool = {
|
||||
"name": name,
|
||||
"description": meta.get("description"),
|
||||
"inputSchema": meta.get("inputSchema", {"type": "object"}),
|
||||
}
|
||||
title = meta.get("title")
|
||||
if title:
|
||||
tool["title"] = title
|
||||
output_schema = meta.get("outputSchema")
|
||||
if output_schema:
|
||||
tool["outputSchema"] = output_schema
|
||||
annotations = meta.get("annotations")
|
||||
if annotations:
|
||||
tool["annotations"] = annotations
|
||||
tools.append(tool)
|
||||
return tools
|
||||
|
||||
|
||||
def call_tool_by_name(name: str, arguments: Optional[Dict[str, Any]] = None):
|
||||
if name not in TOOL_HANDLERS:
|
||||
raise KeyError(f"Unknown tool: {name}")
|
||||
handler = TOOL_HANDLERS[name]
|
||||
summary, data = handler(arguments or {})
|
||||
return summary, data
|
||||
|
||||
|
||||
def list_mcp_resources(base_uri: str = "network://"):
|
||||
return [
|
||||
{
|
||||
"uri": f"{base_uri}hosts",
|
||||
"name": "hosts",
|
||||
"title": "Hosts (Snapshot)",
|
||||
"mimeType": "application/json",
|
||||
"description": "Snapshot of merged hosts (inventory + opnsense + nmap). Use resources/templates/list for search parameters.",
|
||||
},
|
||||
{
|
||||
"uri": f"{base_uri}map",
|
||||
"name": "map",
|
||||
"title": "Network Map (Snapshot)",
|
||||
"mimeType": "application/json",
|
||||
"description": "Snapshot of networks grouped by /24 (IPv4) or /64 (IPv6).",
|
||||
},
|
||||
{
|
||||
"uri": f"{base_uri}events",
|
||||
"name": "events",
|
||||
"title": "Recent Events (Snapshot)",
|
||||
"mimeType": "application/json",
|
||||
"description": "Recent scan/discovery events. Use resources/templates/list for filters (host_id/type/since).",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def list_mcp_resource_templates(base_uri: str = "network://"):
|
||||
return [
|
||||
{
|
||||
"uriTemplate": f"{base_uri}hosts{{?q,source,limit}}",
|
||||
"name": "hosts_query",
|
||||
"title": "Hosts Query",
|
||||
"mimeType": "application/json",
|
||||
"description": "Query hosts by q (hostname/IP/MAC/name, case-insensitive), source, and limit. Repeat q to provide multiple terms.",
|
||||
},
|
||||
{
|
||||
"uriTemplate": f"{base_uri}host/{{host_id}}{{?include_events,events_limit}}",
|
||||
"name": "host_detail",
|
||||
"title": "Host Detail",
|
||||
"mimeType": "application/json",
|
||||
"description": "Fetch a single host by host_id (e.g. mac:aa:bb.. or ip:192.168.5.10). Optionally include events.",
|
||||
},
|
||||
{
|
||||
"uriTemplate": f"{base_uri}events{{?host_id,type,since,limit}}",
|
||||
"name": "events_query",
|
||||
"title": "Events Query",
|
||||
"mimeType": "application/json",
|
||||
"description": "Query recent events with optional filters host_id, type, since (ISO8601), and limit.",
|
||||
},
|
||||
{
|
||||
"uriTemplate": f"{base_uri}map{{?limit}}",
|
||||
"name": "map_query",
|
||||
"title": "Network Map",
|
||||
"mimeType": "application/json",
|
||||
"description": "Build a network map from up to limit hosts.",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def read_mcp_resource(uri: str):
|
||||
parsed = urlparse(uri)
|
||||
if parsed.scheme != "network":
|
||||
raise ValueError(f"Unsupported resource URI scheme: {parsed.scheme}")
|
||||
|
||||
netloc = parsed.netloc
|
||||
query = parse_qs(parsed.query or "")
|
||||
|
||||
if netloc == "hosts":
|
||||
limit = clamp_int((query.get("limit") or [HOST_SEARCH_LIMIT])[0], HOST_SEARCH_LIMIT, 1, 5000)
|
||||
source = (query.get("source") or [None])[0]
|
||||
q_terms = query.get("q") or []
|
||||
search_terms = parse_search_terms(q_terms)
|
||||
payload = {"hosts": fetch_hosts(limit=limit, source=source, search_terms=search_terms or None)}
|
||||
payload["total"] = len(payload["hosts"])
|
||||
return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]}
|
||||
|
||||
if netloc == "map":
|
||||
limit = clamp_int((query.get("limit") or [HOST_SEARCH_LIMIT])[0], HOST_SEARCH_LIMIT, 1, 5000)
|
||||
hosts = fetch_hosts(limit=limit)
|
||||
payload = {"networks": build_network_map(hosts), "host_count": len(hosts)}
|
||||
return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]}
|
||||
|
||||
if netloc == "events":
|
||||
limit = clamp_int((query.get("limit") or [DEFAULT_EVENT_LIMIT])[0], DEFAULT_EVENT_LIMIT, 1, 1000)
|
||||
host_id = (query.get("host_id") or [None])[0]
|
||||
event_type = (query.get("type") or [None])[0]
|
||||
since = (query.get("since") or [None])[0]
|
||||
events = fetch_events(host_id=host_id, limit=limit, event_type=event_type, since=since)
|
||||
payload = {"events": events, "total": len(events)}
|
||||
return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]}
|
||||
|
||||
if netloc == "host":
|
||||
host_id = unquote((parsed.path or "").lstrip("/"))
|
||||
if not host_id:
|
||||
raise ValueError("Host resource requires /<host_id> path")
|
||||
include_events = coerce_bool((query.get("include_events") or [False])[0], default=False)
|
||||
events_limit = clamp_int((query.get("events_limit") or [DEFAULT_EVENT_LIMIT])[0], DEFAULT_EVENT_LIMIT, 1, 1000)
|
||||
host = fetch_host_by_id(host_id)
|
||||
if not host:
|
||||
raise KeyError(f"Host {host_id} not found")
|
||||
payload = {"host": host}
|
||||
if include_events:
|
||||
payload["events"] = fetch_events(host_id=host_id, limit=events_limit)
|
||||
return {"contents": [{"uri": uri, "mimeType": "application/json", "text": json.dumps(payload)}]}
|
||||
|
||||
raise ValueError(f"Unknown resource URI: {uri}")
|
||||
|
||||
|
||||
def jsonrpc_error(rpc_id: Any, code: int, message: str):
|
||||
return {
|
||||
"jsonrpc": "2.0",
|
||||
"id": rpc_id,
|
||||
"error": {"code": code, "message": message},
|
||||
}
|
||||
|
||||
|
||||
def build_initialize_result(protocol_version: Optional[str] = None):
|
||||
protocol_version = protocol_version or "2025-11-25"
|
||||
return {
|
||||
"protocolVersion": protocol_version,
|
||||
"capabilities": {
|
||||
"tools": {"listChanged": False},
|
||||
"resources": {"listChanged": False, "subscribe": False},
|
||||
},
|
||||
"serverInfo": {"name": "network-mcp", "version": SERVER_VERSION},
|
||||
"instructions": "Start with list_hosts (search by hostname/IP/MAC), then use get_host for details and list_events/host_events for timelines; network_map gives a quick /24-/64 overview.",
|
||||
}
|
||||
|
||||
|
||||
def process_rpc_request(payload: Dict[str, Any]):
|
||||
if not isinstance(payload, dict):
|
||||
return jsonrpc_error(None, -32600, "Invalid request")
|
||||
rpc_id = payload.get("id")
|
||||
method = payload.get("method")
|
||||
params = payload.get("params") or {}
|
||||
is_notification = rpc_id is None
|
||||
|
||||
if method == "initialize":
|
||||
requested = params.get("protocolVersion")
|
||||
requested_str = str(requested) if requested is not None else None
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": build_initialize_result(requested_str)}
|
||||
|
||||
if method == "ping":
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": {}}
|
||||
|
||||
if method == "tools/list":
|
||||
result = {"tools": list_mcp_tools(), "nextCursor": None}
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
|
||||
if method == "resources/list":
|
||||
result = {"resources": list_mcp_resources(), "nextCursor": None}
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
|
||||
if method == "resources/templates/list":
|
||||
result = {"resourceTemplates": list_mcp_resource_templates(), "nextCursor": None}
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
|
||||
if method == "resources/read":
|
||||
uri = (params or {}).get("uri")
|
||||
if not uri:
|
||||
return jsonrpc_error(rpc_id, -32602, "uri is required")
|
||||
try:
|
||||
result = read_mcp_resource(uri)
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
except ValueError as exc:
|
||||
return jsonrpc_error(rpc_id, -32602, str(exc))
|
||||
except KeyError as exc:
|
||||
message = exc.args[0] if exc.args else str(exc)
|
||||
return jsonrpc_error(rpc_id, -32004, message)
|
||||
|
||||
if method == "notifications/initialized":
|
||||
# No response for notifications.
|
||||
return None
|
||||
|
||||
if method == "tools/call":
|
||||
name = params.get("name")
|
||||
if not name:
|
||||
if is_notification:
|
||||
return None
|
||||
return jsonrpc_error(rpc_id, -32602, "Tool name is required")
|
||||
arguments = params.get("arguments") or {}
|
||||
try:
|
||||
summary, data = call_tool_by_name(name, arguments)
|
||||
result = {
|
||||
"content": [{"type": "text", "text": summary}],
|
||||
"structuredContent": data,
|
||||
"isError": False,
|
||||
}
|
||||
if is_notification:
|
||||
return None
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
except ValueError as exc:
|
||||
if is_notification:
|
||||
return None
|
||||
result = {
|
||||
"content": [{"type": "text", "text": f"Tool argument error: {exc}"}],
|
||||
"structuredContent": {"error": str(exc)},
|
||||
"isError": True,
|
||||
}
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
except KeyError as exc:
|
||||
message = exc.args[0] if exc.args else str(exc)
|
||||
if is_notification:
|
||||
return None
|
||||
result = {
|
||||
"content": [{"type": "text", "text": f"Tool error: {message}"}],
|
||||
"structuredContent": {"error": message},
|
||||
"isError": True,
|
||||
}
|
||||
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
if is_notification:
|
||||
return None
|
||||
return jsonrpc_error(rpc_id, -32603, f"Internal error: {exc}")
|
||||
|
||||
if is_notification:
|
||||
return None
|
||||
|
||||
return jsonrpc_error(rpc_id, -32601, f"Method {method} not found")
|
||||
|
||||
|
||||
def process_rpc_envelope(payload: Any):
|
||||
if isinstance(payload, list):
|
||||
responses = []
|
||||
for entry in payload:
|
||||
response = process_rpc_request(entry)
|
||||
if response is not None:
|
||||
responses.append(response)
|
||||
return responses
|
||||
if isinstance(payload, dict):
|
||||
return process_rpc_request(payload)
|
||||
return jsonrpc_error(None, -32600, "Invalid request")
|
||||
|
||||
|
||||
@app.route("/api/hosts")
|
||||
def api_hosts():
|
||||
limit = min(int(request.args.get("limit", HOST_SEARCH_LIMIT)), 5000)
|
||||
q_args = request.args.getlist("q")
|
||||
search_terms = parse_search_terms(q_args)
|
||||
hosts = fetch_hosts(
|
||||
limit=limit,
|
||||
source=request.args.get("source"),
|
||||
search_terms=search_terms if search_terms else None,
|
||||
)
|
||||
return jsonify({"hosts": hosts, "total": len(hosts)})
|
||||
|
||||
|
||||
@app.route("/api/hosts/<path:host_id>")
|
||||
def api_host_detail(host_id: str):
|
||||
host = fetch_host_by_id(host_id)
|
||||
if not host:
|
||||
abort(404, description=f"Host {host_id} not found")
|
||||
include_events = bool_arg(request.args.get("include_events"), default=False)
|
||||
result = {"host": host}
|
||||
if include_events:
|
||||
limit = min(int(request.args.get("events_limit", DEFAULT_EVENT_LIMIT)), 1000)
|
||||
result["events"] = fetch_events(host_id=host_id, limit=limit)
|
||||
return jsonify(result)
|
||||
|
||||
|
||||
@app.route("/api/events")
|
||||
def api_events():
|
||||
limit = min(int(request.args.get("limit", DEFAULT_EVENT_LIMIT)), 1000)
|
||||
events = fetch_events(
|
||||
host_id=request.args.get("host_id"),
|
||||
limit=limit,
|
||||
event_type=request.args.get("type"),
|
||||
since=request.args.get("since"),
|
||||
)
|
||||
return jsonify({"events": events, "total": len(events)})
|
||||
|
||||
|
||||
@app.route("/api/hosts/<path:host_id>/events")
|
||||
def api_host_events(host_id: str):
|
||||
limit = min(int(request.args.get("limit", DEFAULT_EVENT_LIMIT)), 1000)
|
||||
events = fetch_events(host_id=host_id, limit=limit, event_type=request.args.get("type"), since=request.args.get("since"))
|
||||
return jsonify({"events": events, "total": len(events)})
|
||||
|
||||
|
||||
@app.route("/api/map")
|
||||
def api_map():
|
||||
limit = min(int(request.args.get("limit", HOST_SEARCH_LIMIT)), 5000)
|
||||
hosts = fetch_hosts(limit=limit)
|
||||
network_map = build_network_map(hosts)
|
||||
return jsonify({"networks": network_map, "host_count": len(hosts)})
|
||||
|
||||
|
||||
@app.route("/.well-known/mcp.json", methods=["GET", "POST", "OPTIONS"])
|
||||
@app.route("/api/mcp", methods=["GET", "POST", "OPTIONS"])
|
||||
def api_manifest():
|
||||
if request.method == "OPTIONS":
|
||||
return ("", 204, {"Allow": "GET,POST,OPTIONS"})
|
||||
if request.method == "POST":
|
||||
payload = request.get_json(silent=True)
|
||||
if payload is None:
|
||||
return jsonify(jsonrpc_error(None, -32700, "Invalid JSON")), 400
|
||||
rpc_response = process_rpc_envelope(payload)
|
||||
if rpc_response is None or (isinstance(rpc_response, list) and not rpc_response):
|
||||
return ("", 204)
|
||||
return jsonify(rpc_response)
|
||||
manifest = build_manifest(request.url_root.rstrip("/"))
|
||||
return jsonify(manifest)
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index():
|
||||
hosts = fetch_hosts()
|
||||
total = len(hosts)
|
||||
with_ports = sum(1 for h in hosts if h["ports"])
|
||||
inventory_hosts = sum(1 for h in hosts if "inventory" in h["sources"])
|
||||
return render_template(
|
||||
"index.html",
|
||||
hosts=hosts,
|
||||
total=total,
|
||||
with_ports=with_ports,
|
||||
inventory_hosts=inventory_hosts,
|
||||
es_url=ES_URL,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=int(os.getenv("FRONTEND_PORT", "5001")))
|
||||
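For a quick end-to-end check of the JSON-RPC endpoint defined above, something like the following works once the frontend container is up; the base URL assumes the default FRONTEND_PORT of 5001 and a locally reachable host.

import requests

BASE = "http://localhost:5001/api/mcp"  # assumption: frontend published on the default port

def rpc(method, params=None, rpc_id=1):
    payload = {"jsonrpc": "2.0", "id": rpc_id, "method": method, "params": params or {}}
    resp = requests.post(BASE, json=payload, timeout=10)
    resp.raise_for_status()
    return resp.json()

print(rpc("initialize")["result"]["serverInfo"])                      # name/version of the server
print([t["name"] for t in rpc("tools/list")["result"]["tools"]])      # list_hosts, network_map, ...
result = rpc("tools/call", {"name": "list_hosts", "arguments": {"terms": ["192.168.5."], "limit": 10}})
print(result["result"]["structuredContent"]["total"])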
4
stacks/network-mcp/frontend/requirements.txt
Normal file
4
stacks/network-mcp/frontend/requirements.txt
Normal file
@ -0,0 +1,4 @@
Flask==2.2.5
requests==2.31.0
python-dotenv==0.21.1
gunicorn==21.2.0
206
stacks/network-mcp/frontend/templates/index.html
Normal file
206
stacks/network-mcp/frontend/templates/index.html
Normal file
@ -0,0 +1,206 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Network MCP Hosts</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||
background: #0f172a;
|
||||
color: #e2e8f0;
|
||||
margin: 0;
|
||||
padding: 0 1.5rem 2rem;
|
||||
}
|
||||
header {
|
||||
padding: 2rem 0 1rem;
|
||||
}
|
||||
h1 {
|
||||
margin: 0;
|
||||
}
|
||||
.metrics {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
flex-wrap: wrap;
|
||||
margin: 1rem 0 2rem;
|
||||
}
|
||||
.metric-card {
|
||||
background: #1e293b;
|
||||
padding: 1rem 1.5rem;
|
||||
border-radius: 0.75rem;
|
||||
border: 1px solid #334155;
|
||||
min-width: 160px;
|
||||
}
|
||||
.metric-card h3 {
|
||||
margin: 0;
|
||||
font-size: 0.9rem;
|
||||
color: #94a3b8;
|
||||
}
|
||||
.metric-card p {
|
||||
margin: 0.4rem 0 0;
|
||||
font-size: 1.5rem;
|
||||
font-weight: bold;
|
||||
color: #f1f5f9;
|
||||
}
|
||||
.hosts-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(320px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
.host-card {
|
||||
background: #1e293b;
|
||||
border-radius: 0.75rem;
|
||||
border: 1px solid #334155;
|
||||
padding: 1rem;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.6rem;
|
||||
}
|
||||
.host-card h2 {
|
||||
margin: 0;
|
||||
font-size: 1.1rem;
|
||||
color: #f8fafc;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
.sources span {
|
||||
display: inline-block;
|
||||
font-size: 0.7rem;
|
||||
padding: 0.15rem 0.4rem;
|
||||
margin-right: 0.3rem;
|
||||
border-radius: 0.4rem;
|
||||
background: #0f766e;
|
||||
}
|
||||
.host-card ul {
|
||||
margin: 0;
|
||||
padding-left: 1.2rem;
|
||||
}
|
||||
.port-list {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.4rem;
|
||||
}
|
||||
.port-chip {
|
||||
background: #0f172a;
|
||||
border: 1px solid #334155;
|
||||
border-radius: 9999px;
|
||||
padding: 0.2rem 0.6rem;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
.notes {
|
||||
font-size: 0.9rem;
|
||||
color: #cbd5f5;
|
||||
}
|
||||
.source-tag-inventory {
|
||||
background: #a855f7;
|
||||
}
|
||||
.source-tag-opnsense {
|
||||
background: #0284c7;
|
||||
}
|
||||
.source-tag-nmap {
|
||||
background: #ea580c;
|
||||
}
|
||||
.source-tag-discovery {
|
||||
background: #0ea5e9;
|
||||
}
|
||||
.section-title {
|
||||
font-size: 0.9rem;
|
||||
color: #94a3b8;
|
||||
margin: 0;
|
||||
}
|
||||
.ips, .notes, .last-seen {
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1>Network MCP Overview</h1>
|
||||
<p class="last-seen">Elasticsearch: {{ es_url }}</p>
|
||||
</header>
|
||||
|
||||
<section class="metrics">
|
||||
<div class="metric-card">
|
||||
<h3>Total Hosts</h3>
|
||||
<p>{{ total }}</p>
|
||||
</div>
|
||||
<div class="metric-card">
|
||||
<h3>With Port Data</h3>
|
||||
<p>{{ with_ports }}</p>
|
||||
</div>
|
||||
<div class="metric-card">
|
||||
<h3>Inventory Entries</h3>
|
||||
<p>{{ inventory_hosts }}</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="hosts-grid">
|
||||
{% for host in hosts %}
|
||||
<article class="host-card">
|
||||
<h2>{{ host.name }}
|
||||
{% if host.notes %}
|
||||
<span title="Inventory notes available">📝</span>
|
||||
{% endif %}
|
||||
</h2>
|
||||
<div class="sources">
|
||||
{% for source in host.sources %}
|
||||
{% set tag_class = "" %}
|
||||
{% if source == "inventory" %}
|
||||
{% set tag_class = "source-tag-inventory" %}
|
||||
{% elif source.startswith("opnsense") %}
|
||||
{% set tag_class = "source-tag-opnsense" %}
|
||||
{% elif source == "nmap" %}
|
||||
{% set tag_class = "source-tag-nmap" %}
|
||||
{% elif source == "nmap-discovery" %}
|
||||
{% set tag_class = "source-tag-discovery" %}
|
||||
{% endif %}
|
||||
<span class="{{ tag_class }}">{{ source }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<div class="ips">
|
||||
<strong>IPs:</strong> {{ host.ips|join(", ") if host.ips else "—" }}
|
||||
</div>
|
||||
{% if host.macs %}
|
||||
<div class="ips">
|
||||
<strong>MACs:</strong> {{ host.macs|join(", ") }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if host.hostnames %}
|
||||
<div class="ips">
|
||||
<strong>Hostnames:</strong> {{ host.hostnames|join(", ") }}
|
||||
</div>
|
||||
{% endif %}
|
||||
<div class="last-seen">
|
||||
<strong>Last seen:</strong> {{ host.last_seen or "unknown" }}
|
||||
</div>
|
||||
{% if host.notes %}
|
||||
<div class="notes">
|
||||
<strong>Notes:</strong> {{ host.notes }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if host.expected_ports %}
|
||||
<div>
|
||||
<p class="section-title">Expected Ports</p>
|
||||
<div class="port-list">
|
||||
{% for port in host.expected_ports %}
|
||||
<span class="port-chip">{{ port }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if host.ports %}
|
||||
<div>
|
||||
<p class="section-title">Observed Ports</p>
|
||||
<div class="port-list">
|
||||
{% for port in host.ports %}
|
||||
<span class="port-chip">{{ port.port }} {{ port.service or "" }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</article>
|
||||
{% endfor %}
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
||||
2
stacks/network-mcp/frontend/tests/__init__.py
Normal file
2
stacks/network-mcp/frontend/tests/__init__.py
Normal file
@ -0,0 +1,2 @@
"""Unit tests for the Network MCP frontend."""

203
stacks/network-mcp/frontend/tests/test_mcp.py
Normal file
203
stacks/network-mcp/frontend/tests/test_mcp.py
Normal file
@ -0,0 +1,203 @@
import json
import unittest
from unittest.mock import patch


class FakeResponse:
    def __init__(self, payload, status_code=200):
        self._payload = payload
        self.status_code = status_code

    def json(self):
        return self._payload

    def raise_for_status(self):
        if self.status_code >= 400:
            raise RuntimeError(f"HTTP {self.status_code}")


def _wildcard_match(pattern: str, value: str, case_insensitive: bool) -> bool:
    if value is None:
        return False
    if case_insensitive:
        pattern = pattern.lower()
        value = value.lower()
    if pattern.startswith("*") and pattern.endswith("*"):
        needle = pattern.strip("*")
        return needle in value
    return pattern == value


def _extract_wildcard_clause(field_clause):
    # Supports either {"field": "*term*"} or {"field": {"value": "*term*", "case_insensitive": true}}
    if not isinstance(field_clause, dict):
        return None, None, None
    if len(field_clause) != 1:
        return None, None, None
    field, value = next(iter(field_clause.items()))
    if isinstance(value, str):
        return field, value, False
    if isinstance(value, dict):
        return field, value.get("value"), bool(value.get("case_insensitive"))
    return None, None, None


def _filter_hosts_by_query(host_docs, query):
    if not query:
        return host_docs
    bool_query = query.get("bool") if isinstance(query, dict) else None
    if not bool_query:
        return host_docs
    filters = bool_query.get("filter") or []
    if not filters:
        return host_docs

    matched = host_docs
    for f in filters:
        if "term" in f and "host.sources.keyword" in f["term"]:
            src = f["term"]["host.sources.keyword"]
            matched = [h for h in matched if src in (h.get("host", {}).get("sources") or [])]
            continue

        if "bool" in f and "should" in f["bool"]:
            shoulds = f["bool"]["should"]

            def matches_any(host_doc):
                host = host_doc.get("host", {})
                haystacks = {
                    "host.name.keyword": [host.get("name")],
                    "host.hostnames.keyword": host.get("hostnames") or [],
                    "host.id.keyword": [host.get("id")],
                    "host.ips": host.get("ips") or [],
                    "host.macs": host.get("macs") or [],
                }
                for clause in shoulds:
                    if "bool" in clause and "should" in clause["bool"]:
                        # nested should from multiple search terms
                        nested_shoulds = clause["bool"]["should"]
                        for nested in nested_shoulds:
                            if "wildcard" not in nested:
                                continue
                            field, value, ci = _extract_wildcard_clause(nested["wildcard"])
                            if not field or value is None:
                                continue
                            for candidate in haystacks.get(field, []):
                                if _wildcard_match(value, str(candidate or ""), ci):
                                    return True
                    if "wildcard" in clause:
                        field, value, ci = _extract_wildcard_clause(clause["wildcard"])
                        if not field or value is None:
                            continue
                        for candidate in haystacks.get(field, []):
                            if _wildcard_match(value, str(candidate or ""), ci):
                                return True
                return False

            matched = [h for h in matched if matches_any(h)]
            continue
    return matched


class TestNetworkMCP(unittest.TestCase):
    def setUp(self):
        from frontend import app as app_module

        self.app_module = app_module
        self.client = app_module.app.test_client()

        self.host_docs = [
            {
                "host": {
                    "id": "mac:dc:a6:32:67:55:dc",
                    "name": "SEELE",
                    "hostnames": ["SEELE"],
                    "ips": ["192.168.5.208"],
                    "macs": ["dc:a6:32:67:55:dc"],
                    "sources": ["opnsense-dhcp", "opnsense-arp"],
                    "last_seen": "2025-12-14T16:27:15.427091+00:00",
                },
                "ports": [{"port": 22, "state": "open", "service": {"name": "ssh"}}],
            },
            {
                "host": {
                    "id": "mac:aa:bb:cc:dd:ee:ff",
                    "name": "core",
                    "hostnames": ["core.localdomain"],
                    "ips": ["192.168.5.34"],
                    "macs": ["aa:bb:cc:dd:ee:ff"],
                    "sources": ["inventory", "opnsense-arp"],
                    "last_seen": "2025-12-14T16:27:15.427091+00:00",
                    "notes": "Production Docker host",
                },
                "ports": [{"port": 443, "state": "open", "service": {"name": "https"}}],
            },
        ]

    def fake_requests_get(self, url, json=None, headers=None, auth=None, verify=None):
        if url.endswith("/network-hosts/_search"):
            query = (json or {}).get("query")
            hits = _filter_hosts_by_query(self.host_docs, query)
            return FakeResponse({"hits": {"hits": [{"_source": h} for h in hits]}})
        if "/network-events-" in url and url.endswith("/_search"):
            return FakeResponse({"hits": {"hits": []}})
        return FakeResponse({}, status_code=404)

    def test_rest_search_hostname_case_insensitive(self):
        with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get):
            resp = self.client.get("/api/hosts?q=seele&limit=50")
            self.assertEqual(resp.status_code, 200)
            payload = resp.get_json()
            self.assertEqual(payload["total"], 1)
            self.assertEqual(payload["hosts"][0]["name"], "SEELE")

    def test_rest_search_by_ip(self):
        with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get):
            resp = self.client.get("/api/hosts?q=192.168.5.208")
            payload = resp.get_json()
            self.assertEqual(payload["total"], 1)
            self.assertEqual(payload["hosts"][0]["id"], "mac:dc:a6:32:67:55:dc")

    def test_rest_search_by_mac(self):
        with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get):
            resp = self.client.get("/api/hosts?q=dc:a6:32:67:55:dc")
            payload = resp.get_json()
            self.assertEqual(payload["total"], 1)
            self.assertEqual(payload["hosts"][0]["name"], "SEELE")

    def test_mcp_tools_call_search_terms(self):
        with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get):
            body = {
                "jsonrpc": "2.0",
                "id": 1,
                "method": "tools/call",
                "params": {"name": "list_hosts", "arguments": {"terms": ["seele"], "limit": 10}},
            }
            resp = self.client.post("/.well-known/mcp.json", data=json.dumps(body), content_type="application/json")
            self.assertEqual(resp.status_code, 200)
            payload = resp.get_json()
            self.assertFalse(payload["result"]["isError"])
            hosts = payload["result"]["structuredContent"]["hosts"]
            self.assertEqual(len(hosts), 1)
            self.assertEqual(hosts[0]["name"], "SEELE")

    def test_mcp_resources_read_hosts_query(self):
        with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get):
            body = {"jsonrpc": "2.0", "id": 2, "method": "resources/read", "params": {"uri": "network://hosts?q=seele&limit=5"}}
            resp = self.client.post("/.well-known/mcp.json", data=json.dumps(body), content_type="application/json")
            self.assertEqual(resp.status_code, 200)
            result = resp.get_json()["result"]
            self.assertEqual(result["contents"][0]["mimeType"], "application/json")
            data = json.loads(result["contents"][0]["text"])
            self.assertEqual(data["total"], 1)
            self.assertEqual(data["hosts"][0]["name"], "SEELE")

    def test_mcp_notifications_initialized_no_response(self):
        with patch.object(self.app_module.requests, "get", side_effect=self.fake_requests_get):
            body = {"jsonrpc": "2.0", "method": "notifications/initialized", "params": {}}
            resp = self.client.post("/.well-known/mcp.json", data=json.dumps(body), content_type="application/json")
            self.assertEqual(resp.status_code, 204)


if __name__ == "__main__":
    unittest.main()
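The suite above needs only the standard library plus the Flask test client exposed by the frontend app, so unittest discovery covers it. A minimal runner sketch, assuming it is executed from stacks/network-mcp so that the frontend package is importable (that working directory is an assumption, not part of this change):

# run_frontend_tests.py -- hypothetical helper, not part of this commit.
# Assumes the current directory is stacks/network-mcp so "frontend" imports resolve.
import unittest

if __name__ == "__main__":
    suite = unittest.defaultTestLoader.discover("frontend/tests", pattern="test_*.py")
    unittest.TextTestRunner(verbosity=2).run(suite)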
24
stacks/network-mcp/ilm/network-events-ilm.json
Normal file
@ -0,0 +1,24 @@
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {}
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "forcemerge": {
            "max_num_segments": 1
          }
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
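For context, scripts/bootstrap_indices.py later in this change PUTs this document to /_ilm/policy/network-events-ilm. A hedged sketch of that single call, with placeholder endpoint and credentials (the real script resolves them from the ES_* environment variables):

# Illustrative only; the endpoint path matches bootstrap_indices.py, credentials are placeholders.
import json
import requests

with open("ilm/network-events-ilm.json") as f:
    policy = json.load(f)

resp = requests.put(
    "http://localhost:9200/_ilm/policy/network-events-ilm",
    json=policy,
    auth=("elastic", "changeme"),
)
resp.raise_for_status()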
39
stacks/network-mcp/ilm/network-events-template.json
Normal file
@ -0,0 +1,39 @@
{
  "index_patterns": ["network-events-*"],
  "template": {
    "settings": {
      "index.lifecycle.name": "network-events-ilm"
    },
    "mappings": {
      "properties": {
        "@timestamp": { "type": "date" },
        "host": {
          "properties": {
            "ip": { "type": "ip" },
            "ips": { "type": "ip" },
            "mac": { "type": "keyword" },
            "macs": { "type": "keyword" },
            "id": { "type": "keyword" },
            "name": { "type": "keyword" },
            "hostname": { "type": "keyword" },
            "hostnames": { "type": "keyword" }
          }
        },
        "ports": {
          "properties": {
            "port": { "type": "integer" },
            "proto": { "type": "keyword" },
            "state": { "type": "keyword" },
            "service": {
              "properties": {
                "name": { "type": "keyword" },
                "product": { "type": "keyword" },
                "version": { "type": "keyword" }
              }
            }
          }
        }
      }
    }
  }
}
||||
40
stacks/network-mcp/ilm/network-hosts-template.json
Normal file
@ -0,0 +1,40 @@
{
  "index_patterns": ["network-hosts"],
  "template": {
    "mappings": {
      "properties": {
        "host": {
          "properties": {
            "id": { "type": "keyword" },
            "name": { "type": "keyword" },
            "fqdn": { "type": "keyword" },
            "ips": { "type": "ip" },
            "macs": { "type": "keyword" },
            "first_seen": { "type": "date" },
            "last_seen": { "type": "date" },
            "last_state_change": { "type": "date" },
            "state": { "type": "keyword" },
            "role": { "type": "keyword" },
            "tags": { "type": "keyword" },
            "notes": { "type": "text" }
          }
        },
        "ports": {
          "properties": {
            "port": { "type": "integer" },
            "proto": { "type": "keyword" },
            "state": { "type": "keyword" },
            "first_seen": { "type": "date" },
            "last_seen": { "type": "date" },
            "service": {
              "properties": {
                "name": { "type": "keyword" },
                "product": { "type": "keyword" }
              }
            }
          }
        }
      }
    }
  }
}
280
stacks/network-mcp/inventory_targets.yml
Normal file
@ -0,0 +1,280 @@
inventory_targets:
- name: Blackmoon
  hostname: blackmoon.localdomain
  ip: 192.168.5.1
  notes: Core OpnSense gateway; ping only
- name: Supermicro-BMC
  hostname: 192.168.5.30
  ip: 192.168.5.30
  ports:
  - 22
  - 80
  notes: "Supermicro IPMI (ATEN login portal on 80\u2192443) for rack chassis"
- name: Jet-Alone
  hostname: jet-alone.localdomain
  ip: 192.168.5.31
  ports:
  - 22
  notes: GPU/LLM server
- name: Wille
  hostname: wille.localdomain
  ip: 192.168.5.33
  ports:
  - 22
  - 80
  - 443
  notes: TrueNAS SCALE primary storage (iXsystems /ui interface)
- name: Core
  hostname: core.localdomain
  ip: 192.168.5.34
  ports:
  - 22
  - 80
  - 443
  notes: Production Docker swarm (Traefik, Gitea, Authentik, Immich, etc.)
- name: NERV-III
  hostname: NERV-III
  ip: 192.168.5.35
  ports:
  - 22
  notes: 'Standalone Proxmox host (Fedora CoreOS VMs: container-dev VM110 plus Ramiel
    containers)'
- name: TP-Link-AP-1
  hostname: 192.168.5.36
  ip: 192.168.5.36
  ports:
  - 22
  - 80
  notes: TP-Link EAP/Omada AP web UI (login page on HTTP)
- name: TP-Link-AP-2
  hostname: 192.168.5.39
  ip: 192.168.5.39
  ports:
  - 22
  - 80
  notes: TP-Link EAP/Omada AP web UI (login page on HTTP)
- name: Subspace-Mote-1
  hostname: subspace-mote-1.localdomain
  ip: 192.168.5.41
  ports:
  - 22
  notes: SBC cluster member
- name: BirdNET-GO
  hostname: 192.168.5.71
  ip: 192.168.5.71
  ports:
  - 22
  - 8080
  notes: Armbian (rz3w-02) running birdnet-go container (port 8080)
- name: rz3w-02
  hostname: rz3w-02.localdomain
  ports:
  - 22
  notes: Subspace node with metrics/logging
- name: Arael
  hostname: arael.localdomain
  ip: 192.168.5.44
  ports:
  - 22
  notes: Debian host, purpose TBD
- name: Synology-NAS
  hostname: 192.168.5.45
  ip: 192.168.5.45
  ports:
  - 22
  - 80
  - 443
  - 5000
  notes: Synology DSM primary NAS (HTTP redirect to DSM on 5000/5001)
- name: Docker-Public
  hostname: docker-public.localdomain
  ip: 192.168.5.46
  ports:
  - 22
  notes: Traefik/Docker public host (Traefik on 8080; hosts Invidious, Matomo, FreshRSS,
    etc.)
- name: Frigate
  hostname: frigate.localdomain
  ip: 192.168.5.47
  ports:
  - 22
  - 5000
  notes: NVR VM
- name: HomeAssistant
  hostname: homeassistant.localdomain
  ip: 192.168.5.48
  ports:
  - 22
  - 8123
  notes: Home automation host
- name: Casper
  hostname: casper.localdomain
  ip: 192.168.5.50
  ports:
  - 22
  notes: Logging/Metrics VM
- name: Ramiel
  hostname: ramiel.localdomain
  ip: 192.168.5.51
  ports:
  - 22
  - 6443
  notes: Cluster node
- name: Ramiel-III
  hostname: ramiel-iii.localdomain
  ip: 192.168.5.230
  ports:
  - 22
  notes: Additional Ramiel host
- name: NERV
  hostname: nerv.localdomain
  ip: 192.168.5.203
  ports:
  - 22
  - 8006
  notes: Proxmox host
- name: Magi2
  hostname: magi2.localdomain
  ip: 192.168.5.202
  ports:
  - 22
  - 8006
  notes: Proxmox host (JSON listed as Magi)
- name: JHCI
  hostname: jhci.localdomain
  ip: 192.168.5.201
  ports:
  - 22
  - 8006
  notes: Proxmox host
- name: Balthasar
  hostname: balthasar.localdomain
  ip: 192.168.5.237
  ports:
  - 22
  - 80
  notes: Technitium DNS server (hosts DoH UI)
- name: Unit-00
  hostname: unit-00.localdomain
  ip: 192.168.5.222
  ports:
  - 22
  notes: Client that connects to docker-dev
- name: TrueNAS-Backup
  hostname: ARKII.localdomain
  ip: 192.168.5.32
  ports:
  - 22
  - 80
  - 443
  notes: "TrueNAS SCALE backup NAS (ARKII chassis) \u2013 HTTPS /ui, SSH pending credentials"
- name: Mokerlink-POE
  hostname: 192.168.5.226
  ip: 192.168.5.226
  ports:
  - 80
  notes: Mokerlink POE-2G08110GSM switch (web login only)
- name: EtherNetIP-Controller
  hostname: 192.168.5.17
  ip: 192.168.5.17
  ports:
  - 2222
  notes: CNC/3D printer controller interface
- name: P1S-Printer
  hostname: P1S
  ip: 192.168.5.42
  notes: Bambu Lab P1S (LLMNR responder only; no TCP services)
- name: Container-Dev
  hostname: container-dev
  ip: 192.168.5.236
  ports:
  - 22
  - 5355
  notes: Fedora CoreOS VM (NERV-III VM110) for container dev; only key-based SSH +
    LLMNR
- name: VPS-TransparentProxy-19222713430
  hostname: 192.227.134.30
  ip: 192.227.134.30
  ports:
  - 22
  - 80
  - 443
  notes: Transparent HAProxy node (Debian 10) running haproxy + zerotier-one + telegraf
- name: VPS-TransparentProxy-1071722798
  hostname: 107.172.27.98
  ip: 107.172.27.98
  ports:
  - 22
  - 80
  - 443
  notes: Transparent HAProxy node (Debian 12) running haproxy + tailscale + zerotier-one
    + telegraf/filebeat
- name: VPS-TransparentProxy-10717425061
  hostname: 107.174.250.61
  ip: 107.174.250.61
  ports:
  - 22
  - 80
  - 443
  notes: Transparent HAProxy (Debian 12) with haproxy, docker/containerd, iperf3,
    filebeat, tailscale, zerotier
- name: VPS-Headscale
  hostname: 198.46.218.8
  ip: 198.46.218.8
  ports:
  - 22
  - 80
  - 443
  notes: Headscale coordination server (Ubuntu 20.04) running headscale, HAProxy,
    Uptime Kuma, tailscale, zerotier
- name: VPS-MailInABox
  hostname: 198.23.146.170
  ip: 198.23.146.170
  ports:
  - 22
  - 80
  - 443
  notes: mail.uplink.tel Mail-in-a-Box (Postfix, Dovecot, BIND, NSD, nginx, SpamPD,
    Filebeat, Tailscale)
- name: VPS-FriendServer
  hostname: 172.245.88.186
  ip: 172.245.88.186
  ports:
  - 22
  - 80
  - 443
  notes: '"Friend server managed" (Debian 12) hosting Apache, InspIRCd, MariaDB, Gitea
    (docker), Tor, Tailscale'
- name: VPS-Meow
  hostname: 107.174.64.22
  ip: 107.174.64.22
  ports:
  - 22
  - 80
  - 443
  notes: '"Meow" VPS (Debian 12) running Docker stack: traefik, wg-easy, wordpress/mysql,
    nginx, filebrowser'
- name: VPS-Lukes
  hostname: 23.94.206.75
  ip: 23.94.206.75
  ports:
  - 22
  - 80
  - 443
  notes: "Luke's VPS (Debian 12) \u2013 running Docker (Traefik, Caddy, GoatCounter,\
    \ TTRSS stack, Radicale, filebrowser, ssh-tunnel)"
- name: VPS-Tailscale-Edge
  hostname: 100.64.0.14
  ip: 100.64.0.14
  ports:
  - 22
  - 80
  - 443
  notes: 'Tailscale interface into mail.uplink.tel (Mail-in-a-Box stack: Postfix/Dovecot/BIND/nginx)'
- name: BirdNET-Pi
  hostname: orangepizero2.localdomain
  ip: 192.168.5.18
  ports:
  - 22
  - 80
  notes: Orangepi Zero2 running BirdNET-Pi (Caddy on port 80)
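How the collectors consume this inventory is not shown in this diff, so the following is only a rough illustration (assumes PyYAML is available): the file parses into a list of target dicts whose optional keys (ip, ports, notes) need defensive handling.

# Illustrative reader for inventory_targets.yml; field handling here is an assumption.
import yaml

with open("inventory_targets.yml") as f:
    targets = yaml.safe_load(f)["inventory_targets"]

for target in targets:
    addr = target.get("ip") or target.get("hostname")
    ports = target.get("ports") or []   # e.g. Blackmoon and P1S-Printer define no ports
    print(f"{target['name']}: {addr} ports={ports}")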
77
stacks/network-mcp/scripts/bootstrap_indices.py
Normal file
@ -0,0 +1,77 @@
import os
import sys
import json
import requests
import urllib3

REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from collectors.common.es_auth import resolve_api_key, build_api_key_header

# Suppress insecure request warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def load_json(path):
    with open(path, 'r') as f:
        return json.load(f)

def main():
    es_url = os.getenv("ES_URL", "http://localhost:9200").rstrip('/')
    env_api_id = os.getenv("ES_API_ID")
    env_api_key = os.getenv("ES_API_KEY")
    es_api_id, es_api_key = resolve_api_key(env_api_id, env_api_key)
    es_user = os.getenv("ES_USER", "elastic")
    es_pass = os.getenv("ES_PASS", "changeme")
    verify_ssl = os.getenv("ES_VERIFY_SSL", "true").lower() == "true"

    auth_args = {}
    if es_api_id and es_api_key:
        auth_args["headers"] = {"Authorization": build_api_key_header(es_api_id, es_api_key)}
        print("Using Elasticsearch API key authentication for bootstrap.")
    else:
        auth_args["auth"] = (es_user, es_pass)
        print("Using Elasticsearch basic authentication for bootstrap.")

    print(f"Bootstrapping Elastic at {es_url}...")

    def put(endpoint, data):
        url = f"{es_url}{endpoint}"
        print(f"PUT {url}")
        try:
            resp = requests.put(url, json=data, verify=verify_ssl, **auth_args)
            print(f"Response: {resp.status_code} {resp.text}")
            resp.raise_for_status()
        except Exception as e:
            print(f"Error: {e}")
            # Don't exit, try next

    # 1. ILM Policy
    ilm_path = "ilm/network-events-ilm.json"
    if os.path.exists(ilm_path):
        data = load_json(ilm_path)
        put("/_ilm/policy/network-events-ilm", data)
    else:
        print(f"Missing {ilm_path}")

    # 2. Network Events Template
    tpl_path = "ilm/network-events-template.json"
    if os.path.exists(tpl_path):
        data = load_json(tpl_path)
        put("/_index_template/network-events", data)
    else:
        print(f"Missing {tpl_path}")

    # 3. Network Hosts Template
    tpl_path = "ilm/network-hosts-template.json"
    if os.path.exists(tpl_path):
        data = load_json(tpl_path)
        put("/_index_template/network-hosts", data)
    else:
        print(f"Missing {tpl_path}")

    print("Bootstrap complete.")

if __name__ == "__main__":
    main()
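A hedged usage sketch for the bootstrap script: the ES_* variable names come from the code above, the values below are placeholders, and the working directory is assumed to be stacks/network-mcp so the relative ilm/ paths resolve.

# Illustrative invocation only; adjust URL and credentials for the real cluster.
import os
import subprocess

env = dict(os.environ,
           ES_URL="https://localhost:9200",
           ES_USER="elastic",
           ES_PASS="changeme",
           ES_VERIFY_SSL="false")
subprocess.run(["python", "scripts/bootstrap_indices.py"],
               check=True, env=env, cwd="stacks/network-mcp")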
9
stacks/network-mcp/static/host_metadata.json
Normal file
@ -0,0 +1,9 @@
{
  "mac:aa:bb:cc:dd:ee:ff": {
    "role": "router",
    "owner": "admin",
    "location": "server-room",
    "tags": ["critical", "gateway"],
    "notes": "Main gateway"
  }
}
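The metadata is keyed by the same mac:<address> host id format used in the network-hosts test documents above; how the frontend merges it is not part of this diff, so the lookup below is only a sketch.

# Illustrative lookup by host id; the actual merge logic in the app may differ.
import json

with open("static/host_metadata.json") as f:
    metadata = json.load(f)

host_id = "mac:aa:bb:cc:dd:ee:ff"
extra = metadata.get(host_id, {})
print(extra.get("role"), extra.get("tags"))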
20
stacks/obby/docker-compose.yml
Normal file
@ -0,0 +1,20 @@
---
services:
  obsidian:
    image: lscr.io/linuxserver/obsidian:latest
    container_name: obsidian
    security_opt:
      - seccomp:unconfined #optional
    environment:
      - PUID=1000
      - PGID=1000
      - TZ=Etc/UTC
    volumes:
      - ./config:/config
    ports:
      - 3002:3000
      - 3003:3001
    devices:
      - /dev/dri:/dev/dri #optional
    shm_size: "1gb"
    restart: unless-stopped
15
stacks/snowflake/docker-compose.yml
Normal file
@ -0,0 +1,15 @@
services:
  snowflake-proxy:
    network_mode: host
    image: containers.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake:latest
    container_name: snowflake-proxy
    restart: unless-stopped
    # For a full list of Snowflake Proxy CLI parameters see
    # https://gitlab.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/-/tree/main/proxy?ref_type=heads#running-a-standalone-snowflake-proxy
    #command: [ "-ephemeral-ports-range", "30000:60000" ]
  watchtower:
    image: containrrr/watchtower
    container_name: watchtower
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    command: snowflake-proxy
8
stacks/szurubooru/.env.template
Normal file
@ -0,0 +1,8 @@
POSTGRES_USER=szuru
POSTGRES_PASSWORD=change_me
BUILD_INFO=local-dev
PORT=8080
THREADS=4
BASE_URL=/
MOUNT_DATA=./volumes/data
MOUNT_SQL=./volumes/postgres
46
stacks/szurubooru/docker-compose.yml
Normal file
@ -0,0 +1,46 @@
## Example Docker Compose configuration
##
## Use this as a template to set up docker compose, or as a guide to set up other
## orchestration services
services:

  server:
    image: szurubooru/server:latest
    depends_on:
      - sql
    environment:
      ## These should be the names of the dependent containers listed below,
      ## or FQDNs/IP addresses if these services are running outside of Docker
      POSTGRES_HOST: sql
      ## Credentials for database:
      POSTGRES_USER:
      POSTGRES_PASSWORD:
      ## Commented values are defaults:
      #POSTGRES_DB: defaults to same as POSTGRES_USER
      #POSTGRES_PORT: 5432
      #LOG_SQL: 0 (1 for verbose SQL logs)
      THREADS:
    volumes:
      - "${MOUNT_DATA}:/data"
      - "./server/config.yaml:/opt/app/config.yaml"

  client:
    image: szurubooru/client:latest
    depends_on:
      - server
    environment:
      BACKEND_HOST: server
      BASE_URL:
    volumes:
      - "${MOUNT_DATA}:/data:ro"
    ports:
      - "${PORT}:80"

  sql:
    image: postgres:11-alpine
    restart: unless-stopped
    environment:
      POSTGRES_USER:
      POSTGRES_PASSWORD:
    volumes:
      - "${MOUNT_SQL}:/var/lib/postgresql/data"
3
stacks/szurubooru/server/config.yaml
Normal file
@ -0,0 +1,3 @@
name: Hyrax Hub
domain: http://localhost:8080
secret: "CHANGE_ME"