Compare commits

...

10 Commits

20 changed files with 2089 additions and 998 deletions

View File

@ -9,3 +9,5 @@ node_modules
data
videos
*.log
feed-master-config/var
feed-master-config/images

5
.gitignore vendored
View File

@ -33,6 +33,7 @@ env/
# IDE
.vscode/
.idea/
.gemini/
*.swp
*.swo
*~
@ -51,6 +52,10 @@ Thumbs.db
# Logs
*.log
# Feed Master runtime cache
feed-master-config/var/
feed-master-config/images/
# Testing
.pytest_cache/
.coverage

87
Makefile Normal file
View File

@ -0,0 +1,87 @@
# Makefile for TLC Search + Feed Master
# Every target in this file is a command, not a file to build. Declaring all of
# them phony keeps a stray file named e.g. "logs-feed" from disabling the target.
.PHONY: help config up down restart logs logs-feed logs-bridge logs-app \
        status restart-feed update-channels
# Print the list of available management commands (one output line per argument).
help:
	@printf '%s\n' \
		"TLC Search + Feed Master - Management Commands" \
		"" \
		"Configuration:" \
		" make config - Regenerate feed-master configuration from channels.yml" \
		"" \
		"Service Management:" \
		" make up - Start all services" \
		" make down - Stop all services" \
		" make restart - Restart all services" \
		" make logs - View all service logs" \
		" make status - Check service status" \
		"" \
		"Updates:" \
		" make update-channels - Regenerate config and restart feed-master" \
		"" \
		"Individual Services:" \
		" make logs-feed - View feed-master logs" \
		" make logs-bridge - View rss-bridge logs" \
		" make logs-app - View TLC Search logs" \
		" make restart-feed - Restart feed-master only"
# Generate feed-master configuration from channels.yml
# Runs the python_app.generate_feed_config_simple module with python3.
# It does not restart anything; use `make update-channels` to also restart feed-master.
config:
	@echo "Generating feed-master configuration..."
	python3 -m python_app.generate_feed_config_simple
	@echo "Configuration updated!"
# Start all services
# `-d` runs the containers detached; the echoed URLs are the host-side endpoints.
up:
	docker compose up -d
	@echo ""
	@echo "Services started!"
	@echo " - RSS Bridge: http://localhost:3001"
	@echo " - Feed Master: http://localhost:8097/rss/youtube-unified"
	@echo " - TLC Search: http://localhost:8080"
# Stop all services
down:
	docker compose down
# Restart all services
restart:
	docker compose restart
# View all logs (-f keeps following the output until interrupted)
logs:
	docker compose logs -f
# View feed-master logs
logs-feed:
	docker compose logs -f feed-master
# View rss-bridge logs
logs-bridge:
	docker compose logs -f rss-bridge
# View TLC Search logs (the compose service is named "app")
logs-app:
	docker compose logs -f app
# Check service status
# Lists the compose containers, then prints the host-side endpoints for reference.
status:
	@docker compose ps
	@echo ""
	@echo "Endpoints:"
	@echo " - RSS Bridge: http://localhost:3001"
	@echo " - Feed Master: http://localhost:8097/rss/youtube-unified"
	@echo " - TLC Search: http://localhost:8080"
# Restart only feed-master (the other services keep running)
restart-feed:
	docker compose restart feed-master
# Regenerate the feed-master configuration and restart feed-master.
# Nothing is pulled or fetched here: edit/update channels.yml first, then run this.
update-channels:
	@echo "Regenerating feed-master configuration..."
	python3 -m python_app.generate_feed_config_simple
	@echo ""
	@echo "Restarting feed-master..."
	docker compose restart feed-master
	@echo ""
	@echo "Update complete!"

209
README-FEED-MASTER.md Normal file
View File

@ -0,0 +1,209 @@
# TLC Search + Feed Master Integration
This directory contains an integrated setup combining:
- **TLC Search**: Flask app for searching YouTube transcripts (Elasticsearch/Qdrant)
- **Feed Master**: RSS aggregator for YouTube channels
- **RSS Bridge**: Converts YouTube channels to RSS feeds
All services share a single source of truth for YouTube channels: `channels.yml` and the adjacent
`urls.txt` in this repository.
## Architecture
```
┌─────────────────────┐
│ channels.yml │ Source of truth (this repo)
│ (python_app repo) │
└──────────┬──────────┘
├─────────────────────────────┬────────────────────────┐
│ │ │
v v v
┌──────────────┐ ┌──────────────┐ ┌─────────────────┐
│ TLC Search │ │ RSS Bridge │ │ Feed Master │
│ (Flask App) │ │ (Port 3001) │───────>│ (Port 8097) │
│ Port 8080 │ └──────────────┘ └─────────────────┘
│ │ │
│ Elasticsearch│ │
│ Qdrant │ │
└──────────────┘ │
v
http://localhost:8097/rss/youtube-unified
```
## Services
### 1. TLC Search (Port 8080)
- Indexes and searches YouTube transcripts
- Uses Elasticsearch for metadata and Qdrant for vector search
- Connects to remote Elasticsearch/Qdrant instances
### 2. RSS Bridge (Port 3001)
- Converts YouTube channels to RSS feeds
- Supports both channel IDs and @handles
- Used by Feed Master to aggregate feeds
### 3. Feed Master (Port 8097)
- Aggregates all YouTube channel RSS feeds into one unified feed
- Updates every 5 minutes
- Keeps the most recent 200 items from all channels
## Setup
### Prerequisites
- Docker and Docker Compose
- Python 3.x
### Configuration
1. **Environment Variables**: Create `.env` file with:
```bash
# Elasticsearch
ELASTIC_URL=https://your-elasticsearch-url
ELASTIC_INDEX=this_little_corner_py
ELASTIC_USERNAME=your_username
ELASTIC_PASSWORD=your_password
# Qdrant
QDRANT_URL=https://your-qdrant-url
QDRANT_COLLECTION=tlc-captions-full
# Optional UI links
RSS_FEED_URL=/rss/youtube-unified
CHANNELS_PATH=/app/python_app/channels.yml
RSS_FEED_UPSTREAM=http://feed-master:8080
```
2. **Generate Feed Configuration**:
```bash
# Regenerate feed-master config from the channels list
python3 -m python_app.generate_feed_config_simple
```
This reads `channels.yml` and generates `feed-master-config/fm.yml`.
### Starting Services
```bash
# Start all services
docker compose up -d
# View logs
docker compose logs -f
# View specific service logs
docker compose logs -f feed-master
docker compose logs -f rss-bridge
docker compose logs -f app
```
### Stopping Services
```bash
# Stop all services
docker compose down
# Stop specific service
docker compose stop feed-master
```
## Usage
### Unified RSS Feed
Access the aggregated feed through the TLC app (recommended):
- **URL**: http://localhost:8080/rss
- **Format**: RSS/Atom XML
- **Behavior**: Filters RSS-Bridge error items and prefixes titles with channel name
- **Updates**: Every 5 minutes (feed-master schedule)
- **Items**: Most recent 200 items across all channels
Direct feed-master access still works:
- **URL**: http://localhost:8097/rss/youtube-unified
### TLC Search
Access the search interface at:
- **URL**: http://localhost:8080
### Channel List Endpoints
- **Plain text list**: http://localhost:8080/channels.txt
- **JSON metadata**: http://localhost:8080/api/channel-list
### RSS Bridge
Access individual channel feeds or the web interface at:
- **URL**: http://localhost:3001
## Updating Channel List
When channels are added/removed from `channels.yml`:
```bash
# 1. Regenerate feed configuration
cd /path/to/this/repo  # wherever the repository is checked out, e.g. /var/core/this-little-corner/src/python_app
python3 -m python_app.generate_feed_config_simple
# 2. Restart feed-master to pick up changes
docker compose restart feed-master
```
## File Structure
```
python_app/
├── docker-compose.yml # All services configuration
├── channels.yml # Canonical YouTube channel list
├── urls.txt # URL list kept in sync with channels.yml
├── generate_feed_config_simple.py # Config generator script (run via python -m)
├── feed-master-config/
│ ├── fm.yml # Feed Master configuration (auto-generated)
│ ├── var/ # Feed Master database
│ └── images/ # Cached images
├── data/ # TLC Search data (read-only)
└── README-FEED-MASTER.md # This file
```
## Troubleshooting
### Feed Master not updating
```bash
# Check if RSS Bridge is accessible
curl http://localhost:3001
# Restart both services in order
docker compose restart rss-bridge
sleep 10
docker compose restart feed-master
```
### Configuration issues
```bash
# Regenerate configuration
python3 -m python_app.generate_feed_config_simple
# Inspect the generated YAML (cat only prints the file; it does not validate it)
cat feed-master-config/fm.yml
# Restart feed-master
docker compose restart feed-master
```
### View feed-master logs
```bash
docker compose logs -f feed-master | grep -E "(ERROR|WARN|youtube)"
```
## Integration Notes
- **Single Source of Truth**: All channel URLs come from `channels.yml` and `urls.txt` in this repo
- **Automatic Regeneration**: Run `python3 -m python_app.generate_feed_config_simple` when `channels.yml` changes
- **No Manual Editing**: Don't edit `fm.yml` directly - regenerate it from the script
- **Handle Support**: Supports both `/channel/ID` and `/@handle` URL formats
- **Shared Channels**: Same channels used for transcript indexing (TLC Search) and RSS aggregation (Feed Master)
- **Skip Broken RSS**: Set `rss: false` in `channels.yml` to exclude a channel from RSS aggregation
## Future Enhancements
- [ ] Automated config regeneration on git pull
- [ ] Channel name lookup from YouTube API
- [ ] Integration with TLC Search for unified UI
- [ ] Webhook notifications for new videos
- [ ] OPML export for other RSS readers

View File

@ -102,6 +102,9 @@ Other tunables (defaults shown in compose):
- `ELASTIC_VERIFY_CERTS` (set to `1` for real TLS verification)
- `QDRANT_COLLECTION` (default `tlc-captions-full`)
- `QDRANT_VECTOR_NAME` / `QDRANT_VECTOR_SIZE` / `QDRANT_EMBED_MODEL`
- `RATE_LIMIT_ENABLED` (default `1`)
- `RATE_LIMIT_REQUESTS` (default `60`)
- `RATE_LIMIT_WINDOW_SECONDS` (default `60`)
Port 8080 on the host is forwarded to the app. Mount `./data` (read-only) if you want local fallbacks for metrics (`LOCAL_DATA_DIR=/app/data/video_metadata`); otherwise the app will rely purely on the remote backends. Stop the container with `docker compose down`.

162
channel_config.py Normal file
View File

@ -0,0 +1,162 @@
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
# Matches a YouTube channel URL of the form youtube.com/channel/<id>; the scheme
# and "www." prefix are optional. Group 1 is the channel ID, i.e. everything up
# to the next "/", "?" or "#".
_CHANNEL_ID_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/channel/([^/?#]+)")
# Matches a YouTube handle URL of the form youtube.com/@<handle>; the scheme and
# "www." prefix are optional. Group 1 is the handle without the leading "@".
_HANDLE_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/@([^/?#]+)")
def _strip_quotes(value: str) -> str:
if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
return value[1:-1]
return value
def _parse_yaml_channels(text: str) -> List[Dict[str, str]]:
    """Parse the flat ``channels:`` list out of a simple YAML document.

    This is a minimal line-based reader, not a general YAML parser: every
    line starting with ``- `` opens a new channel mapping and each
    ``key: value`` line is stored on the mapping currently being built.
    Blank lines, ``#`` comment lines and the ``channels:`` header are ignored,
    as are lines without a colon. Values have one pair of surrounding quotes
    removed.

    Returns:
        The non-empty channel mappings in document order.
    """
    parsed: List[Dict[str, str]] = []
    entry: Dict[str, str] = {}

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#") or stripped == "channels:":
            continue

        if stripped.startswith("- "):
            # A list marker finishes the previous mapping (if any) and starts a new one.
            if entry:
                parsed.append(entry)
            entry = {}
            stripped = stripped[2:].strip()

        # Split on the first colon only so URL values keep their "https://".
        key, separator, value = stripped.partition(":")
        if not separator:
            continue
        entry[key.strip()] = _strip_quotes(value.strip())

    if entry:
        parsed.append(entry)
    return parsed
def _extract_from_url(url: str) -> Dict[str, Optional[str]]:
    """Pull the channel ID and/or handle out of a YouTube URL.

    Returns:
        A dict with keys ``"id"`` and ``"handle"``; each value is the text
        captured by the corresponding module-level pattern, or ``None`` when
        that pattern does not match ``url``.
    """
    id_match = _CHANNEL_ID_PATTERN.search(url)
    handle_match = _HANDLE_PATTERN.search(url)
    return {
        "id": id_match.group(1) if id_match else None,
        "handle": handle_match.group(1) if handle_match else None,
    }
def _normalize_handle(handle: Optional[str]) -> Optional[str]:
if not handle:
return None
return handle.lstrip("@").strip() or None
def _parse_bool(value: Optional[object]) -> Optional[bool]:
if isinstance(value, bool):
return value
if value is None:
return None
text = str(value).strip().lower()
if text in {"1", "true", "yes", "y"}:
return True
if text in {"0", "false", "no", "n"}:
return False
return None
def _normalize_entry(entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert one raw channel mapping into the canonical entry shape.

    Accepted input keys (aliases in parentheses): ``id`` (``channel_id``),
    ``handle`` (``username``), ``url``, ``name`` and the RSS flag
    ``rss_enabled`` (``rss``, ``include_in_feed``).

    Missing pieces are derived where possible: the ID/handle are extracted
    from ``url``, the URL is built from the ID or handle, and the name falls
    back to the handle or the ID.

    Args:
        entry: Raw key/value pairs for a single channel.

    Returns:
        A dict with keys ``id``, ``handle``, ``name``, ``url`` (strings; ``id``
        and ``handle`` may be empty) and ``rss_enabled`` (bool, defaulting to
        ``True``), or ``None`` when neither a name nor a URL can be determined.
    """
    channel_id = entry.get("id") or entry.get("channel_id")
    handle = _normalize_handle(entry.get("handle") or entry.get("username"))
    url = entry.get("url")
    name = entry.get("name")

    # Use the first alias that yields a definite boolean. Chaining the lookups
    # with ``or`` would skip an explicit ``False`` (e.g. JSON ``"rss": false``)
    # and leave the channel enabled, which is the opposite of what was asked.
    rss_flag: Optional[bool] = None
    for flag_key in ("rss_enabled", "rss", "include_in_feed"):
        rss_flag = _parse_bool(entry.get(flag_key))
        if rss_flag is not None:
            break

    if url:
        # Fill in whichever identifier the entry did not state explicitly.
        extracted = _extract_from_url(url)
        channel_id = channel_id or extracted.get("id")
        handle = handle or extracted.get("handle")
    elif channel_id:
        url = f"https://www.youtube.com/channel/{channel_id}"
    elif handle:
        url = f"https://www.youtube.com/@{handle}"

    if not name:
        name = handle or channel_id
    if not name or not url:
        return None

    return {
        "id": channel_id or "",
        "handle": handle or "",
        "name": name,
        "url": url,
        "rss_enabled": True if rss_flag is None else rss_flag,
    }
def load_channel_entries(path: Path) -> List[Dict[str, str]]:
    """Load, normalise and sort the channel list stored at ``path``.

    Files with a ``.json`` suffix are parsed as JSON (either a bare list or an
    object with a ``channels`` list); every other file is read with the
    built-in minimal YAML reader. Non-dict items are skipped, ``None`` values
    are dropped, booleans are kept as-is and every other value is converted
    to a trimmed string before normalisation.

    Args:
        path: Location of the channel list file.

    Returns:
        Normalised entries sorted case-insensitively by ``name``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() == ".json":
        payload = json.loads(text)
        raw_entries = payload.get("channels", []) if isinstance(payload, dict) else payload
    else:
        raw_entries = _parse_yaml_channels(text)

    def _clean(value: Any) -> Any:
        # Booleans must survive untouched so flag parsing sees the real type.
        return value if isinstance(value, bool) else str(value).strip()

    collected: List[Dict[str, str]] = []
    for raw in raw_entries:
        if not isinstance(raw, dict):
            continue
        cleaned: Dict[str, Any] = {
            str(key).strip(): _clean(value)
            for key, value in raw.items()
            if value is not None
        }
        normalized = _normalize_entry(cleaned)
        if normalized:
            collected.append(normalized)

    return sorted(collected, key=lambda item: item["name"].lower())
def build_rss_bridge_url(entry: Dict[str, str], rss_bridge_host: str = "rss-bridge") -> Optional[str]:
    """Build the RSS-Bridge ``YoutubeBridge`` feed URL for a channel entry.

    The channel ID is preferred; the handle is only used when no ID is set.
    The identifier is percent-encoded so characters that are special inside a
    query string (``&``, ``=``, ``#``, spaces, non-ASCII handles) cannot break
    or alter the generated URL. Plain IDs and handles are unaffected.

    Args:
        entry: Normalised channel entry; the ``id`` and ``handle`` keys are read.
        rss_bridge_host: Host (optionally ``host:port``) of the RSS-Bridge service.

    Returns:
        The feed URL in ``Mrss`` format, or ``None`` when the entry has
        neither an ID nor a usable handle.
    """
    from urllib.parse import quote

    base = f"http://{rss_bridge_host}/?action=display&bridge=YoutubeBridge"

    channel_id = entry.get("id") or ""
    if channel_id:
        return f"{base}&context=By+channel+id&c={quote(channel_id, safe='')}&format=Mrss"

    handle = _normalize_handle(entry.get("handle"))
    if handle:
        return f"{base}&context=By+username&u={quote(handle, safe='')}&format=Mrss"

    return None

258
channels.yml Normal file
View File

@ -0,0 +1,258 @@
# Shared YouTube Channel Configuration
# Used by both TLC Search (transcript collection) and Feed Master (RSS aggregation)
channels:
- id: UCCebR16tXbv5Ykk9_WtCCug
name: Channel UCCebR16tXbv
url: https://www.youtube.com/channel/UCCebR16tXbv5Ykk9_WtCCug/videos
- id: UC6vg0HkKKlgsWk-3HfV-vnw
name: A Quality Existence
url: https://www.youtube.com/channel/UC6vg0HkKKlgsWk-3HfV-vnw/videos
- id: UCeWWxwzgLYUbfjWowXhVdYw
name: Andrea with the Bangs
url: https://www.youtube.com/channel/UCeWWxwzgLYUbfjWowXhVdYw/videos
- id: UC952hDf_C4nYJdqwK7VzTxA
name: Charlie's Little Corner
url: https://www.youtube.com/channel/UC952hDf_C4nYJdqwK7VzTxA/videos
- id: UCU5SNBfTo4umhjYz6M0Jsmg
name: Christian Baxter
url: https://www.youtube.com/channel/UCU5SNBfTo4umhjYz6M0Jsmg/videos
- id: UC6Tvr9mBXNaAxLGRA_sUSRA
name: Finding Ideas
url: https://www.youtube.com/channel/UC6Tvr9mBXNaAxLGRA_sUSRA/videos
- id: UC4Rmxg7saTfwIpvq3QEzylQ
name: Ein Sof - Infinite Reflections
url: https://www.youtube.com/channel/UC4Rmxg7saTfwIpvq3QEzylQ/videos
- id: UCTdH4nh6JTcfKUAWvmnPoIQ
name: Eric Seitz
url: https://www.youtube.com/channel/UCTdH4nh6JTcfKUAWvmnPoIQ/videos
- id: UCsi_x8c12NW9FR7LL01QXKA
name: Grail Country
url: https://www.youtube.com/channel/UCsi_x8c12NW9FR7LL01QXKA/videos
- id: UCAqTQ5yLHHH44XWwWXLkvHQ
name: Grizwald Grim
url: https://www.youtube.com/channel/UCAqTQ5yLHHH44XWwWXLkvHQ/videos
- id: UCprytROeCztMOMe8plyJRMg
name: faturechi
url: https://www.youtube.com/channel/UCprytROeCztMOMe8plyJRMg/videos
- id: UCpqDUjTsof-kTNpnyWper_Q
name: John Vervaeke
url: https://www.youtube.com/channel/UCpqDUjTsof-kTNpnyWper_Q/videos
- id: UCL_f53ZEJxp8TtlOkHwMV9Q
name: Jordan B Peterson
url: https://www.youtube.com/channel/UCL_f53ZEJxp8TtlOkHwMV9Q/videos
- id: UCez1fzMRGctojfis2lfRYug
name: Lucas Vos
url: https://www.youtube.com/channel/UCez1fzMRGctojfis2lfRYug/videos
- id: UC2leFZRD0ZlQDQxpR2Zd8oA
name: Mary Kochan
url: https://www.youtube.com/channel/UC2leFZRD0ZlQDQxpR2Zd8oA/videos
- id: UC8SErJkYnDsYGh1HxoZkl-g
name: Sartori Studios
url: https://www.youtube.com/channel/UC8SErJkYnDsYGh1HxoZkl-g/videos
- id: UCEPOn4cgvrrerg_-q_Ygw1A
name: More Christ
url: https://www.youtube.com/channel/UCEPOn4cgvrrerg_-q_Ygw1A/videos
- id: UC2yCyOMUeem-cYwliC-tLJg
name: Paul Anleitner
url: https://www.youtube.com/channel/UC2yCyOMUeem-cYwliC-tLJg/videos
- id: UCGsDIP_K6J6VSTqlq-9IPlg
name: Paul VanderKlay
url: https://www.youtube.com/channel/UCGsDIP_K6J6VSTqlq-9IPlg/videos
- id: UCEzWTLDYmL8soRdQec9Fsjw
name: Randos United
url: https://www.youtube.com/channel/UCEzWTLDYmL8soRdQec9Fsjw/videos
- id: UC1KgNsMdRoIA_njVmaDdHgA
name: Randos United 2
url: https://www.youtube.com/channel/UC1KgNsMdRoIA_njVmaDdHgA/videos
- id: UCFQ6Gptuq-sLflbJ4YY3Umw
name: Rebel Wisdom
url: https://www.youtube.com/channel/UCFQ6Gptuq-sLflbJ4YY3Umw/videos
- id: UCEY1vGNBPsC3dCatZyK3Jkw
name: Strange Theology
url: https://www.youtube.com/channel/UCEY1vGNBPsC3dCatZyK3Jkw/videos
- id: UCIAtCuzdvgNJvSYILnHtdWA
name: The Anadromist
url: https://www.youtube.com/channel/UCIAtCuzdvgNJvSYILnHtdWA/videos
- id: UClIDP7_Kzv_7tDQjTv9EhrA
name: The Chris Show
url: https://www.youtube.com/channel/UClIDP7_Kzv_7tDQjTv9EhrA/videos
- id: UC-QiBn6GsM3JZJAeAQpaGAA
name: TheCommonToad
url: https://www.youtube.com/channel/UC-QiBn6GsM3JZJAeAQpaGAA/videos
- id: UCiJmdXTb76i8eIPXdJyf8ZQ
name: Channel UCiJmdXTb76i
url: https://www.youtube.com/channel/UCiJmdXTb76i8eIPXdJyf8ZQ/videos
- id: UCM9Z05vuQhMEwsV03u6DrLA
name: Cassidy van der Kamp
url: https://www.youtube.com/channel/UCM9Z05vuQhMEwsV03u6DrLA/videos
- id: UCgp_r6WlBwDSJrP43Mz07GQ
name: The Meaning Code
url: https://www.youtube.com/channel/UCgp_r6WlBwDSJrP43Mz07GQ/videos
- id: UC5uv-BxzCrN93B_5qbOdRWw
name: TheScrollersPodcast
url: https://www.youtube.com/channel/UC5uv-BxzCrN93B_5qbOdRWw/videos
- id: UCtCTSf3UwRU14nYWr_xm-dQ
name: Jonathan Pageau
url: https://www.youtube.com/channel/UCtCTSf3UwRU14nYWr_xm-dQ/videos
- id: UC1a4VtU_SMSfdRiwMJR33YQ
name: The Young Levite
url: https://www.youtube.com/channel/UC1a4VtU_SMSfdRiwMJR33YQ/videos
- id: UCg7Ed0lecvko58ibuX1XHng
name: Transfigured
url: https://www.youtube.com/channel/UCg7Ed0lecvko58ibuX1XHng/videos
- id: UCMVG5eqpYFVEB-a9IqAOuHA
name: President Foxman
url: https://www.youtube.com/channel/UCMVG5eqpYFVEB-a9IqAOuHA/videos
- id: UC8mJqpS_EBbMcyuzZDF0TEw
name: Neal Daedalus
url: https://www.youtube.com/channel/UC8mJqpS_EBbMcyuzZDF0TEw/videos
- id: UCGHuURJ1XFHzPSeokf6510A
name: Aphrael Pilotson
url: https://www.youtube.com/channel/UCGHuURJ1XFHzPSeokf6510A/videos
- id: UC704NVL2DyzYg3rMU9r1f7A
handle: chrishoward8473
name: Chris Howard
url: https://www.youtube.com/@chrishoward8473/videos
- id: UChptV-kf8lnncGh7DA2m8Pw
name: Shoulder Serf
url: https://www.youtube.com/channel/UChptV-kf8lnncGh7DA2m8Pw/videos
- id: UCzX6R3ZLQh5Zma_5AsPcqPA
name: Restoring Meaning
url: https://www.youtube.com/channel/UCzX6R3ZLQh5Zma_5AsPcqPA/videos
- id: UCiukuaNd_qzRDTW9qe2OC1w
name: Kale Zelden
url: https://www.youtube.com/channel/UCiukuaNd_qzRDTW9qe2OC1w/videos
- id: UC5yLuFQCms4nb9K2bGQLqIw
name: Ron Copperman
url: https://www.youtube.com/channel/UC5yLuFQCms4nb9K2bGQLqIw/videos
- id: UCVdSgEf9bLXFMBGSMhn7x4Q
name: Mark D Parker
url: https://www.youtube.com/channel/UCVdSgEf9bLXFMBGSMhn7x4Q/videos
- id: UC_dnk5D4tFCRYCrKIcQlcfw
name: Luke Thompson
url: https://www.youtube.com/channel/UC_dnk5D4tFCRYCrKIcQlcfw/videos
- id: UCT8Lq3ufaGEnCSS8WpFatqw
handle: Freerilian
name: Free Rilian
url: https://www.youtube.com/@Freerilian/videos
- id: UC977g6oGYIJDQnsZOGjQBBA
handle: marks.-ry7bm
name: Mark S
url: https://www.youtube.com/@marks.-ry7bm/videos
- id: UCbD1Pm0TOcRK2zaCrwgcTTg
handle: Adams-Fall
name: Adams Fall
url: https://www.youtube.com/@Adams-Fall/videos
- id: UCnojyPW0IgLWTQ0SaDQ1KBA
handle: mcmosav
name: mcmosav
url: https://www.youtube.com/@mcmosav/videos
- id: UCiOZYvBGHw1Y6wyzffwEp9g
handle: Landbeorht
name: Joseph Lambrecht
url: https://www.youtube.com/@Landbeorht/videos
- id: UCAXyF_HFeMgwS8nkGVeroAA
handle: Corner_Citizen
name: Corner Citizen
url: https://www.youtube.com/@Corner_Citizen/videos
- id: UCv2Qft5mZrmA9XAwnl9PU-g
handle: ethan.caughey
name: Ethan Caughey
url: https://www.youtube.com/@ethan.caughey/videos
- id: UCMJCtS8jKouJ2d8UIYzW3vg
handle: MarcInTbilisi
name: Marc Jackson
url: https://www.youtube.com/@MarcInTbilisi/videos
- id: UCk9O91WwruXmgu1NQrKZZEw
handle: climbingmt.sophia
name: Climbing Mt Sophia
url: https://www.youtube.com/@climbingmt.sophia/videos
- id: UCUSyTPWW4JaG1YfUPddw47Q
handle: Skankenstein
name: Skankenstein
url: https://www.youtube.com/@Skankenstein/videos
- id: UCzw2FNI3IRphcAoVcUENOgQ
handle: UpCycleClub
name: UpCycleClub
url: https://www.youtube.com/@UpCycleClub/videos
- id: UCQ7rVoApmYIpcmU7fB9RPyw
handle: JessPurviance
name: Jesspurviance
url: https://www.youtube.com/@JessPurviance/videos
- id: UCrZyTWGMdRM9_P26RKPvh3A
handle: greyhamilton52
name: Grey Hamilton
url: https://www.youtube.com/@greyhamilton52/videos
- id: UCDCfI162vhPvwdxW6X4nmiw
handle: paulrenenichols
name: Paul Rene Nichols
url: https://www.youtube.com/@paulrenenichols/videos
- id: UCFLovlJ8RFApfjrf2y157xg
handle: OfficialSecularKoranism
name: Secular Koranism
url: https://www.youtube.com/@OfficialSecularKoranism/videos
- id: UC_-YQbnPfBbIezMr1adZZiQ
handle: FromWhomAllBlessingsFlow
name: From Whom All Blessings Flow
url: https://www.youtube.com/@FromWhomAllBlessingsFlow/videos
- id: UCn5mf-fcpBmkepIpZ8eFRng
handle: FoodTruckEmily
name: Emily Rajeh
url: https://www.youtube.com/@FoodTruckEmily/videos
- id: UC6zHDj4D323xJkblnPTvY3Q
handle: O.G.Rose.Michelle.and.Daniel
name: OG Rose
url: https://www.youtube.com/@O.G.Rose.Michelle.and.Daniel/videos
- id: UC4GiA5Hnwy415uVRymxPK-w
handle: JonathanDumeer
name: Jonathan Dumeer
url: https://www.youtube.com/@JonathanDumeer/videos
- id: UCMzT-mdCqoyEv_-YZVtE7MQ
handle: JordanGreenhall
name: Jordan Hall
url: https://www.youtube.com/@JordanGreenhall/videos
- id: UC5goUoFM4LPim4eY4pwRXYw
handle: NechamaGluck
name: Nechama Gluck
url: https://www.youtube.com/@NechamaGluck/videos
- id: UCPUVeoQYyq8cndWwyczX6RA
handle: justinsmorningcoffee
name: Justinsmorningcoffee
url: https://www.youtube.com/@justinsmorningcoffee/videos
- id: UCB0C8DEIQlQzvSGuGriBxtA
handle: grahampardun
name: Grahampardun
url: https://www.youtube.com/@grahampardun/videos
- id: UCpLJJLVB_7v4Igq-9arja1A
handle: michaelmartin8681
name: Michaelmartin8681
url: https://www.youtube.com/@michaelmartin8681/videos
- id: UCxV18lwwh29DiWuooz7UCvg
handle: davidbusuttil9086
name: Davidbusuttil9086
url: https://www.youtube.com/@davidbusuttil9086/videos
- id: UCosBhpwwGh_ueYq4ZSi5dGw
handle: matthewparlato5626
name: Matthewparlato5626
url: https://www.youtube.com/@matthewparlato5626/videos
- id: UCwF5LWNOFou_50bT65bq4Bg
handle: lancecleaver227
name: Lancecleaver227
url: https://www.youtube.com/@lancecleaver227/videos
- id: UCaJ0CqiiMSTq4X0rycUOIjw
handle: theplebistocrat
name: the plebistocrat
url: https://www.youtube.com/@theplebistocrat/videos
- id: UCZA5mUAyYcCL1kYgxbeMNrA
handle: RightInChrist
name: Rightinchrist
url: https://www.youtube.com/@RightInChrist/videos
- id: UCDIPXp88qjAV3TiaR5Uo3iQ
handle: RafeKelley
name: Rafekelley
url: https://www.youtube.com/@RafeKelley/videos
- id: UCedgru6YCto3zyXjlbuQuqA
handle: WavesOfObsession
name: Wavesofobsession
url: https://www.youtube.com/@WavesOfObsession/videos

View File

@ -6,7 +6,13 @@ Environment Variables:
ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials.
ELASTIC_INDEX: Target index name (default: this_little_corner_py).
LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata).
CHANNELS_PATH: Path to the canonical channel list (default: ./channels.yml).
RSS_FEED_URL: Public URL/path for the unified RSS feed (default: /rss/youtube-unified).
RSS_FEED_UPSTREAM: Base URL to proxy feed requests (default: http://localhost:8097).
YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube.
RATE_LIMIT_ENABLED: Toggle API rate limiting (default: 1).
RATE_LIMIT_REQUESTS: Max requests per window per client (default: 60).
RATE_LIMIT_WINDOW_SECONDS: Window size in seconds (default: 60).
"""
from __future__ import annotations
@ -53,16 +59,27 @@ class YoutubeSettings:
api_key: Optional[str]
@dataclass(frozen=True)
class RateLimitSettings:
    """Immutable API rate-limiting settings (populated from RATE_LIMIT_* env vars)."""

    # Whether rate limiting is applied at all (RATE_LIMIT_ENABLED).
    enabled: bool
    # Maximum requests per window per client (RATE_LIMIT_REQUESTS).
    requests: int
    # Length of the rate-limit window in seconds (RATE_LIMIT_WINDOW_SECONDS).
    window_seconds: int
@dataclass(frozen=True)
class AppConfig:
    """Immutable aggregate of all application settings, as built by ``load_config``."""

    elastic: ElasticSettings
    data: DataSettings
    youtube: YoutubeSettings
    # API rate-limiting settings.
    rate_limit: RateLimitSettings
    qdrant_url: str
    qdrant_collection: str
    qdrant_vector_name: Optional[str]
    qdrant_vector_size: int
    qdrant_embed_model: str
    # Path to the canonical channel list (CHANNELS_PATH).
    channels_path: Path
    # Public URL/path of the unified RSS feed (RSS_FEED_URL).
    rss_feed_url: str
    # Base URL that feed requests are proxied to (RSS_FEED_UPSTREAM).
    rss_feed_upstream: str
def _env(name: str, default: Optional[str] = None) -> Optional[str]:
@ -94,15 +111,29 @@ def load_config() -> AppConfig:
)
data = DataSettings(root=data_root)
youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY"))
rate_limit = RateLimitSettings(
enabled=_env("RATE_LIMIT_ENABLED", "1") in {"1", "true", "True"},
requests=max(int(_env("RATE_LIMIT_REQUESTS", "60")), 0),
window_seconds=max(int(_env("RATE_LIMIT_WINDOW_SECONDS", "60")), 1),
)
channels_path = Path(
_env("CHANNELS_PATH", str(Path(__file__).parent / "channels.yml"))
).expanduser()
rss_feed_url = _env("RSS_FEED_URL", "/rss/youtube-unified")
rss_feed_upstream = _env("RSS_FEED_UPSTREAM", "http://localhost:8097")
return AppConfig(
elastic=elastic,
data=data,
youtube=youtube,
rate_limit=rate_limit,
qdrant_url=_env("QDRANT_URL", "http://localhost:6333"),
qdrant_collection=_env("QDRANT_COLLECTION", "tlc_embeddings"),
qdrant_vector_name=_env("QDRANT_VECTOR_NAME"),
qdrant_vector_size=int(_env("QDRANT_VECTOR_SIZE", "1024")),
qdrant_embed_model=_env("QDRANT_EMBED_MODEL", "BAAI/bge-large-en-v1.5"),
channels_path=channels_path,
rss_feed_url=rss_feed_url or "",
rss_feed_upstream=rss_feed_upstream or "",
)

View File

@ -1,8 +1,47 @@
version: "3.9"
# Runs only the Flask app container, pointing to remote Elasticsearch/Qdrant.
# TLC Search + Feed Master - Complete YouTube content indexing & RSS aggregation
# Provide ELASTIC_URL / QDRANT_URL (and related) via environment or a .env file.
services:
# RSS Bridge - Converts YouTube channels to RSS feeds
rss-bridge:
image: rssbridge/rss-bridge:latest
container_name: tlc-rss-bridge
hostname: rss-bridge
restart: unless-stopped
logging:
driver: json-file
options:
max-size: "10m"
max-file: "5"
ports:
- "3001:80"
# Feed Master - Aggregates multiple RSS feeds into unified feed
feed-master:
image: umputun/feed-master:latest
container_name: tlc-feed-master
hostname: feed-master
restart: unless-stopped
depends_on:
- rss-bridge
logging:
driver: json-file
options:
max-size: "10m"
max-file: "5"
environment:
- DEBUG=false
- FM_DB=/srv/var/feed-master.bdb
- FM_CONF=/srv/etc/fm.yml
volumes:
- ./feed-master-config:/srv/etc
- ./feed-master-config/var:/srv/var
- ./feed-master-config/images:/srv/images
ports:
- "8097:8080"
# TLC Search - Flask app for searching YouTube transcripts
app:
build:
context: .
@ -16,6 +55,9 @@ services:
ELASTIC_PASSWORD: ${ELASTIC_PASSWORD:-}
ELASTIC_API_KEY: ${ELASTIC_API_KEY:-}
ELASTIC_VERIFY_CERTS: ${ELASTIC_VERIFY_CERTS:-0}
CHANNELS_PATH: ${CHANNELS_PATH:-/app/python_app/channels.yml}
RSS_FEED_URL: ${RSS_FEED_URL:-/rss/youtube-unified}
RSS_FEED_UPSTREAM: ${RSS_FEED_UPSTREAM:-http://feed-master:8080}
QDRANT_URL: ${QDRANT_URL:?set QDRANT_URL to your remote Qdrant URL}
QDRANT_COLLECTION: ${QDRANT_COLLECTION:-tlc-captions-full}
QDRANT_VECTOR_NAME: ${QDRANT_VECTOR_NAME:-}
@ -23,4 +65,5 @@ services:
QDRANT_EMBED_MODEL: ${QDRANT_EMBED_MODEL:-BAAI/bge-large-en-v1.5}
LOCAL_DATA_DIR: ${LOCAL_DATA_DIR:-/app/data/video_metadata}
volumes:
- ./channels.yml:/app/python_app/channels.yml:ro
- ./data:/app/data:ro

166
feed-master-config/fm.yml Normal file
View File

@ -0,0 +1,166 @@
# Feed Master Configuration
# Auto-generated from channels.yml
# Do not edit manually - regenerate using generate_feed_config_simple.py
feeds:
youtube-unified:
title: YouTube Unified Feed
description: Aggregated feed from all YouTube channels
link: https://youtube.com
language: "en-us"
sources:
- name: A Quality Existence
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6vg0HkKKlgsWk-3HfV-vnw&format=Mrss
- name: Adams Fall
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCbD1Pm0TOcRK2zaCrwgcTTg&format=Mrss
- name: Andrea with the Bangs
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCeWWxwzgLYUbfjWowXhVdYw&format=Mrss
- name: Aphrael Pilotson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCGHuURJ1XFHzPSeokf6510A&format=Mrss
- name: Cassidy van der Kamp
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCM9Z05vuQhMEwsV03u6DrLA&format=Mrss
- name: Channel UCCebR16tXbv
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCCebR16tXbv5Ykk9_WtCCug&format=Mrss
- name: Channel UCiJmdXTb76i
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiJmdXTb76i8eIPXdJyf8ZQ&format=Mrss
- name: Charlie's Little Corner
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC952hDf_C4nYJdqwK7VzTxA&format=Mrss
- name: Chris Howard
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC704NVL2DyzYg3rMU9r1f7A&format=Mrss
- name: Christian Baxter
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCU5SNBfTo4umhjYz6M0Jsmg&format=Mrss
- name: Climbing Mt Sophia
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCk9O91WwruXmgu1NQrKZZEw&format=Mrss
- name: Corner Citizen
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCAXyF_HFeMgwS8nkGVeroAA&format=Mrss
- name: Davidbusuttil9086
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCxV18lwwh29DiWuooz7UCvg&format=Mrss
- name: Ein Sof - Infinite Reflections
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC4Rmxg7saTfwIpvq3QEzylQ&format=Mrss
- name: Emily Rajeh
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCn5mf-fcpBmkepIpZ8eFRng&format=Mrss
- name: Eric Seitz
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCTdH4nh6JTcfKUAWvmnPoIQ&format=Mrss
- name: Ethan Caughey
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCv2Qft5mZrmA9XAwnl9PU-g&format=Mrss
- name: faturechi
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCprytROeCztMOMe8plyJRMg&format=Mrss
- name: Finding Ideas
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6Tvr9mBXNaAxLGRA_sUSRA&format=Mrss
- name: Free Rilian
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCT8Lq3ufaGEnCSS8WpFatqw&format=Mrss
- name: From Whom All Blessings Flow
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC_-YQbnPfBbIezMr1adZZiQ&format=Mrss
- name: Grahampardun
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCB0C8DEIQlQzvSGuGriBxtA&format=Mrss
- name: Grail Country
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCsi_x8c12NW9FR7LL01QXKA&format=Mrss
- name: Grey Hamilton
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCrZyTWGMdRM9_P26RKPvh3A&format=Mrss
- name: Grizwald Grim
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCAqTQ5yLHHH44XWwWXLkvHQ&format=Mrss
- name: Jesspurviance
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCQ7rVoApmYIpcmU7fB9RPyw&format=Mrss
- name: John Vervaeke
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCpqDUjTsof-kTNpnyWper_Q&format=Mrss
- name: Jonathan Dumeer
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC4GiA5Hnwy415uVRymxPK-w&format=Mrss
- name: Jonathan Pageau
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCtCTSf3UwRU14nYWr_xm-dQ&format=Mrss
- name: Jordan B Peterson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCL_f53ZEJxp8TtlOkHwMV9Q&format=Mrss
- name: Jordan Hall
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMzT-mdCqoyEv_-YZVtE7MQ&format=Mrss
- name: Joseph Lambrecht
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiOZYvBGHw1Y6wyzffwEp9g&format=Mrss
- name: Justinsmorningcoffee
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCPUVeoQYyq8cndWwyczX6RA&format=Mrss
- name: Kale Zelden
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiukuaNd_qzRDTW9qe2OC1w&format=Mrss
- name: Lancecleaver227
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCwF5LWNOFou_50bT65bq4Bg&format=Mrss
- name: Lucas Vos
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCez1fzMRGctojfis2lfRYug&format=Mrss
- name: Luke Thompson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC_dnk5D4tFCRYCrKIcQlcfw&format=Mrss
- name: Marc Jackson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMJCtS8jKouJ2d8UIYzW3vg&format=Mrss
- name: Mark D Parker
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCVdSgEf9bLXFMBGSMhn7x4Q&format=Mrss
- name: Mark S
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC977g6oGYIJDQnsZOGjQBBA&format=Mrss
- name: Mary Kochan
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC2leFZRD0ZlQDQxpR2Zd8oA&format=Mrss
- name: Matthewparlato5626
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCosBhpwwGh_ueYq4ZSi5dGw&format=Mrss
- name: mcmosav
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCnojyPW0IgLWTQ0SaDQ1KBA&format=Mrss
- name: Michaelmartin8681
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCpLJJLVB_7v4Igq-9arja1A&format=Mrss
- name: More Christ
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEPOn4cgvrrerg_-q_Ygw1A&format=Mrss
- name: Neal Daedalus
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC8mJqpS_EBbMcyuzZDF0TEw&format=Mrss
- name: Nechama Gluck
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5goUoFM4LPim4eY4pwRXYw&format=Mrss
- name: OG Rose
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6zHDj4D323xJkblnPTvY3Q&format=Mrss
- name: Paul Anleitner
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC2yCyOMUeem-cYwliC-tLJg&format=Mrss
- name: Paul Rene Nichols
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCDCfI162vhPvwdxW6X4nmiw&format=Mrss
- name: Paul VanderKlay
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCGsDIP_K6J6VSTqlq-9IPlg&format=Mrss
- name: President Foxman
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMVG5eqpYFVEB-a9IqAOuHA&format=Mrss
- name: Rafekelley
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCDIPXp88qjAV3TiaR5Uo3iQ&format=Mrss
- name: Randos United
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEzWTLDYmL8soRdQec9Fsjw&format=Mrss
- name: Randos United 2
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC1KgNsMdRoIA_njVmaDdHgA&format=Mrss
- name: Rebel Wisdom
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCFQ6Gptuq-sLflbJ4YY3Umw&format=Mrss
- name: Restoring Meaning
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCzX6R3ZLQh5Zma_5AsPcqPA&format=Mrss
- name: Rightinchrist
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCZA5mUAyYcCL1kYgxbeMNrA&format=Mrss
- name: Ron Copperman
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5yLuFQCms4nb9K2bGQLqIw&format=Mrss
- name: Sartori Studios
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC8SErJkYnDsYGh1HxoZkl-g&format=Mrss
- name: Secular Koranism
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCFLovlJ8RFApfjrf2y157xg&format=Mrss
- name: Shoulder Serf
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UChptV-kf8lnncGh7DA2m8Pw&format=Mrss
- name: Skankenstein
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCUSyTPWW4JaG1YfUPddw47Q&format=Mrss
- name: Strange Theology
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEY1vGNBPsC3dCatZyK3Jkw&format=Mrss
- name: The Anadromist
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCIAtCuzdvgNJvSYILnHtdWA&format=Mrss
- name: The Chris Show
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UClIDP7_Kzv_7tDQjTv9EhrA&format=Mrss
- name: The Meaning Code
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCgp_r6WlBwDSJrP43Mz07GQ&format=Mrss
- name: the plebistocrat
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCaJ0CqiiMSTq4X0rycUOIjw&format=Mrss
- name: The Young Levite
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC1a4VtU_SMSfdRiwMJR33YQ&format=Mrss
- name: TheCommonToad
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC-QiBn6GsM3JZJAeAQpaGAA&format=Mrss
- name: TheScrollersPodcast
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5uv-BxzCrN93B_5qbOdRWw&format=Mrss
- name: Transfigured
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCg7Ed0lecvko58ibuX1XHng&format=Mrss
- name: UpCycleClub
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCzw2FNI3IRphcAoVcUENOgQ&format=Mrss
- name: Wavesofobsession
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCedgru6YCto3zyXjlbuQuqA&format=Mrss
system:
update: 5m
max_per_feed: 5
max_total: 200
max_keep: 1000
base_url: http://localhost:8097

91
generate_feed_config.py Normal file
View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""
Generate feed-master configuration from channels.yml.
This ensures a single source of truth for the YouTube channels.
"""
import sys
from pathlib import Path
from .channel_config import build_rss_bridge_url, load_channel_entries
def generate_fm_config(channels_file, output_file, rss_bridge_host="rss-bridge"):
    """Generate the feed-master YAML configuration from channels.yml.

    Args:
        channels_file: Path of the channel list; handed to
            ``load_channel_entries`` as a ``Path``.
        output_file: Destination of the generated YAML (overwritten).
        rss_bridge_host: Host name forwarded to ``build_rss_bridge_url``.

    Channels with a false ``rss_enabled`` flag, and channels for which
    ``build_rss_bridge_url`` returns nothing, are counted as skipped and are
    not written to the output.
    """
    import json  # local import so the block does not rely on file-level imports

    def yaml_str(value):
        # A JSON string literal is also a valid YAML double-quoted scalar.
        # Quoting keeps names containing ':', '#', quotes or a leading
        # '@'/'&'/'*' from corrupting the generated document.
        return json.dumps(str(value), ensure_ascii=False)

    print(f"Reading channels from {channels_file}")
    channels = load_channel_entries(Path(channels_file))
    print(f"Found {len(channels)} channels")

    # Nesting is significant in YAML: each level is indented two spaces
    # deeper than its parent key.
    config = [
        "# Feed Master Configuration",
        "# Auto-generated from channels.yml",
        "# Do not edit manually - regenerate using generate_feed_config.py",
        "",
        "feeds:",
        "  youtube-unified:",
        "    title: YouTube Unified Feed",
        "    description: Aggregated feed from all YouTube channels",
        "    link: https://youtube.com",
        '    language: "en-us"',
        "    sources:",
    ]

    processed = 0
    skipped = 0
    for channel in channels:
        if not channel.get("rss_enabled", True):
            skipped += 1
            continue
        bridge_url = build_rss_bridge_url(channel, rss_bridge_host=rss_bridge_host)
        if not bridge_url:
            skipped += 1
            continue
        # ``or`` also covers an explicit ``name: null`` / empty name entry.
        name = channel.get("name") or "Unknown"
        config.append(f"      - name: {yaml_str(name)}")
        config.append(f"        url: {yaml_str(bridge_url)}")
        processed += 1

    # Add system configuration
    config.extend(
        [
            "",
            "system:",
            "  update: 5m",
            "  max_per_feed: 5",
            "  max_total: 200",
            "  max_keep: 1000",
            "  base_url: http://localhost:8097",
        ]
    )

    # Write output (explicit UTF-8 because names may be non-ASCII; the file
    # ends with a newline like any other text file).
    print(f"\nProcessed {processed} channels, skipped {skipped}")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(config) + "\n")
    print(f"Configuration written to {output_file}")
    print(f"\nTo apply this configuration:")
    print(f" 1. Copy {output_file} to feed-master/etc/fm.yml")
    print(f" 2. Restart the feed-master service")
if __name__ == "__main__":
    # Defaults sit next to this script; up to two positional arguments
    # (channels file, then output file) override them.
    base_dir = Path(__file__).parent
    cli_args = sys.argv[1:]
    channels_file = Path(cli_args[0]) if len(cli_args) >= 1 else base_dir / "channels.yml"
    output_file = (
        Path(cli_args[1])
        if len(cli_args) >= 2
        else base_dir / "feed-master-config" / "fm.yml"
    )

    if channels_file.exists():
        # Make sure the destination directory is there before writing.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        generate_fm_config(channels_file, output_file)
    else:
        print(f"Error: {channels_file} not found", file=sys.stderr)
        print(f"\nUsage: {sys.argv[0]} [channels.yml] [output.yml]", file=sys.stderr)
        sys.exit(1)

88
generate_feed_config_simple.py Executable file
View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""
Generate feed-master configuration from channels.yml.
Simplified version that doesn't require RSS-Bridge to be running.
"""
import sys
from pathlib import Path
from .channel_config import build_rss_bridge_url, load_channel_entries
def generate_fm_config(channels_file, output_file, rss_bridge_host="rss-bridge"):
    """Generate the feed-master YAML configuration from channels.yml.

    Args:
        channels_file: Path of the channel list; handed to
            ``load_channel_entries`` as a ``Path``.
        output_file: Destination of the generated YAML (overwritten).
        rss_bridge_host: Host name forwarded to ``build_rss_bridge_url``.

    Channels with a false ``rss_enabled`` flag, and channels for which
    ``build_rss_bridge_url`` returns nothing, are counted as skipped and are
    not written to the output.
    """
    import json  # local import so the block does not rely on file-level imports

    def yaml_str(value):
        # A JSON string literal is also a valid YAML double-quoted scalar.
        # Quoting keeps names containing ':', '#', quotes or a leading
        # '@'/'&'/'*' from corrupting the generated document.
        return json.dumps(str(value), ensure_ascii=False)

    print(f"Reading channels from {channels_file}")
    channels = load_channel_entries(Path(channels_file))
    print(f"Found {len(channels)} channels")

    # Nesting is significant in YAML: each level is indented two spaces
    # deeper than its parent key.
    config = [
        "# Feed Master Configuration",
        "# Auto-generated from channels.yml",
        "# Do not edit manually - regenerate using generate_feed_config_simple.py",
        "",
        "feeds:",
        "  youtube-unified:",
        "    title: YouTube Unified Feed",
        "    description: Aggregated feed from all YouTube channels",
        "    link: https://youtube.com",
        '    language: "en-us"',
        "    sources:",
    ]

    processed = 0
    skipped = 0
    for channel in channels:
        if not channel.get("rss_enabled", True):
            skipped += 1
            continue
        bridge_url = build_rss_bridge_url(channel, rss_bridge_host=rss_bridge_host)
        if not bridge_url:
            skipped += 1
            continue
        # ``or`` also covers an explicit ``name: null`` / empty name entry.
        name = channel.get("name") or "Unknown"
        config.append(f"      - name: {yaml_str(name)}")
        config.append(f"        url: {yaml_str(bridge_url)}")
        processed += 1

    # Add system configuration
    config.extend(
        [
            "",
            "system:",
            "  update: 5m",
            "  max_per_feed: 5",
            "  max_total: 200",
            "  max_keep: 1000",
            "  base_url: http://localhost:8097",
        ]
    )

    # Write output (explicit UTF-8 because names may be non-ASCII; the file
    # ends with a newline like any other text file).
    print(f"\nProcessed {processed} channels, skipped {skipped}")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(config) + "\n")
    print(f"Configuration written to {output_file}")
if __name__ == "__main__":
    # Resolve the two paths: built-in defaults beside this script, each
    # replaceable by a positional command-line argument.
    here = Path(__file__).parent
    argc = len(sys.argv)
    channels_file = Path(sys.argv[1]) if argc > 1 else here / "channels.yml"
    output_file = Path(sys.argv[2]) if argc > 2 else here / "feed-master-config" / "fm.yml"

    # Bail out early when there is nothing to read.
    if not channels_file.exists():
        for message in (
            f"Error: {channels_file} not found",
            f"\nUsage: {sys.argv[0]} [channels.yml] [output.yml]",
        ):
            print(message, file=sys.stderr)
        sys.exit(1)

    # The output directory may not exist on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    generate_fm_config(channels_file, output_file)

View File

@ -4,4 +4,3 @@ youtube-transcript-api>=0.6
google-api-python-client>=2.0.0
python-dotenv>=0.19.0
requests>=2.31.0
sentence-transformers>=2.7.0

View File

@ -4,10 +4,10 @@ Flask application exposing search, graph, and transcript endpoints for TLC.
Routes:
GET / -> static HTML search page.
GET /graph -> static reference graph UI.
GET /vector-search -> experimental Qdrant vector search UI.
GET /api/channels -> channels aggregation.
GET /api/channel-list -> canonical channel list + feed URL.
GET /channels.txt -> raw channel URLs list.
GET /api/search -> Elasticsearch keyword search.
POST /api/vector-search -> Qdrant vector similarity query.
GET /api/graph -> reference graph API.
GET /api/transcript -> transcript JSON payload.
"""
@ -17,23 +17,23 @@ from __future__ import annotations
import copy
import json
import logging
import os
import re
import urllib.parse
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from typing import Any, Deque, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from collections import Counter, deque
from collections import Counter, defaultdict, deque
from datetime import datetime
from flask import Flask, jsonify, request, send_from_directory
from threading import Lock
from time import monotonic
import requests
try:
from sentence_transformers import SentenceTransformer # type: ignore
except ImportError: # pragma: no cover - optional dependency
SentenceTransformer = None
from flask import Flask, jsonify, request, send_from_directory
from .config import CONFIG, AppConfig
from .channel_config import load_channel_entries
try:
from elasticsearch import Elasticsearch # type: ignore
@ -43,35 +43,52 @@ except ImportError: # pragma: no cover - dependency optional
BadRequestError = Exception # type: ignore
LOGGER = logging.getLogger(__name__)
_EMBED_MODEL = None
_EMBED_MODEL_NAME: Optional[str] = None
# Security constants
MAX_QUERY_SIZE = 100
MAX_OFFSET = 10000
DEFAULT_ELASTIC_TIMEOUT = int(os.environ.get("ELASTIC_TIMEOUT_SECONDS", "30"))
_RATE_LIMIT_BUCKETS: Dict[str, Deque[float]] = defaultdict(deque)
_RATE_LIMIT_LOCK = Lock()
_RSS_AUTHOR_CACHE: Dict[str, Tuple[str, float]] = {}
_RSS_AUTHOR_LOCK = Lock()
_RSS_AUTHOR_TTL_SECONDS = 60 * 60 * 24
_RSS_OEMBED_LIMIT = 12
def _ensure_embedder(model_name: str) -> "SentenceTransformer":
    """Return the module-level embedding model, loading it when needed.

    The model is (re)loaded when none is held yet or when the held one was
    created for a different ``model_name``.

    Raises:
        RuntimeError: if ``SentenceTransformer`` is ``None`` (optional
            dependency not importable).
    """
    global _EMBED_MODEL, _EMBED_MODEL_NAME
    if SentenceTransformer is None:  # pragma: no cover - optional dependency
        raise RuntimeError(
            "sentence-transformers is required for vector search. Install via pip install sentence-transformers."
        )

    already_loaded = _EMBED_MODEL is not None and _EMBED_MODEL_NAME == model_name
    if already_loaded:
        return _EMBED_MODEL

    LOGGER.info("Loading embedding model: %s", model_name)
    _EMBED_MODEL = SentenceTransformer(model_name)
    _EMBED_MODEL_NAME = model_name
    return _EMBED_MODEL
def _client_rate_key() -> str:
    """Return the string used to bucket the current request for rate limiting.

    Preference order: first entry of ``X-Forwarded-For``, then ``X-Real-IP``,
    then ``request.remote_addr``, then the literal ``"unknown"``.
    """
    # NOTE(review): both headers are supplied by the caller unless a trusted
    # proxy overwrites them - confirm the deployment sits behind one.
    headers = request.headers
    forwarded_chain = headers.get("X-Forwarded-For", "")
    if not forwarded_chain:
        return headers.get("X-Real-IP") or request.remote_addr or "unknown"
    first_hop, _, _ = forwarded_chain.partition(",")
    return first_hop.strip()
def embed_query(text: str, *, model_name: str, expected_dim: int) -> List[float]:
    """Embed ``text`` as a query vector with the named model.

    The text is prefixed with ``"query: "`` and encoded with normalised
    embeddings; the single resulting vector is returned as a plain list.

    Raises:
        RuntimeError: if the vector length differs from ``expected_dim``.
    """
    model = _ensure_embedder(model_name)
    prompts = [f"query: {text}"]
    encoded = model.encode(
        prompts,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    vector = encoded[0].tolist()
    actual_dim = len(vector)
    if actual_dim == expected_dim:
        return vector
    raise RuntimeError(
        f"Embedding dimension mismatch (expected {expected_dim}, got {len(vector)})"
    )
def _rate_limited_response(retry_after: int):
    """Build the HTTP 429 JSON response sent to a throttled client.

    ``retry_after`` is reported both in the JSON body (``retryAfter``) and in
    the ``Retry-After`` header.
    """
    body = {"error": "rate_limited", "retryAfter": retry_after}
    reply = jsonify(body)
    reply.status_code = 429
    reply.headers["Retry-After"] = str(retry_after)
    return reply
def sanitize_query_string(query: str) -> str:
    """Sanitize user input for Elasticsearch ``query_string`` queries.

    Strips selectors that target underscore-prefixed (internal) fields such
    as ``_id:`` / ``_source:`` and any ``script:`` selector, then collapses
    runs of ``*`` or ``?`` into a single wildcard.

    Args:
        query: Raw user-supplied query text.

    Returns:
        The cleaned query, or ``"*"`` when nothing usable remains.
    """
    if not query:
        return "*"

    sanitized = query.strip()

    # Remove field targeting patterns like "_id:", "_source:", "script:"
    dangerous_field_patterns = [
        r'\b_[a-z_]+\s*:',  # Internal fields like _id:, _source:
        r'\bscript\s*:',  # Script injection
    ]

    # Repeat until nothing changes: deleting one match can splice the text
    # around it into a new match (e.g. "_ab script: :x" -> "_ab  :x"), which
    # a single pass would let through. Each change shortens the string, so
    # the loop always terminates.
    while True:
        before = sanitized
        for pattern in dangerous_field_patterns:
            sanitized = re.sub(pattern, '', sanitized, flags=re.IGNORECASE)
        if sanitized == before:
            break

    # Collapse repeated wildcards, which only make the query more expensive.
    sanitized = re.sub(r'\*{2,}', '*', sanitized)
    sanitized = re.sub(r'\?{2,}', '?', sanitized)

    return sanitized.strip() or "*"
def _ensure_client(config: AppConfig) -> "Elasticsearch":
@ -94,6 +111,192 @@ def _ensure_client(config: AppConfig) -> "Elasticsearch":
return Elasticsearch(config.elastic.url, **kwargs)
def _extract_video_id(url: str) -> Optional[str]:
if not url:
return None
try:
parsed = urllib.parse.urlparse(url.strip())
except Exception:
return None
host = (parsed.netloc or "").lower()
path = parsed.path or ""
if host in {"youtu.be", "www.youtu.be"}:
return path.lstrip("/") or None
if host.endswith("youtube.com"):
if path == "/watch":
params = urllib.parse.parse_qs(parsed.query)
return (params.get("v") or [None])[0]
if path.startswith("/shorts/"):
return path.split("/", 2)[2] if len(path.split("/", 2)) > 2 else None
return None
def _lookup_channel_names(
    client: "Elasticsearch",
    index: str,
    video_ids: Iterable[str],
) -> Dict[str, str]:
    """Resolve video ids to channel/author names.

    Three sources are tried in order: the in-process cache
    (``_RSS_AUTHOR_CACHE``, entries younger than ``_RSS_AUTHOR_TTL_SECONDS``),
    an Elasticsearch ``mget`` on ``index``, and finally YouTube's oEmbed
    endpoint for at most ``_RSS_OEMBED_LIMIT`` of the ids still unresolved.
    Names found by the last two sources are written back to the cache.

    Returns:
        Mapping of id to name for every id that could be resolved.
    """
    wanted = [video_id for video_id in video_ids if video_id]
    if not wanted:
        return {}

    stamp = monotonic()
    resolved: Dict[str, str] = {}
    from_cache = 0
    from_elastic = 0
    from_oembed = 0
    oembed_tries = 0
    uncached = []

    # Stage 1: fresh cache entries.
    with _RSS_AUTHOR_LOCK:
        for video_id in wanted:
            entry = _RSS_AUTHOR_CACHE.get(video_id)
            if not entry or (stamp - entry[1]) >= _RSS_AUTHOR_TTL_SECONDS:
                uncached.append(video_id)
                continue
            resolved[video_id] = entry[0]
            from_cache += 1

    # Stage 2: one Elasticsearch multi-get for everything the cache missed.
    if uncached:
        try:
            es_reply = client.mget(index=index, body={"ids": uncached})
        except Exception as exc:  # pragma: no cover - elasticsearch handles errors
            LOGGER.warning("RSS title lookup failed: %s", exc)
            es_reply = {}
        for hit in es_reply.get("docs", []):
            if not hit.get("found"):
                continue
            fields = hit.get("_source") or {}
            label = fields.get("channel_name") or fields.get("channel_id")
            if not label:
                continue
            doc_id = hit.get("_id", "")
            resolved[doc_id] = str(label)
            from_elastic += 1
            with _RSS_AUTHOR_LOCK:
                _RSS_AUTHOR_CACHE[doc_id] = (resolved[doc_id], stamp)

    # Stage 3: oEmbed fallback, capped so one feed cannot trigger an
    # unbounded number of outbound requests.
    unresolved = [video_id for video_id in uncached if video_id not in resolved]
    for video_id in unresolved[:_RSS_OEMBED_LIMIT]:
        oembed_tries += 1
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        endpoint = (
            "https://www.youtube.com/oembed?format=json&url="
            + urllib.parse.quote(watch_url, safe="")
        )
        try:
            reply = requests.get(endpoint, timeout=10)
            if reply.status_code != 200:
                continue
            details = reply.json()
        except Exception:
            continue
        author = details.get("author_name")
        if not author:
            continue
        resolved[video_id] = str(author)
        from_oembed += 1
        with _RSS_AUTHOR_LOCK:
            _RSS_AUTHOR_CACHE[video_id] = (resolved[video_id], stamp)

    still_missing = max(len(wanted) - from_cache - from_elastic - from_oembed, 0)
    if still_missing or oembed_tries:
        LOGGER.info(
            "RSS title lookup: total=%d cached=%d elastic=%d oembed=%d missing=%d",
            len(wanted),
            from_cache,
            from_elastic,
            from_oembed,
            still_missing,
        )
    else:
        LOGGER.debug(
            "RSS title lookup: total=%d cached=%d elastic=%d",
            len(wanted),
            from_cache,
            from_elastic,
        )
    return resolved
def _rewrite_rss_payload(
    content: bytes,
    client: "Elasticsearch",
    index: str,
    feed_name: str,
) -> bytes:
    """Clean an RSS document and prefix item titles with their channel name.

    Items whose title contains ``"Bridge returned error"`` are dropped. For
    the rest, the video id is taken from ``<link>`` (falling back to
    ``<guid>``), resolved through ``_lookup_channel_names`` and, when a name
    is found, the title becomes ``"<channel> - <title>"`` unless it already
    starts that way.

    Returns:
        The re-serialised feed, or ``content`` unchanged when it is not
        parseable XML or has no ``<channel>`` element.
    """
    # NOTE(review): ElementTree re-serialises namespaces it has not been told
    # about as ns0/ns1/...; confirm downstream readers accept that.

    def video_id_of(entry):
        # Prefer <link>; only consult <guid> when <link> yields nothing.
        link_node = entry.find("link")
        guid_node = entry.find("guid")
        found = _extract_video_id((link_node.text or "") if link_node is not None else "")
        if not found:
            found = _extract_video_id((guid_node.text or "") if guid_node is not None else "")
        return found

    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        LOGGER.warning("RSS rewrite skipped (invalid XML) for %s", feed_name)
        return content

    channel = root.find("channel")
    if channel is None:
        LOGGER.warning("RSS rewrite skipped (missing channel) for %s", feed_name)
        return content

    all_items = channel.findall("item")
    total_items = len(all_items)
    removed_errors = 0
    video_ids: Set[str] = set()

    # First pass: drop bridge error items and gather the ids to resolve.
    for entry in list(all_items):
        title_node = entry.find("title")
        heading = (title_node.text or "").strip() if title_node is not None else ""
        if "Bridge returned error" in heading:
            channel.remove(entry)
            removed_errors += 1
            continue
        candidate = video_id_of(entry)
        if candidate:
            video_ids.add(candidate)

    channel_name_map = _lookup_channel_names(client, index, video_ids)
    if not channel_name_map:
        LOGGER.info(
            "RSS rewrite: feed=%s items=%d removed_errors=%d resolved=0",
            feed_name,
            total_items,
            removed_errors,
        )
        return ET.tostring(root, encoding="utf-8", xml_declaration=True)

    # Second pass: prefix the surviving titles.
    prefixed = 0
    for entry in channel.findall("item"):
        title_node = entry.find("title")
        if title_node is None or not title_node.text:
            continue
        candidate = video_id_of(entry)
        if not candidate:
            continue
        channel_name = channel_name_map.get(candidate)
        if not channel_name:
            continue
        if title_node.text.startswith(f"{channel_name} - "):
            continue
        title_node.text = f"{channel_name} - {title_node.text}"
        prefixed += 1

    LOGGER.info(
        "RSS rewrite: feed=%s items=%d removed_errors=%d prefixed=%d resolved=%d",
        feed_name,
        total_items,
        removed_errors,
        prefixed,
        len(channel_name_map),
    )
    return ET.tostring(root, encoding="utf-8", xml_declaration=True)
def metrics_payload(data_root: Path, include_external: bool = True) -> Dict[str, Any]:
total_items = 0
channel_counter: Counter = Counter()
@ -201,6 +404,15 @@ def elastic_metrics_payload(
"top_hits": {
"size": 1,
"_source": {"includes": ["channel_name"]},
"sort": [
{
"channel_name.keyword": {
"order": "asc",
"missing": "_last",
"unmapped_type": "keyword",
}
}
],
}
}
},
@ -236,7 +448,7 @@ def elastic_metrics_payload(
"Elasticsearch metrics request: %s",
json.dumps({"index": index, "body": body}, indent=2),
)
response = client.search(index=index, body=body)
response = client.search(index=index, body=body, request_timeout=30)
break
except BadRequestError as exc:
last_error = exc
@ -406,7 +618,7 @@ def build_query_payload(
if use_query_string:
base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
qs_query = (query or "").strip() or "*"
qs_query = sanitize_query_string(query or "")
query_body: Dict[str, Any] = {
"query_string": {
"query": qs_query,
@ -568,58 +780,14 @@ def build_query_payload(
return body
def create_app(config: AppConfig = CONFIG) -> Flask:
app = Flask(__name__, static_folder=str(Path(__file__).parent / "static"))
client = _ensure_client(config)
index = config.elastic.index
qdrant_url = config.qdrant_url
qdrant_collection = config.qdrant_collection
qdrant_vector_name = config.qdrant_vector_name
qdrant_vector_size = config.qdrant_vector_size
qdrant_embed_model = config.qdrant_embed_model
@app.route("/")
def index_page():
return send_from_directory(app.static_folder, "index.html")
@app.route("/graph")
def graph_page():
return send_from_directory(app.static_folder, "graph.html")
@app.route("/vector-search")
def vector_search_page():
return send_from_directory(app.static_folder, "vector.html")
@app.route("/static/<path:filename>")
def static_files(filename: str):
return send_from_directory(app.static_folder, filename)
def normalize_reference_list(values: Any) -> List[str]:
    """Flatten a reference field into a list of non-empty id strings.

    ``values`` may be ``None``, a single value, or a list/tuple/set. Dict
    entries contribute their ``video_id`` (or, failing that, ``id``). Blank
    entries and the literal strings "none"/"null" (any case) are dropped.
    """
    if values is None:
        return []

    entries = values if isinstance(values, (list, tuple, set)) else [values]

    def as_text(entry) -> Optional[str]:
        # Returns the stripped text of one entry, or None when it has no value.
        raw = (entry.get("video_id") or entry.get("id")) if isinstance(entry, dict) else entry
        return None if raw is None else str(raw).strip()

    cleaned: List[str] = []
    for entry in entries:
        text = as_text(entry)
        if text and text.lower() not in {"none", "null"}:
            cleaned.append(text)
    return cleaned
def build_graph_payload(
root_id: str, depth: int, max_nodes: int, *, include_external: bool = True
client: "Elasticsearch",
index: str,
root_id: str,
depth: int,
max_nodes: int,
*,
include_external: bool = True,
) -> Dict[str, Any]:
root_id = root_id.strip()
if not root_id:
@ -637,11 +805,7 @@ def build_graph_payload(
LOGGER.debug("Graph: failed to load %s: %s", video_id, exc)
doc_cache[video_id] = None
doc = doc_cache[video_id]
if (
doc is not None
and not include_external
and doc.get("external_reference")
):
if doc is not None and not include_external and doc.get("external_reference"):
doc_cache[video_id] = None
return None
return doc_cache[video_id]
@ -760,7 +924,10 @@ def build_graph_payload(
},
}
def build_full_graph_payload(
client: "Elasticsearch",
index: str,
max_nodes: Optional[int],
*,
highlight_id: Optional[str] = None,
@ -811,7 +978,7 @@ def build_full_graph_payload(
if (not existing.get("title") or existing["title"] == node_id) and doc.get("title"):
existing["title"] = doc["title"]
if not existing.get("channel_id") and doc.get("channel_id"):
existing["channel_id"] = doc["channel_id"]
existing["channel_id"] = doc.get("channel_id")
if (
existing.get("channel_name") in {"Unknown", node_id, None}
and (doc.get("channel_name") or doc.get("channel_id"))
@ -852,9 +1019,7 @@ def build_full_graph_payload(
scroll_id: Optional[str] = None
try:
body = {"query": query, "_source": source_fields, "sort": ["_doc"]}
response = client.search(
index=index, body=body, size=batch_size, scroll="1m"
)
response = client.search(index=index, body=body, size=batch_size, scroll="1m", request_timeout=60)
scroll_id = response.get("_scroll_id")
stop_fetch = False
while not stop_fetch:
@ -919,12 +1084,6 @@ def build_full_graph_payload(
if link.get("source") in nodes and link.get("target") in nodes
]
links = [
link
for link in links
if link.get("source") in nodes and link.get("target") in nodes
]
return {
"root": highlight_id or "",
"depth": 0,
@ -938,6 +1097,88 @@ def build_full_graph_payload(
},
}
def create_app(config: AppConfig = CONFIG) -> Flask:
app = Flask(__name__, static_folder=str(Path(__file__).parent / "static"))
app.config['MAX_CONTENT_LENGTH'] = 1 * 1024 * 1024
@app.after_request
def add_security_headers(response):
    """Attach the fixed set of security headers to every response."""
    csp_directives = [
        "default-src 'self'",
        "script-src 'self' https://cdn.jsdelivr.net https://unpkg.com",
        "style-src 'self' 'unsafe-inline' https://unpkg.com",
        "img-src 'self' data: https:",
        "font-src 'self' https://unpkg.com",
        "connect-src 'self'",
    ]
    fixed_headers = {
        'X-Frame-Options': 'DENY',
        'X-Content-Type-Options': 'nosniff',
        'Permissions-Policy': 'geolocation=(), microphone=(), camera=()',
        'Content-Security-Policy': "; ".join(csp_directives),
    }
    for header_name, header_value in fixed_headers.items():
        response.headers[header_name] = header_value
    return response
@app.before_request
def enforce_rate_limit():
    """Throttle ``/api/`` requests per client within a time window.

    Returns ``None`` to let the request proceed, or the 429 response from
    ``_rate_limited_response`` once the client has used up
    ``config.rate_limit.requests`` hits within
    ``config.rate_limit.window_seconds``.
    """
    settings = config.rate_limit
    if not settings.enabled or not request.path.startswith("/api/"):
        return None

    max_hits = settings.requests
    span = settings.window_seconds
    if max_hits <= 0 or span <= 0:
        return None

    current = monotonic()
    client_key = _client_rate_key()
    # NOTE(review): buckets are created per key and never removed here -
    # confirm whether stale keys are cleaned up elsewhere.
    with _RATE_LIMIT_LOCK:
        hits = _RATE_LIMIT_BUCKETS[client_key]
        # Forget timestamps that have aged out of the window.
        while hits and current - hits[0] > span:
            hits.popleft()
        if len(hits) < max_hits:
            hits.append(current)
            return None
        # Oldest remaining hit decides how long the client has to wait.
        wait_seconds = max(1, int(span - (current - hits[0])))
        return _rate_limited_response(wait_seconds)
client = _ensure_client(config)
index = config.elastic.index
@app.route("/")
def index_page():
return send_from_directory(app.static_folder, "index.html")
@app.route("/graph")
def graph_page():
return send_from_directory(app.static_folder, "graph.html")
@app.route("/static/<path:filename>")
def static_files(filename: str):
return send_from_directory(app.static_folder, filename)
def normalize_reference_list(values: Any) -> List[str]:
    """Flatten a reference field into a clean list of identifier strings.

    ``values`` may be ``None``, a single item, or a list/tuple/set of items.
    Dict items contribute their ``video_id`` (or, failing that, ``id``);
    anything else is stringified. Blank entries and the literal strings
    ``"none"``/``"null"`` (any case) are dropped.
    """
    if values is None:
        return []

    entries = values if isinstance(values, (list, tuple, set)) else [values]

    def _clean(entry: Any) -> Optional[str]:
        """Return the trimmed identifier for one entry, or None to skip it."""
        if isinstance(entry, dict):
            raw = entry.get("video_id") or entry.get("id")
        else:
            raw = entry
        if raw is None:
            return None
        trimmed = str(raw).strip()
        if not trimmed or trimmed.lower() in {"none", "null"}:
            return None
        return trimmed

    return [ref for ref in map(_clean, entries) if ref is not None]
@app.route("/api/channels")
def channels():
include_external = request.args.get("external", default="0", type=str)
@ -952,6 +1193,15 @@ def build_full_graph_payload(
"top_hits": {
"size": 1,
"_source": {"includes": ["channel_name"]},
"sort": [
{
"channel_name.keyword": {
"order": "asc",
"missing": "_last",
"unmapped_type": "keyword",
}
}
],
}
}
},
@ -1023,6 +1273,72 @@ def build_full_graph_payload(
data.sort(key=lambda item: item["Name"].lower())
return jsonify(data)
@app.route("/api/channel-list")
def channel_list():
    """Return the configured channel list plus the public RSS feed URL as JSON.

    The response always carries ``channels``, ``rss_feed_url`` and ``source``;
    when the channel file cannot be loaded, ``channels`` stays empty and an
    ``error`` code (``channels_not_found`` / ``channels_load_failed``) is added.
    """
    channels_file = config.channels_path
    result = {
        "channels": [],
        "rss_feed_url": config.rss_feed_url,
        "source": str(channels_file),
    }
    try:
        entries = load_channel_entries(channels_file)
    except FileNotFoundError:
        LOGGER.warning("Channel list not found: %s", channels_file)
        result["error"] = "channels_not_found"
    except Exception as exc:
        LOGGER.exception("Failed to load channel list: %s", exc)
        result["error"] = "channels_load_failed"
    else:
        result["channels"] = entries
    return jsonify(result)
@app.route("/channels.txt")
def channel_urls():
    """Serve the channel URLs as plain text, one URL per line.

    Responds with a JSON error and 404 when the channel file is missing, or
    500 when it cannot be loaded for any other reason.
    """
    try:
        entries = load_channel_entries(config.channels_path)
    except FileNotFoundError:
        LOGGER.warning("Channel list not found: %s", config.channels_path)
        return jsonify({"error": "channels_not_found"}), 404
    except Exception as exc:
        LOGGER.exception("Failed to load channel list: %s", exc)
        return jsonify({"error": "channels_load_failed"}), 500

    lines = []
    for entry in entries:
        if entry.get("url"):
            lines.append(entry["url"])

    text = "\n".join(lines)
    if lines:
        # Terminate the final line only when there is at least one URL.
        text += "\n"
    return (text, 200, {"Content-Type": "text/plain; charset=utf-8"})
def _rss_target(feed_name: str) -> str:
    """Build the upstream URL for ``feed_name`` under ``<upstream>/rss/``.

    ``feed_name`` comes straight from the request path, so it is treated as
    untrusted: empty, ``.`` and ``..`` segments are discarded and every
    remaining segment is percent-encoded. This keeps the proxied request
    inside ``/rss/`` and stops an encoded ``?`` or ``#`` from injecting a
    query string or fragment into the upstream URL. An empty (or fully
    discarded) name falls back to ``youtube-unified``.
    """
    from urllib.parse import quote  # local import: only this helper needs it

    safe_segments = [
        quote(segment, safe="")
        for segment in (feed_name or "").split("/")
        if segment and segment not in {".", ".."}
    ]
    name = "/".join(safe_segments) or "youtube-unified"
    return f"{config.rss_feed_upstream.rstrip('/')}/rss/{name}"
@app.route("/rss")
@app.route("/rss/<path:feed_name>")
def rss_proxy(feed_name: str = ""):
    """Proxy an upstream RSS feed, passing its body through ``_rewrite_rss_payload``.

    Returns the rewritten payload with the upstream status code and a subset
    of its headers, or a JSON ``rss_unavailable`` error with 502 when the
    upstream request itself fails.
    """
    upstream_url = _rss_target(feed_name)
    try:
        upstream = requests.get(upstream_url, timeout=30)
    except requests.RequestException as exc:
        LOGGER.warning("RSS upstream error for %s: %s", upstream_url, exc)
        return jsonify({"error": "rss_unavailable"}), 502

    body = _rewrite_rss_payload(upstream.content, client, index, feed_name or "rss")

    response_headers = {
        "Content-Type": upstream.headers.get(
            "Content-Type", "application/xml; charset=UTF-8"
        )
    }
    # Forward the caching-related headers only when the upstream sent them.
    for passthrough in ("Cache-Control", "ETag", "Last-Modified"):
        forwarded = upstream.headers.get(passthrough)
        if forwarded:
            response_headers[passthrough] = forwarded
    return (body, upstream.status_code, response_headers)
@app.route("/api/graph")
def graph_api():
video_id = (request.args.get("video_id") or "").strip()
@ -1050,13 +1366,20 @@ def build_full_graph_payload(
if full_graph:
payload = build_full_graph_payload(
client,
index,
None,
highlight_id=video_id or None,
include_external=include_external,
)
else:
payload = build_graph_payload(
video_id, depth, max_nodes, include_external=include_external
client,
index,
video_id,
depth,
max_nodes,
include_external=include_external,
)
if not payload["nodes"]:
return (
@ -1126,7 +1449,7 @@ def build_full_graph_payload(
year = request.args.get("year", "", type=str) or None
sort = request.args.get("sort", "relevant", type=str)
page = max(request.args.get("page", 0, type=int), 0)
size = max(request.args.get("size", 10, type=int), 1)
size = min(max(request.args.get("size", 10, type=int), 1), MAX_QUERY_SIZE)
def parse_flag(name: str, default: bool = True) -> bool:
value = request.args.get(name)
@ -1154,6 +1477,10 @@ def build_full_graph_payload(
include_external=include_external,
)
start = page * size
if start >= MAX_OFFSET:
return jsonify({"error": "offset_too_large", "maxOffset": MAX_OFFSET}), 400
if start + size > MAX_OFFSET:
size = max(1, MAX_OFFSET - start)
if config.elastic.debug:
LOGGER.info(
"Elasticsearch search request: %s",
@ -1178,6 +1505,7 @@ def build_full_graph_payload(
from_=start,
size=size,
body=payload,
request_timeout=30,
)
if config.elastic.debug:
LOGGER.info(
@ -1199,14 +1527,10 @@ def build_full_graph_payload(
for value in (highlight_map.get("transcript_secondary_full", []) or [])
]
title_html = (
highlight_map.get("title")
or [source.get("title") or "Untitled"]
)[0]
description_html = (
highlight_map.get("description")
or [source.get("description") or ""]
)[0]
title_highlight = highlight_map.get("title") or []
description_highlight = highlight_map.get("description") or []
title_html = title_highlight[0] if title_highlight else None
description_html = description_highlight[0] if description_highlight else None
documents.append(
{
"video_id": source.get("video_id"),
@ -1367,6 +1691,15 @@ def build_full_graph_payload(
"top_hits": {
"size": 1,
"_source": {"includes": ["channel_name"]},
"sort": [
{
"channel_name.keyword": {
"order": "asc",
"missing": "_last",
"unmapped_type": "keyword",
}
}
],
}
}
},
@ -1459,145 +1792,6 @@ def build_full_graph_payload(
def frequency_page():
    """Serve the frequency page (``frequency.html``) from the static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "frequency.html")
@app.route("/api/vector-search", methods=["POST"])
def api_vector_search():
    """Run a vector search against the Qdrant collection.

    Expects a JSON object with ``query`` (string) and optional ``filters``
    (object), ``size`` and ``offset``. Malformed values fall back to their
    defaults instead of raising, so bad client input can no longer produce
    a 500.

    Returns JSON ``{"items": [...], "totalResults": n, "offset": n}``.
    An empty query yields an empty result with ``error: empty_query``;
    embedding failures return 500 and Qdrant failures return 502.
    """

    def _bounded_int(raw: Any, default: int, minimum: int) -> int:
        """Parse ``raw`` as an int (default on failure) and clamp to ``minimum``."""
        try:
            parsed = int(raw)
        except (TypeError, ValueError, OverflowError):
            parsed = default
        return max(parsed, minimum)

    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array/scalar body has no fields to read; treat it as empty.
        body = {}

    raw_query = body.get("query")
    query_text = raw_query.strip() if isinstance(raw_query, str) else ""
    filters = body.get("filters")
    if not isinstance(filters, dict):
        filters = {}
    limit = _bounded_int(body.get("size", 10), 10, 1)
    offset = _bounded_int(body.get("offset", 0), 0, 0)

    if not query_text:
        return jsonify(
            {"items": [], "totalResults": 0, "offset": offset, "error": "empty_query"}
        )

    try:
        query_vector = embed_query(
            query_text, model_name=qdrant_embed_model, expected_dim=qdrant_vector_size
        )
    except Exception as exc:  # pragma: no cover - runtime dependency
        LOGGER.error("Embedding failed: %s", exc, exc_info=config.elastic.debug)
        return jsonify({"error": "embedding_unavailable"}), 500

    qdrant_vector_payload: Any
    if qdrant_vector_name:
        qdrant_vector_payload = {qdrant_vector_name: query_vector}
    else:
        qdrant_vector_payload = query_vector

    qdrant_body: Dict[str, Any] = {
        "vector": qdrant_vector_payload,
        "limit": limit,
        "offset": offset,
        "with_payload": True,
        "with_vectors": False,
    }
    if filters:
        # NOTE(review): the client-supplied filter object is forwarded to
        # Qdrant unchanged; consider whitelisting the allowed filter fields.
        qdrant_body["filter"] = filters

    try:
        response = requests.post(
            f"{qdrant_url}/collections/{qdrant_collection}/points/search",
            json=qdrant_body,
            timeout=20,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        LOGGER.error("Vector search failed: %s", exc, exc_info=config.elastic.debug)
        return jsonify({"error": "vector_search_unavailable"}), 502

    points = data.get("result", []) if isinstance(data, dict) else []
    items: List[Dict[str, Any]] = []
    missing_channel_ids: Set[str] = set()

    for point in points:
        # Distinct name: the original reused ``payload`` and shadowed the request body.
        point_payload = point.get("payload", {}) or {}
        raw_highlights = point_payload.get("highlights") or []
        highlight_entries: List[Dict[str, str]] = []
        for entry in raw_highlights:
            if isinstance(entry, dict):
                html_value = entry.get("html") or entry.get("text")
            else:
                html_value = str(entry)
            if not html_value:
                continue
            highlight_entries.append({"html": html_value, "source": "primary"})

        channel_label = (
            point_payload.get("channel_name")
            or point_payload.get("channel_title")
            or point_payload.get("channel_id")
        )
        items.append(
            {
                "video_id": point_payload.get("video_id"),
                "channel_id": point_payload.get("channel_id"),
                "channel_name": channel_label,
                "title": point_payload.get("title"),
                "titleHtml": point_payload.get("title"),
                "description": point_payload.get("description"),
                "descriptionHtml": point_payload.get("description"),
                "date": point_payload.get("date"),
                "url": point_payload.get("url"),
                "chunkText": point_payload.get("text")
                or point_payload.get("chunk_text")
                or point_payload.get("chunk")
                or point_payload.get("content"),
                "chunkTimestamp": point_payload.get("timestamp")
                or point_payload.get("start_seconds")
                or point_payload.get("start"),
                "toHighlight": highlight_entries,
                "highlightSource": {
                    "primary": bool(highlight_entries),
                    "secondary": False,
                },
                "distance": point.get("score"),
                "internal_references_count": point_payload.get("internal_references_count", 0),
                "internal_references": point_payload.get("internal_references", []),
                "referenced_by_count": point_payload.get("referenced_by_count", 0),
                "referenced_by": point_payload.get("referenced_by", []),
                "video_status": point_payload.get("video_status"),
                "duration": point_payload.get("duration"),
            }
        )
        if (not channel_label) and point_payload.get("channel_id"):
            missing_channel_ids.add(str(point_payload.get("channel_id")))

    if missing_channel_ids:
        # Best-effort: resolve display names from Elasticsearch; a failure
        # here only leaves the names blank, so it is logged and ignored.
        try:
            es_lookup = client.search(
                index=index,
                body={
                    "size": len(missing_channel_ids) * 2,
                    "_source": ["channel_id", "channel_name"],
                    "query": {"terms": {"channel_id.keyword": list(missing_channel_ids)}},
                },
            )
            hits = es_lookup.get("hits", {}).get("hits", [])
            channel_lookup = {}
            for hit in hits:
                src = hit.get("_source", {}) or {}
                cid = src.get("channel_id")
                cname = src.get("channel_name")
                if cid and cname and cid not in channel_lookup:
                    channel_lookup[cid] = cname
            for item in items:
                if not item.get("channel_name"):
                    cid = item.get("channel_id")
                    if cid and cid in channel_lookup:
                        item["channel_name"] = channel_lookup[cid]
        except Exception as exc:
            LOGGER.debug("Vector channel lookup failed: %s", exc)

    return jsonify(
        {
            "items": items,
            "totalResults": len(items),
            "offset": offset,
        }
    )
@app.route("/api/transcript")
def transcript():
video_id = request.args.get("video_id", type=str)
@ -1634,7 +1828,8 @@ def build_full_graph_payload(
def main() -> None:  # pragma: no cover
    """Configure logging and run the development server on port 8080.

    Debug mode is opt-in through the ``FLASK_DEBUG`` environment variable
    (``1`` or ``true``). The server binds to all interfaces, so the
    interactive debugger must never be enabled unconditionally.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    app = create_app()
    # Single run call: the leftover unconditional ``debug=True`` run is gone.
    debug_mode = os.environ.get("FLASK_DEBUG", "0").lower() in ("1", "true")
    app.run(host="0.0.0.0", port=8080, debug=debug_mode)
if __name__ == "__main__": # pragma: no cover

View File

@ -45,6 +45,10 @@
const aboutBtn = document.getElementById("aboutBtn");
const aboutPanel = document.getElementById("aboutPanel");
const aboutCloseBtn = document.getElementById("aboutCloseBtn");
const rssButton = document.getElementById("rssButton");
const rssFeedLink = document.getElementById("rssFeedLink");
const channelListLink = document.getElementById("channelListLink");
const channelCount = document.getElementById("channelCount");
const resultsDiv = document.getElementById("results");
const metaDiv = document.getElementById("meta");
const metricsContainer = document.getElementById("metrics");
@ -60,7 +64,7 @@
const DEFAULT_SETTINGS = {
channel: "",
year: "",
sort: "relevant",
sort: "newer",
size: "10",
exact: true,
fuzzy: true,
@ -406,6 +410,57 @@
}
}
/**
 * Populate the RSS button, the About-panel feed link and the channel count
 * from `/api/channel-list`. Any failure (network error, non-2xx status,
 * unparsable body) puts every widget into its "unavailable" state.
 */
async function loadChannelListInfo() {
  // Skip the request only when none of the widgets exist (rssButton included).
  if (!rssFeedLink && !rssButton && !channelListLink && !channelCount) return;

  // Show the feed URL as a link, or "Unavailable" when there is none.
  const applyFeedLink = (feedUrl) => {
    if (!rssFeedLink) return;
    if (feedUrl) {
      rssFeedLink.href = feedUrl;
      rssFeedLink.textContent = feedUrl;
    } else {
      rssFeedLink.textContent = "Unavailable";
      rssFeedLink.removeAttribute("href");
    }
  };

  // Enable the RSS button when a feed URL is known, otherwise disable it.
  const applyRssButton = (feedUrl) => {
    if (!rssButton) return;
    if (feedUrl) {
      rssButton.href = feedUrl;
      rssButton.classList.remove("is-disabled");
      rssButton.removeAttribute("aria-disabled");
    } else {
      rssButton.removeAttribute("href");
      rssButton.classList.add("is-disabled");
      rssButton.setAttribute("aria-disabled", "true");
    }
  };

  try {
    const res = await fetch("/api/channel-list");
    if (!res.ok) {
      // e.g. a 429 from the API rate limiter: its JSON body is not a channel payload.
      throw new Error(`Channel list request failed (${res.status})`);
    }
    const payload = await res.json();
    const feedUrl = payload.rss_feed_url || "";

    applyFeedLink(feedUrl);
    applyRssButton(feedUrl);

    if (channelCount) {
      const count = Array.isArray(payload.channels) ? payload.channels.length : 0;
      channelCount.textContent = count ? `${count} channels` : "No channels loaded";
    }
    if (channelListLink && payload.error) {
      channelListLink.textContent = "Channel list unavailable";
    }
  } catch (err) {
    console.error("Failed to load channel list", err);
    applyFeedLink("");
    applyRssButton("");
    if (channelCount) {
      channelCount.textContent = "Channel list unavailable";
    }
  }
}
function updateUrl(q) {
const next = new URL(window.location.href);
if (q) {
@ -1340,10 +1395,12 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
}
const el = document.createElement("div");
el.className = "item";
const rawTitle = item.title || "Untitled";
const rawDescription = item.description || "";
const titleHtml =
item.titleHtml || escapeHtml(item.title || "Untitled");
item.titleHtml || escapeHtml(rawTitle);
const descriptionHtml =
item.descriptionHtml || escapeHtml(item.description || "");
item.descriptionHtml || escapeHtml(rawDescription);
const header = document.createElement("div");
header.className = "result-header";
@ -1395,7 +1452,11 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
}
const titleEl = document.createElement("strong");
if (item.titleHtml) {
titleEl.innerHTML = titleHtml;
} else {
titleEl.textContent = rawTitle;
}
headerMain.appendChild(titleEl);
const metaLine = document.createElement("div");
@ -1519,7 +1580,11 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
if (descriptionHtml) {
const desc = document.createElement("div");
desc.className = "muted description-block";
if (item.descriptionHtml) {
desc.innerHTML = descriptionHtml;
} else {
desc.textContent = rawDescription;
}
el.appendChild(desc);
}
@ -1722,6 +1787,7 @@ window.addEventListener("popstate", () => {
setFromQuery();
loadMetrics();
loadYears();
loadChannelListInfo();
loadChannels().then(() => runSearch(currentPage));
})();

View File

@ -5,9 +5,9 @@
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>TLC Search</title>
<link rel="icon" href="/static/favicon.png" type="image/png" />
<link rel="stylesheet" href="https://unpkg.com/xp.css" />
<link rel="stylesheet" href="https://unpkg.com/xp.css" integrity="sha384-isKk8ZXKlU28/m3uIrnyTfuPaamQIF4ONLeGSfsWGEe3qBvaeLU5wkS4J7cTIwxI" crossorigin="anonymous" />
<link rel="stylesheet" href="/static/style.css" />
<script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js" integrity="sha384-CjloA8y00+1SDAUkjs099PVfnY2KmDC2BZnws9kh8D/lX1s46w6EPhpXdqMfjK6i" crossorigin="anonymous"></script>
</head>
<body>
<div class="window" style="max-width: 1200px; margin: 20px auto;">
@ -21,11 +21,23 @@
</div>
</div>
<div class="window-body">
<div class="window-actions">
<a
id="rssButton"
class="rss-button"
href="/rss"
target="_blank"
rel="noopener"
title="Unified RSS feed"
aria-label="Unified RSS feed"
>
<svg class="rss-button__icon" viewBox="0 0 24 24" aria-hidden="true">
<path d="M6 18a2 2 0 1 0 0 4a2 2 0 0 0 0-4zm-4 6a4 4 0 0 1 4-4a4 4 0 0 1 4 4h-2a2 2 0 0 0-2-2a2 2 0 0 0-2 2zm0-8v-2c6.627 0 12 5.373 12 12h-2c0-5.523-4.477-10-10-10zm0-4V4c11.046 0 20 8.954 20 20h-2c0-9.941-8.059-18-18-18z"/>
</svg>
<span class="rss-button__label">RSS</span>
</a>
</div>
<p>Enter a phrase to query title, description, and transcript text.</p>
<p style="font-size: 11px;">
Looking for semantic matches? Try the
<a href="/vector-search">vector search beta</a>.
</p>
<fieldset>
<legend>Search</legend>
@ -133,6 +145,15 @@
<p>Use the toggles to choose exact, fuzzy, or phrase matching. Query string mode accepts raw Lucene syntax.</p>
<p>Results are ranked by your chosen sort order; the timeline summarizes the same query.</p>
<p>You can download transcripts, copy MLA citations, or explore references via the graph button.</p>
<div class="about-panel__section">
<div class="about-panel__label">Unified RSS feed</div>
<a id="rssFeedLink" href="#" target="_blank" rel="noopener">Loading…</a>
</div>
<div class="about-panel__section">
<div class="about-panel__label">Channel list</div>
<a id="channelListLink" href="/api/channel-list" target="_blank" rel="noopener">View JSON</a>
<div id="channelCount" class="about-panel__meta"></div>
</div>
</div>
</div>

View File

@ -510,6 +510,22 @@ body.modal-open {
color: #000;
}
/* About panel: one bordered sub-section (used for the RSS feed and channel list entries). */
.about-panel__section {
margin-top: 8px;
padding-top: 6px;
border-top: 1px solid #c0c0c0;
}
/* About panel: bold heading shown above a sub-section's content. */
.about-panel__label {
font-weight: bold;
margin-bottom: 2px;
}
/* About panel: small, muted secondary text (e.g. the channel count). */
.about-panel__meta {
font-size: 10px;
color: #555;
}
.about-panel__header button {
border: none;
background: transparent;
@ -549,6 +565,50 @@ body.modal-open {
box-sizing: border-box;
}
/* Action row at the top of the window body; right-aligns its children. */
.window-actions {
display: flex;
justify-content: flex-end;
margin-bottom: 6px;
}
/* RSS link drawn as a raised button using the system button colors. */
.rss-button {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 2px 6px;
border: 1px solid;
border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight;
background: ButtonFace;
color: #000;
text-decoration: none;
font-size: 11px;
cursor: pointer;
}
.rss-button:hover {
background: #f3f3f3;
}
/* Pressed state: swap highlight and shadow edges so the button looks sunken. */
.rss-button:active {
border-color: ButtonShadow ButtonHighlight ButtonHighlight ButtonShadow;
}
/* Disabled state toggled from script when no feed URL is available; blocks clicks. */
.rss-button.is-disabled {
opacity: 0.5;
cursor: default;
pointer-events: none;
}
/* Inline SVG feed icon. */
.rss-button__icon {
width: 14px;
height: 14px;
fill: #f38b00;
}
.rss-button__label {
font-weight: bold;
}
/* Badges */
.badge-row {
margin-top: 6px;

View File

@ -1,46 +0,0 @@
<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>TLC Vector Search</title>
<link rel="icon" href="/static/favicon.png" type="image/png" />
<link rel="stylesheet" href="https://unpkg.com/xp.css" />
<link rel="stylesheet" href="/static/style.css" />
</head>
<body>
<div class="window" style="max-width: 1200px; margin: 20px auto;">
<div class="title-bar">
<div class="title-bar-text">Vector Search (Experimental)</div>
<div class="title-bar-controls">
<a class="title-bar-link" href="/">⬅ Back to Search</a>
</div>
</div>
<div class="window-body">
<p>Enter a natural language prompt; results come from the Qdrant vector index.</p>
<fieldset>
<legend>Vector Query</legend>
<div class="field-row" style="margin-bottom: 8px;">
<label for="vectorQuery" style="width: 60px;">Query:</label>
<input id="vectorQuery" type="text" placeholder="Describe what you are looking for" style="flex: 1;" />
<button id="vectorSearchBtn">Search</button>
</div>
</fieldset>
<div id="vectorMeta" style="margin-top: 12px; font-size: 11px;"></div>
<fieldset style="margin-top: 16px;">
<legend>Results</legend>
<div id="vectorResults"></div>
</fieldset>
</div>
<div class="status-bar">
<p class="status-bar-field">Experimental mode • Qdrant</p>
</div>
</div>
<script src="/static/vector.js"></script>
</body>
</html>

View File

@ -1,423 +0,0 @@
(() => {
const queryInput = document.getElementById("vectorQuery");
const searchBtn = document.getElementById("vectorSearchBtn");
const resultsDiv = document.getElementById("vectorResults");
const metaDiv = document.getElementById("vectorMeta");
const transcriptCache = new Map();
if (!queryInput || !searchBtn || !resultsDiv || !metaDiv) {
console.error("Vector search elements missing");
return;
}
/** Utility helpers **/
/**
 * Escape the five HTML-significant characters (& < > " ') in a value.
 * Falsy input yields ""; truthy non-string input (e.g. a number) is
 * stringified first instead of throwing on `.replace`.
 */
const escapeHtml = (str) => {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  const text = str ? String(str) : "";
  return text.replace(/[&<>"']/g, (ch) => entities[ch]);
};
/**
 * Return the part of a timestamp string before the first "T" (the date
 * part of an ISO datetime). Falsy input gives ""; a value that cannot be
 * split is returned unchanged.
 */
const fmtDate = (value) => {
  try {
    const [datePart] = (value || "").split("T");
    return datePart;
  } catch {
    return value;
  }
};
/** Format a numeric similarity score to three decimals; anything else gives "". */
const fmtSimilarity = (score) => {
  const isUsable = typeof score === "number" && !Number.isNaN(score);
  return isUsable ? score.toFixed(3) : "";
};
/** Lower-cased `video_status` of an item, or "" when the item or field is missing. */
const getVideoStatus = (item) => {
  if (item && item.video_status) {
    return String(item.video_status).toLowerCase();
  }
  return "";
};
/** True when the item's status is exactly "deleted" (case-insensitive). */
const isLikelyDeleted = (item) => "deleted" === getVideoStatus(item);
/**
 * Format a number of seconds as "m:ss", or "h:mm:ss" once it reaches an hour.
 * A missing value (anything falsy other than 0) gives "00:00".
 */
const formatTimestamp = (seconds) => {
  const isMissing = !seconds && seconds !== 0;
  if (isMissing) return "00:00";

  const twoDigits = (n) => n.toString().padStart(2, "0");
  const hours = Math.floor(seconds / 3600);
  const mins = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);

  return hours > 0
    ? `${hours}:${twoDigits(mins)}:${twoDigits(secs)}`
    : `${mins}:${twoDigits(secs)}`;
};
/**
 * Resolve a display timestamp for a transcript segment. A truthy
 * `timestamp` field is returned as-is; otherwise the first of
 * start_seconds / start / offset / time that parses as a number is
 * formatted with formatTimestamp. Returns "" when nothing usable exists.
 */
const formatSegmentTimestamp = (segment) => {
  if (!segment) return "";
  if (segment.timestamp) return segment.timestamp;

  const candidates = [
    segment.start_seconds,
    segment.start,
    segment.offset,
    segment.time,
  ];
  const firstNumeric = candidates
    .filter((raw) => raw != null)
    .map((raw) => parseFloat(raw))
    .find((parsed) => !Number.isNaN(parsed));

  return firstNumeric === undefined ? "" : formatTimestamp(firstNumeric);
};
/**
 * Render one transcript section as "<label>\n<content>\n".
 * A non-blank `fullText` is preferred; otherwise the `parts` array is
 * rendered one segment per line, prefixed with "[timestamp] " when one can
 * be resolved. Returns "" when there is no content.
 */
const serializeTranscriptSection = (label, parts, fullText) => {
  const renderSegment = (segment) => {
    const stamp = formatSegmentTimestamp(segment);
    const body = segment && segment.text ? segment.text : "";
    return stamp ? `[${stamp}] ${body}` : body;
  };

  let content = "";
  const hasFullText = typeof fullText === "string" && fullText.trim();
  if (hasFullText) {
    content = fullText.trim();
  } else if (Array.isArray(parts) && parts.length) {
    content = parts.map((segment) => renderSegment(segment)).join("\n").trim();
  }

  return content ? `${label}\n${content}\n` : "";
};
/**
 * Fetch transcript JSON for a video from `/api/transcript`, memoising the
 * result in `transcriptCache`. Resolves to null for a missing id and
 * throws on a non-OK HTTP status.
 */
const fetchTranscriptData = async (videoId) => {
  if (!videoId) return null;

  const alreadyCached = transcriptCache.has(videoId);
  if (alreadyCached) return transcriptCache.get(videoId);

  const endpoint = `/api/transcript?video_id=${encodeURIComponent(videoId)}`;
  const res = await fetch(endpoint);
  if (!res.ok) {
    throw new Error(`Transcript fetch failed (${res.status})`);
  }

  const data = await res.json();
  transcriptCache.set(videoId, data);
  return data;
};
/**
 * Build the plain-text transcript download: a metadata header (title,
 * then channel / published date / URL when present), a blank line, and the
 * primary and secondary transcript sections, or a "No transcript
 * available." notice when both are empty.
 */
const buildTranscriptDownloadText = (item, transcriptData) => {
  const lines = [`Title: ${item.title || "Untitled"}`];

  const optionalHeaders = [
    ["Channel", item.channel_name],
    ["Published", item.date],
    ["URL", item.url],
  ];
  optionalHeaders.forEach(([label, value]) => {
    if (value) lines.push(`${label}: ${value}`);
  });
  lines.push("");

  const sections = [
    serializeTranscriptSection(
      "Primary Transcript",
      transcriptData.transcript_parts,
      transcriptData.transcript_full
    ),
    serializeTranscriptSection(
      "Secondary Transcript",
      transcriptData.transcript_secondary_parts,
      transcriptData.transcript_secondary_full
    ),
  ].filter(Boolean);

  if (sections.length) {
    lines.push(...sections);
  } else {
    lines.push("No transcript available.");
  }
  return lines.join("\n").trim() + "\n";
};
/**
 * Temporarily replace a button's label with `message`, restoring it after
 * `duration` ms. The original label is remembered in
 * `dataset.originalLabel`, so an overlapping flash restores the real label
 * rather than the previous flash text.
 */
const flashButtonMessage = (button, message, duration = 1800) => {
  if (!button) return;

  const { dataset } = button;
  const restoreLabel = dataset.originalLabel || button.textContent;
  dataset.originalLabel = restoreLabel;
  button.textContent = message;

  const restore = () => {
    button.textContent = button.dataset.originalLabel || restoreLabel;
  };
  setTimeout(restore, duration);
};
/**
 * Download the transcript of `item` as `<video_id>.txt`. The button is
 * disabled for the duration of the operation and an alert is shown when
 * the transcript cannot be fetched.
 */
const handleTranscriptDownload = async (item, button) => {
  if (!item.video_id) return;

  button.disabled = true;
  try {
    const transcriptData = await fetchTranscriptData(item.video_id);
    if (!transcriptData) throw new Error("Transcript unavailable");

    const fileBody = buildTranscriptDownloadText(item, transcriptData);
    const objectUrl = URL.createObjectURL(
      new Blob([fileBody], { type: "text/plain" })
    );

    // A temporary anchor triggers the browser's download flow.
    const anchor = document.createElement("a");
    anchor.href = objectUrl;
    anchor.download = `${item.video_id}.txt`;
    document.body.appendChild(anchor);
    anchor.click();
    document.body.removeChild(anchor);
    URL.revokeObjectURL(objectUrl);

    flashButtonMessage(button, "Downloaded");
  } catch (err) {
    console.error("Download failed", err);
    alert("Unable to download transcript right now.");
  } finally {
    button.disabled = false;
  }
};
/**
 * Format a date value MLA-style, e.g. "5 Jan. 2024".
 *
 * Values starting with an ISO calendar date (YYYY-MM-DD) are formatted
 * from those digits directly. Parsing "2024-01-05" with `new Date` gives
 * UTC midnight, and reading it back with the local getters shifted the
 * day by one in time zones behind UTC. Other parseable values still go
 * through `Date` with local getters. Falsy input gives "n.d.";
 * unparseable input is returned unchanged.
 */
const formatMlaDate = (value) => {
  if (!value) return "n.d.";
  const months = [
    "Jan.", "Feb.", "Mar.", "Apr.", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
  ];

  const isoMatch =
    typeof value === "string" ? /^(\d{4})-(\d{2})-(\d{2})/.exec(value) : null;
  if (isoMatch) {
    const year = Number(isoMatch[1]);
    const month = Number(isoMatch[2]);
    const day = Number(isoMatch[3]);
    if (month >= 1 && month <= 12 && day >= 1 && day <= 31) {
      return `${day} ${months[month - 1]} ${year}`;
    }
  }

  const parsed = new Date(value);
  if (Number.isNaN(parsed.valueOf())) return value;
  return `${parsed.getDate()} ${months[parsed.getMonth()]} ${parsed.getFullYear()}`;
};
/**
 * Build an MLA-style citation string for a video result. The channel name
 * (falling back to the channel id, then "Unknown") serves as both author
 * and uploader; the access date is today's date.
 */
const buildMlaCitation = (item) => {
  const todayIso = new Date().toISOString().split("T")[0];
  const accessed = formatMlaDate(todayIso);
  const published = formatMlaDate(item.date);
  const link = item.url || "";
  const videoTitle = (item.title || "Untitled").trim();
  const author = (item.channel_name || item.channel_id || "Unknown").trim();
  return `${author}. "${videoTitle}." YouTube, uploaded by ${author}, ${published}, ${link}. Accessed ${accessed}.`;
};
/**
 * Copy the MLA citation for `item` to the clipboard and flash "Copied!" on
 * the button. Uses the async Clipboard API in secure contexts and a hidden
 * textarea with `execCommand("copy")` otherwise; on failure the citation
 * is shown in an alert.
 */
const handleCopyCitation = async (item, button) => {
  const citation = buildMlaCitation(item);

  // Fallback path for contexts without the async Clipboard API.
  const copyViaTextarea = () => {
    const scratch = document.createElement("textarea");
    scratch.value = citation;
    scratch.style.position = "fixed";
    scratch.style.opacity = "0";
    document.body.appendChild(scratch);
    scratch.select();
    document.execCommand("copy");
    document.body.removeChild(scratch);
  };

  try {
    const canUseClipboardApi = navigator.clipboard && window.isSecureContext;
    if (canUseClipboardApi) {
      await navigator.clipboard.writeText(citation);
    } else {
      copyViaTextarea();
    }
    flashButtonMessage(button, "Copied!");
  } catch (err) {
    console.error("Citation copy failed", err);
    alert(citation);
  }
};
/** Rendering helpers **/
/**
 * Build the highlight list element for a result, one row per entry.
 * Returns null when `entries` is not a non-empty array. Each entry's
 * `html` (or `text`, or the entry itself) is inserted as HTML.
 */
const createHighlightRows = (entries) => {
  const hasEntries = Array.isArray(entries) && entries.length;
  if (!hasEntries) return null;

  const list = document.createElement("div");
  list.className = "transcript highlight-list";

  for (const entry of entries) {
    if (!entry) continue;

    const row = document.createElement("div");
    row.className = "highlight-row";

    const textBlock = document.createElement("div");
    textBlock.className = "highlight-text";
    const markup = entry.html || entry.text || entry;
    textBlock.innerHTML = markup || "";
    row.appendChild(textBlock);

    const marker = document.createElement("span");
    marker.className = "highlight-source-indicator highlight-source-indicator--primary";
    marker.title = "Vector highlight";
    row.appendChild(marker);

    list.appendChild(row);
  }
  return list;
};
/**
 * Build the action bar for a result: transcript download, citation copy,
 * and a graph launcher (disabled when the item has no video id).
 */
const createActions = (item) => {
  const bar = document.createElement("div");
  bar.className = "result-actions";

  // Create a plain action button with the given class list and label.
  const makeButton = (className, label) => {
    const btn = document.createElement("button");
    btn.type = "button";
    btn.className = className;
    btn.textContent = label;
    return btn;
  };

  const downloadBtn = makeButton("result-action-btn", "Download transcript");
  downloadBtn.addEventListener("click", () => handleTranscriptDownload(item, downloadBtn));
  bar.appendChild(downloadBtn);

  const citationBtn = makeButton("result-action-btn", "Copy citation");
  citationBtn.addEventListener("click", () => handleCopyCitation(item, citationBtn));
  bar.appendChild(citationBtn);

  const graphBtn = makeButton("result-action-btn graph-launch-btn", "Graph");
  graphBtn.disabled = !item.video_id;
  graphBtn.addEventListener("click", () => {
    if (!item.video_id) return;
    const target = `/graph?video_id=${encodeURIComponent(item.video_id)}`;
    window.open(target, "_blank", "noopener");
  });
  bar.appendChild(graphBtn);

  return bar;
};
/**
 * Render the vector-search response into `resultsDiv` and update `metaDiv`.
 *
 * Title and description are written with `textContent`. The vector API
 * fills `titleHtml` / `descriptionHtml` with the raw stored text (no
 * highlight markup), so inserting them through `innerHTML` would let
 * markup embedded in a video title or description run in the page.
 */
const renderVectorResults = (payload) => {
  resultsDiv.innerHTML = "";
  const items = payload.items || [];
  if (!items.length) {
    metaDiv.textContent = "No vector matches for this prompt.";
    return;
  }
  metaDiv.textContent = `Matches: ${items.length} (vector mode)`;

  items.forEach((item) => {
    const el = document.createElement("div");
    el.className = "item";

    const header = document.createElement("div");
    header.className = "result-header";
    const headerMain = document.createElement("div");
    headerMain.className = "result-header-main";

    const titleEl = document.createElement("strong");
    // Plain text only: see the function comment for why innerHTML is avoided.
    titleEl.textContent = item.title || item.titleHtml || "Untitled";
    headerMain.appendChild(titleEl);

    const metaLine = document.createElement("div");
    metaLine.className = "muted result-meta";
    const channelLabel = item.channel_name || item.channel_id || "Unknown";
    const dateLabel = fmtDate(item.date);

    // Duration may arrive as a number or a numeric string.
    let durationSeconds = null;
    if (typeof item.duration === "number") {
      durationSeconds = item.duration;
    } else if (typeof item.duration === "string" && item.duration.trim()) {
      const parsed = parseFloat(item.duration);
      if (!Number.isNaN(parsed)) {
        durationSeconds = parsed;
      }
    }
    const durationLabel = durationSeconds != null ? `${formatTimestamp(durationSeconds)}` : "";
    metaLine.textContent = channelLabel ? `${channelLabel}${dateLabel}${durationLabel}` : `${dateLabel}${durationLabel}`;

    if (isLikelyDeleted(item)) {
      metaLine.appendChild(document.createTextNode(" "));
      const statusEl = document.createElement("span");
      statusEl.className = "result-status result-status--deleted";
      statusEl.textContent = "Likely deleted";
      metaLine.appendChild(statusEl);
    }
    headerMain.appendChild(metaLine);

    if (item.url) {
      const linkLine = document.createElement("div");
      linkLine.className = "muted";
      const anchor = document.createElement("a");
      anchor.href = item.url;
      anchor.target = "_blank";
      anchor.rel = "noopener";
      anchor.textContent = "Open on YouTube";
      linkLine.appendChild(anchor);
      headerMain.appendChild(linkLine);
    }

    if (typeof item.distance === "number") {
      const scoreLine = document.createElement("div");
      scoreLine.className = "muted";
      scoreLine.textContent = `Similarity score: ${fmtSimilarity(item.distance)}`;
      headerMain.appendChild(scoreLine);
    }

    header.appendChild(headerMain);
    header.appendChild(createActions(item));
    el.appendChild(header);

    const descriptionText = item.description || item.descriptionHtml;
    if (descriptionText) {
      const desc = document.createElement("div");
      desc.className = "muted description-block";
      desc.textContent = descriptionText;
      el.appendChild(desc);
    }

    if (item.chunkText) {
      const chunkBlock = document.createElement("div");
      chunkBlock.className = "vector-chunk";
      if (item.chunkTimestamp && item.url) {
        const tsObj =
          typeof item.chunkTimestamp === "object"
            ? item.chunkTimestamp
            : { timestamp: item.chunkTimestamp };
        const ts = formatSegmentTimestamp(tsObj);
        const tsLink = document.createElement("a");
        const paramValue =
          typeof item.chunkTimestamp === "number"
            ? Math.floor(item.chunkTimestamp)
            : item.chunkTimestamp;
        tsLink.href = `${item.url}${item.url.includes("?") ? "&" : "?"}t=${encodeURIComponent(
          paramValue
        )}`;
        tsLink.target = "_blank";
        tsLink.rel = "noopener";
        tsLink.textContent = ts ? `[${ts}]` : "[timestamp]";
        chunkBlock.appendChild(tsLink);
        chunkBlock.appendChild(document.createTextNode(" "));
      }
      const chunkTextSpan = document.createElement("span");
      chunkTextSpan.textContent = item.chunkText;
      chunkBlock.appendChild(chunkTextSpan);
      el.appendChild(chunkBlock);
    }

    const highlights = createHighlightRows(item.toHighlight);
    if (highlights) {
      el.appendChild(highlights);
    }

    resultsDiv.appendChild(el);
  });
};
/** Search handler **/
/**
 * Read the query box, POST it to `/api/vector-search`, and render the
 * results. The search button is disabled while the request is in flight;
 * any HTTP, network, or API-reported error shows "Vector search unavailable."
 */
const runVectorSearch = async () => {
  const query = queryInput.value.trim();
  if (!query) {
    alert("Please enter a query.");
    return;
  }

  const markUnavailable = () => {
    metaDiv.textContent = "Vector search unavailable.";
  };

  metaDiv.textContent = "Searching vector index…";
  resultsDiv.innerHTML = "";
  searchBtn.disabled = true;

  try {
    const requestInit = {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ query }),
    };
    const res = await fetch("/api/vector-search", requestInit);
    if (!res.ok) {
      throw new Error(`Vector search failed (${res.status})`);
    }
    const data = await res.json();
    if (data.error) {
      markUnavailable();
    } else {
      renderVectorResults(data);
    }
  } catch (err) {
    console.error(err);
    markUnavailable();
  } finally {
    searchBtn.disabled = false;
  }
};
searchBtn.addEventListener("click", runVectorSearch);
queryInput.addEventListener("keypress", (event) => {
if (event.key === "Enter") {
runVectorSearch();
}
});
})();

74
urls.txt Normal file
View File

@ -0,0 +1,74 @@
https://www.youtube.com/channel/UCCebR16tXbv5Ykk9_WtCCug/videos
https://www.youtube.com/channel/UC6vg0HkKKlgsWk-3HfV-vnw/videos
https://www.youtube.com/channel/UCeWWxwzgLYUbfjWowXhVdYw/videos
https://www.youtube.com/channel/UC952hDf_C4nYJdqwK7VzTxA/videos
https://www.youtube.com/channel/UCU5SNBfTo4umhjYz6M0Jsmg/videos
https://www.youtube.com/channel/UC6Tvr9mBXNaAxLGRA_sUSRA/videos
https://www.youtube.com/channel/UC4Rmxg7saTfwIpvq3QEzylQ/videos
https://www.youtube.com/channel/UCTdH4nh6JTcfKUAWvmnPoIQ/videos
https://www.youtube.com/channel/UCsi_x8c12NW9FR7LL01QXKA/videos
https://www.youtube.com/channel/UCAqTQ5yLHHH44XWwWXLkvHQ/videos
https://www.youtube.com/channel/UCprytROeCztMOMe8plyJRMg/videos
https://www.youtube.com/channel/UCpqDUjTsof-kTNpnyWper_Q/videos
https://www.youtube.com/channel/UCL_f53ZEJxp8TtlOkHwMV9Q/videos
https://www.youtube.com/channel/UCez1fzMRGctojfis2lfRYug/videos
https://www.youtube.com/channel/UC2leFZRD0ZlQDQxpR2Zd8oA/videos
https://www.youtube.com/channel/UC8SErJkYnDsYGh1HxoZkl-g/videos
https://www.youtube.com/channel/UCEPOn4cgvrrerg_-q_Ygw1A/videos
https://www.youtube.com/channel/UC2yCyOMUeem-cYwliC-tLJg/videos
https://www.youtube.com/channel/UCGsDIP_K6J6VSTqlq-9IPlg/videos
https://www.youtube.com/channel/UCEzWTLDYmL8soRdQec9Fsjw/videos
https://www.youtube.com/channel/UC1KgNsMdRoIA_njVmaDdHgA/videos
https://www.youtube.com/channel/UCFQ6Gptuq-sLflbJ4YY3Umw/videos
https://www.youtube.com/channel/UCEY1vGNBPsC3dCatZyK3Jkw/videos
https://www.youtube.com/channel/UCIAtCuzdvgNJvSYILnHtdWA/videos
https://www.youtube.com/channel/UClIDP7_Kzv_7tDQjTv9EhrA/videos
https://www.youtube.com/channel/UC-QiBn6GsM3JZJAeAQpaGAA/videos
https://www.youtube.com/channel/UCiJmdXTb76i8eIPXdJyf8ZQ/videos
https://www.youtube.com/channel/UCM9Z05vuQhMEwsV03u6DrLA/videos
https://www.youtube.com/channel/UCgp_r6WlBwDSJrP43Mz07GQ/videos
https://www.youtube.com/channel/UC5uv-BxzCrN93B_5qbOdRWw/videos
https://www.youtube.com/channel/UCtCTSf3UwRU14nYWr_xm-dQ/videos
https://www.youtube.com/channel/UC1a4VtU_SMSfdRiwMJR33YQ/videos
https://www.youtube.com/channel/UCg7Ed0lecvko58ibuX1XHng/videos
https://www.youtube.com/channel/UCMVG5eqpYFVEB-a9IqAOuHA/videos
https://www.youtube.com/channel/UC8mJqpS_EBbMcyuzZDF0TEw/videos
https://www.youtube.com/channel/UCGHuURJ1XFHzPSeokf6510A/videos
https://www.youtube.com/@chrishoward8473/videos
https://www.youtube.com/channel/UChptV-kf8lnncGh7DA2m8Pw/videos
https://www.youtube.com/channel/UCzX6R3ZLQh5Zma_5AsPcqPA/videos
https://www.youtube.com/channel/UCiukuaNd_qzRDTW9qe2OC1w/videos
https://www.youtube.com/channel/UC5yLuFQCms4nb9K2bGQLqIw/videos
https://www.youtube.com/channel/UCVdSgEf9bLXFMBGSMhn7x4Q/videos
https://www.youtube.com/channel/UC_dnk5D4tFCRYCrKIcQlcfw/videos
https://www.youtube.com/@Freerilian/videos
https://www.youtube.com/@marks.-ry7bm/videos
https://www.youtube.com/@Adams-Fall/videos
https://www.youtube.com/@mcmosav/videos
https://www.youtube.com/@Landbeorht/videos
https://www.youtube.com/@Corner_Citizen/videos
https://www.youtube.com/@ethan.caughey/videos
https://www.youtube.com/@MarcInTbilisi/videos
https://www.youtube.com/@climbingmt.sophia/videos
https://www.youtube.com/@Skankenstein/videos
https://www.youtube.com/@UpCycleClub/videos
https://www.youtube.com/@JessPurviance/videos
https://www.youtube.com/@greyhamilton52/videos
https://www.youtube.com/@paulrenenichols/videos
https://www.youtube.com/@OfficialSecularKoranism/videos
https://www.youtube.com/@FromWhomAllBlessingsFlow/videos
https://www.youtube.com/@FoodTruckEmily/videos
https://www.youtube.com/@O.G.Rose.Michelle.and.Daniel/videos
https://www.youtube.com/@JonathanDumeer/videos
https://www.youtube.com/@JordanGreenhall/videos
https://www.youtube.com/@NechamaGluck/videos
https://www.youtube.com/@justinsmorningcoffee/videos
https://www.youtube.com/@grahampardun/videos
https://www.youtube.com/@michaelmartin8681/videos
https://www.youtube.com/@davidbusuttil9086/videos
https://www.youtube.com/@matthewparlato5626/videos
https://www.youtube.com/@lancecleaver227/videos
https://www.youtube.com/@theplebistocrat/videos
https://www.youtube.com/@RightInChrist/videos
https://www.youtube.com/@RafeKelley/videos
https://www.youtube.com/@WavesOfObsession/videos