From 30503628b584b17c613365b5c80a2611e6a8589b Mon Sep 17 00:00:00 2001 From: knight Date: Thu, 8 Jan 2026 22:53:30 -0500 Subject: [PATCH] Add unified channel feed --- .dockerignore | 2 + .gitignore | 4 + Makefile | 87 +++++++++++ channel_config.py | 162 ++++++++++++++++++++ channels.yml | 258 ++++++++++++++++++++++++++++++++ config.py | 14 ++ docker-compose.yml | 45 +++++- feed-master-config/fm.yml | 166 +++++++++++++++++++++ generate_feed_config.py | 91 ++++++++++++ generate_feed_config_simple.py | 88 +++++++++++ search_app.py | 262 +++++++++++++++++++++++++++++++++ static/app.js | 56 +++++++ static/index.html | 25 ++++ static/style.css | 60 ++++++++ 14 files changed, 1319 insertions(+), 1 deletion(-) create mode 100644 Makefile create mode 100644 channel_config.py create mode 100644 channels.yml create mode 100644 feed-master-config/fm.yml create mode 100644 generate_feed_config.py create mode 100755 generate_feed_config_simple.py diff --git a/.dockerignore b/.dockerignore index 0d1dd24..34a0883 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,3 +9,5 @@ node_modules data videos *.log +feed-master-config/var +feed-master-config/images diff --git a/.gitignore b/.gitignore index ac732eb..5cf4a06 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,10 @@ Thumbs.db # Logs *.log +# Feed Master runtime cache +feed-master-config/var/ +feed-master-config/images/ + # Testing .pytest_cache/ .coverage diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7e4eb88 --- /dev/null +++ b/Makefile @@ -0,0 +1,87 @@ +# Makefile for TLC Search + Feed Master + +.PHONY: help config up down restart logs status update-channels + +help: + @echo "TLC Search + Feed Master - Management Commands" + @echo "" + @echo "Configuration:" + @echo " make config - Regenerate feed-master configuration from channels.yml" + @echo "" + @echo "Service Management:" + @echo " make up - Start all services" + @echo " make down - Stop all services" + @echo " make restart - Restart all 
services" + @echo " make logs - View all service logs" + @echo " make status - Check service status" + @echo "" + @echo "Updates:" + @echo " make update-channels - Regenerate config and restart feed-master" + @echo "" + @echo "Individual Services:" + @echo " make logs-feed - View feed-master logs" + @echo " make logs-bridge - View rss-bridge logs" + @echo " make logs-app - View TLC Search logs" + @echo " make restart-feed - Restart feed-master only" + +# Generate feed-master configuration from channels.yml +config: + @echo "Generating feed-master configuration..." + python3 -m python_app.generate_feed_config_simple + @echo "Configuration updated!" + +# Start all services +up: + docker compose up -d + @echo "" + @echo "Services started!" + @echo " - RSS Bridge: http://localhost:3001" + @echo " - Feed Master: http://localhost:8097/rss/youtube-unified" + @echo " - TLC Search: http://localhost:8080" + +# Stop all services +down: + docker compose down + +# Restart all services +restart: + docker compose restart + +# View all logs +logs: + docker compose logs -f + +# View feed-master logs +logs-feed: + docker compose logs -f feed-master + +# View rss-bridge logs +logs-bridge: + docker compose logs -f rss-bridge + +# View TLC Search logs +logs-app: + docker compose logs -f app + +# Check service status +status: + @docker compose ps + @echo "" + @echo "Endpoints:" + @echo " - RSS Bridge: http://localhost:3001" + @echo " - Feed Master: http://localhost:8097/rss/youtube-unified" + @echo " - TLC Search: http://localhost:8080" + +# Restart only feed-master +restart-feed: + docker compose restart feed-master + +# Pull latest channel URLs and regenerate configuration +update-channels: + @echo "Regenerating feed-master configuration..." + python3 -m python_app.generate_feed_config_simple + @echo "" + @echo "Restarting feed-master..." + docker compose restart feed-master + @echo "" + @echo "Update complete!" 
diff --git a/channel_config.py b/channel_config.py
new file mode 100644
index 0000000..9b742e3
--- /dev/null
+++ b/channel_config.py
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+_CHANNEL_ID_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/channel/([^/?#]+)")
+_HANDLE_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/@([^/?#]+)")
+
+
+def _strip_quotes(value: str) -> str:
+    """Remove one pair of matching single or double quotes around *value*."""
+    if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
+        return value[1:-1]
+    return value
+
+
+def _parse_yaml_channels(text: str) -> List[Dict[str, str]]:
+    """Read the flat ``- key: value`` channel list line by line (no YAML library)."""
+    channels: List[Dict[str, str]] = []
+    current: Dict[str, str] = {}
+
+    for raw_line in text.splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
+            continue
+        if line == "channels:":
+            continue
+        if line.startswith("- "):
+            # A dash opens a new entry; flush the one collected so far.
+            if current:
+                channels.append(current)
+            current = {}
+            line = line[2:].strip()
+            if not line:
+                continue
+        if ":" not in line:
+            continue
+        key, value = line.split(":", 1)
+        current[key.strip()] = _strip_quotes(value.strip())
+
+    if current:
+        channels.append(current)
+    return channels
+
+
+def _extract_from_url(url: str) -> Dict[str, Optional[str]]:
+    """Return the channel id and/or handle found in a YouTube URL (None if absent)."""
+    channel_id = None
+    handle = None
+
+    channel_match = _CHANNEL_ID_PATTERN.search(url)
+    if channel_match:
+        channel_id = channel_match.group(1)
+
+    handle_match = _HANDLE_PATTERN.search(url)
+    if handle_match:
+        handle = handle_match.group(1)
+
+    return {"id": channel_id, "handle": handle}
+
+
+def _normalize_handle(handle: Optional[str]) -> Optional[str]:
+    """Strip leading ``@`` characters, then whitespace; None when nothing is left."""
+    if not handle:
+        return None
+    return handle.lstrip("@").strip() or None
+
+
+def _parse_bool(value: Optional[object]) -> Optional[bool]:
+    """Map bools and common yes/no strings to bool; None when unrecognised."""
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return None
+    text = str(value).strip().lower()
+    if text in {"1", "true", "yes", "y"}:
+        return True
+    if text in {"0", "false", "no", "n"}:
+        return False
+    return None
+
+
+def _normalize_entry(entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """Build the canonical channel record, or None when no name or URL can be derived.
+
+    The feed toggle is read from ``rss_enabled``, ``rss`` or
+    ``include_in_feed`` (first recognisable value wins) and defaults to True.
+    """
+    channel_id = entry.get("id") or entry.get("channel_id")
+    handle = _normalize_handle(entry.get("handle") or entry.get("username"))
+    url = entry.get("url")
+    name = entry.get("name")
+
+    # Do not chain the lookups with ``or``: a literal False (JSON ``false``)
+    # is falsy and would be dropped, silently re-enabling the channel.
+    rss_flag = None
+    for key in ("rss_enabled", "rss", "include_in_feed"):
+        rss_flag = _parse_bool(entry.get(key))
+        if rss_flag is not None:
+            break
+
+    if url:
+        extracted = _extract_from_url(url)
+        channel_id = channel_id or extracted.get("id")
+        handle = handle or extracted.get("handle")
+
+    if not url:
+        if channel_id:
+            url = f"https://www.youtube.com/channel/{channel_id}"
+        elif handle:
+            url = f"https://www.youtube.com/@{handle}"
+
+    if not name:
+        name = handle or channel_id
+
+    if not name or not url:
+        return None
+
+    normalized = {
+        "id": channel_id or "",
+        "handle": handle or "",
+        "name": name,
+        "url": url,
+        "rss_enabled": True if rss_flag is None else rss_flag,
+    }
+    return normalized
+
+
+def load_channel_entries(path: Path) -> List[Dict[str, str]]:
+    """Load, normalise and name-sort the channels from a JSON or YAML-style file.
+
+    Raises FileNotFoundError when *path* does not exist.
+    """
+    if not path.exists():
+        raise FileNotFoundError(path)
+
+    if path.suffix.lower() == ".json":
+        payload = json.loads(path.read_text(encoding="utf-8"))
+        if isinstance(payload, dict):
+            raw_entries = payload.get("channels", [])
+        else:
+            raw_entries = payload
+    else:
+        raw_entries = _parse_yaml_channels(path.read_text(encoding="utf-8"))
+
+    entries: List[Dict[str, str]] = []
+    for raw in raw_entries:
+        if not isinstance(raw, dict):
+            continue
+        raw_payload: Dict[str, Any] = {}
+        for key, value in raw.items():
+            if value is None:
+                continue
+            if isinstance(value, bool):
+                # Keep real booleans so the feed toggle survives normalisation.
+                raw_payload[str(key).strip()] = value
+            else:
+                raw_payload[str(key).strip()] = str(value).strip()
+        normalized = _normalize_entry(raw_payload)
+        if normalized:
+            entries.append(normalized)
+
+    entries.sort(key=lambda item: item["name"].lower())
+    return entries
+
+
+def build_rss_bridge_url(entry: Dict[str, str], rss_bridge_host: str = "rss-bridge") -> Optional[str]:
+    """Return the RSS-Bridge Mrss URL for a channel, by id if set, else by handle."""
+    channel_id = entry.get("id") or ""
+    handle = _normalize_handle(entry.get("handle"))
+
+    if channel_id:
+        return (
+            f"http://{rss_bridge_host}/?action=display&bridge=YoutubeBridge"
+            f"&context=By+channel+id&c={channel_id}&format=Mrss"
+        )
+    if handle:
+        return (
+            f"http://{rss_bridge_host}/?action=display&bridge=YoutubeBridge"
+            f"&context=By+username&u={handle}&format=Mrss"
+        )
+    return None
diff --git a/channels.yml b/channels.yml
new file mode 100644
index 0000000..adf40bc
--- /dev/null
+++ b/channels.yml
@@ -0,0 +1,258 @@
+# Shared YouTube Channel Configuration
+# Used by both TLC Search (transcript collection) and Feed Master (RSS aggregation)
+
+channels:
+  - id: UCCebR16tXbv5Ykk9_WtCCug
+    name: Channel UCCebR16tXbv
+    url: https://www.youtube.com/channel/UCCebR16tXbv5Ykk9_WtCCug/videos
+  - id: UC6vg0HkKKlgsWk-3HfV-vnw
+    name: A Quality Existence
+    url: https://www.youtube.com/channel/UC6vg0HkKKlgsWk-3HfV-vnw/videos
+  - id: UCeWWxwzgLYUbfjWowXhVdYw
+    name: Andrea with the Bangs
+    url: https://www.youtube.com/channel/UCeWWxwzgLYUbfjWowXhVdYw/videos
+  - id: UC952hDf_C4nYJdqwK7VzTxA
+    name: Charlie's Little Corner
+    url: https://www.youtube.com/channel/UC952hDf_C4nYJdqwK7VzTxA/videos
+  - id: UCU5SNBfTo4umhjYz6M0Jsmg
+    name: Christian Baxter
+    url: https://www.youtube.com/channel/UCU5SNBfTo4umhjYz6M0Jsmg/videos
+  - id: UC6Tvr9mBXNaAxLGRA_sUSRA
+    name: Finding Ideas
+    url: https://www.youtube.com/channel/UC6Tvr9mBXNaAxLGRA_sUSRA/videos
+  - id: UC4Rmxg7saTfwIpvq3QEzylQ
+    name: Ein Sof - Infinite Reflections
+    url: https://www.youtube.com/channel/UC4Rmxg7saTfwIpvq3QEzylQ/videos
+  - id: UCTdH4nh6JTcfKUAWvmnPoIQ
+    name: Eric Seitz
+    url: https://www.youtube.com/channel/UCTdH4nh6JTcfKUAWvmnPoIQ/videos
+  - id: UCsi_x8c12NW9FR7LL01QXKA
+    name: Grail Country
+    url: https://www.youtube.com/channel/UCsi_x8c12NW9FR7LL01QXKA/videos
+  - id: UCAqTQ5yLHHH44XWwWXLkvHQ
+    name: Grizwald Grim
+    url: https://www.youtube.com/channel/UCAqTQ5yLHHH44XWwWXLkvHQ/videos
+  - id:
UCprytROeCztMOMe8plyJRMg + name: faturechi + url: https://www.youtube.com/channel/UCprytROeCztMOMe8plyJRMg/videos + - id: UCpqDUjTsof-kTNpnyWper_Q + name: John Vervaeke + url: https://www.youtube.com/channel/UCpqDUjTsof-kTNpnyWper_Q/videos + - id: UCL_f53ZEJxp8TtlOkHwMV9Q + name: Jordan B Peterson + url: https://www.youtube.com/channel/UCL_f53ZEJxp8TtlOkHwMV9Q/videos + - id: UCez1fzMRGctojfis2lfRYug + name: Lucas Vos + url: https://www.youtube.com/channel/UCez1fzMRGctojfis2lfRYug/videos + - id: UC2leFZRD0ZlQDQxpR2Zd8oA + name: Mary Kochan + url: https://www.youtube.com/channel/UC2leFZRD0ZlQDQxpR2Zd8oA/videos + - id: UC8SErJkYnDsYGh1HxoZkl-g + name: Sartori Studios + url: https://www.youtube.com/channel/UC8SErJkYnDsYGh1HxoZkl-g/videos + - id: UCEPOn4cgvrrerg_-q_Ygw1A + name: More Christ + url: https://www.youtube.com/channel/UCEPOn4cgvrrerg_-q_Ygw1A/videos + - id: UC2yCyOMUeem-cYwliC-tLJg + name: Paul Anleitner + url: https://www.youtube.com/channel/UC2yCyOMUeem-cYwliC-tLJg/videos + - id: UCGsDIP_K6J6VSTqlq-9IPlg + name: Paul VanderKlay + url: https://www.youtube.com/channel/UCGsDIP_K6J6VSTqlq-9IPlg/videos + - id: UCEzWTLDYmL8soRdQec9Fsjw + name: Randos United + url: https://www.youtube.com/channel/UCEzWTLDYmL8soRdQec9Fsjw/videos + - id: UC1KgNsMdRoIA_njVmaDdHgA + name: Randos United 2 + url: https://www.youtube.com/channel/UC1KgNsMdRoIA_njVmaDdHgA/videos + - id: UCFQ6Gptuq-sLflbJ4YY3Umw + name: Rebel Wisdom + url: https://www.youtube.com/channel/UCFQ6Gptuq-sLflbJ4YY3Umw/videos + - id: UCEY1vGNBPsC3dCatZyK3Jkw + name: Strange Theology + url: https://www.youtube.com/channel/UCEY1vGNBPsC3dCatZyK3Jkw/videos + - id: UCIAtCuzdvgNJvSYILnHtdWA + name: The Anadromist + url: https://www.youtube.com/channel/UCIAtCuzdvgNJvSYILnHtdWA/videos + - id: UClIDP7_Kzv_7tDQjTv9EhrA + name: The Chris Show + url: https://www.youtube.com/channel/UClIDP7_Kzv_7tDQjTv9EhrA/videos + - id: UC-QiBn6GsM3JZJAeAQpaGAA + name: TheCommonToad + url: 
https://www.youtube.com/channel/UC-QiBn6GsM3JZJAeAQpaGAA/videos + - id: UCiJmdXTb76i8eIPXdJyf8ZQ + name: Channel UCiJmdXTb76i + url: https://www.youtube.com/channel/UCiJmdXTb76i8eIPXdJyf8ZQ/videos + - id: UCM9Z05vuQhMEwsV03u6DrLA + name: Cassidy van der Kamp + url: https://www.youtube.com/channel/UCM9Z05vuQhMEwsV03u6DrLA/videos + - id: UCgp_r6WlBwDSJrP43Mz07GQ + name: The Meaning Code + url: https://www.youtube.com/channel/UCgp_r6WlBwDSJrP43Mz07GQ/videos + - id: UC5uv-BxzCrN93B_5qbOdRWw + name: TheScrollersPodcast + url: https://www.youtube.com/channel/UC5uv-BxzCrN93B_5qbOdRWw/videos + - id: UCtCTSf3UwRU14nYWr_xm-dQ + name: Jonathan Pageau + url: https://www.youtube.com/channel/UCtCTSf3UwRU14nYWr_xm-dQ/videos + - id: UC1a4VtU_SMSfdRiwMJR33YQ + name: The Young Levite + url: https://www.youtube.com/channel/UC1a4VtU_SMSfdRiwMJR33YQ/videos + - id: UCg7Ed0lecvko58ibuX1XHng + name: Transfigured + url: https://www.youtube.com/channel/UCg7Ed0lecvko58ibuX1XHng/videos + - id: UCMVG5eqpYFVEB-a9IqAOuHA + name: President Foxman + url: https://www.youtube.com/channel/UCMVG5eqpYFVEB-a9IqAOuHA/videos + - id: UC8mJqpS_EBbMcyuzZDF0TEw + name: Neal Daedalus + url: https://www.youtube.com/channel/UC8mJqpS_EBbMcyuzZDF0TEw/videos + - id: UCGHuURJ1XFHzPSeokf6510A + name: Aphrael Pilotson + url: https://www.youtube.com/channel/UCGHuURJ1XFHzPSeokf6510A/videos + - id: UC704NVL2DyzYg3rMU9r1f7A + handle: chrishoward8473 + name: Chris Howard + url: https://www.youtube.com/@chrishoward8473/videos + - id: UChptV-kf8lnncGh7DA2m8Pw + name: Shoulder Serf + url: https://www.youtube.com/channel/UChptV-kf8lnncGh7DA2m8Pw/videos + - id: UCzX6R3ZLQh5Zma_5AsPcqPA + name: Restoring Meaning + url: https://www.youtube.com/channel/UCzX6R3ZLQh5Zma_5AsPcqPA/videos + - id: UCiukuaNd_qzRDTW9qe2OC1w + name: Kale Zelden + url: https://www.youtube.com/channel/UCiukuaNd_qzRDTW9qe2OC1w/videos + - id: UC5yLuFQCms4nb9K2bGQLqIw + name: Ron Copperman + url: https://www.youtube.com/channel/UC5yLuFQCms4nb9K2bGQLqIw/videos + 
- id: UCVdSgEf9bLXFMBGSMhn7x4Q + name: Mark D Parker + url: https://www.youtube.com/channel/UCVdSgEf9bLXFMBGSMhn7x4Q/videos + - id: UC_dnk5D4tFCRYCrKIcQlcfw + name: Luke Thompson + url: https://www.youtube.com/channel/UC_dnk5D4tFCRYCrKIcQlcfw/videos + - id: UCT8Lq3ufaGEnCSS8WpFatqw + handle: Freerilian + name: Free Rilian + url: https://www.youtube.com/@Freerilian/videos + - id: UC977g6oGYIJDQnsZOGjQBBA + handle: marks.-ry7bm + name: Mark S + url: https://www.youtube.com/@marks.-ry7bm/videos + - id: UCbD1Pm0TOcRK2zaCrwgcTTg + handle: Adams-Fall + name: Adams Fall + url: https://www.youtube.com/@Adams-Fall/videos + - id: UCnojyPW0IgLWTQ0SaDQ1KBA + handle: mcmosav + name: mcmosav + url: https://www.youtube.com/@mcmosav/videos + - id: UCiOZYvBGHw1Y6wyzffwEp9g + handle: Landbeorht + name: Joseph Lambrecht + url: https://www.youtube.com/@Landbeorht/videos + - id: UCAXyF_HFeMgwS8nkGVeroAA + handle: Corner_Citizen + name: Corner Citizen + url: https://www.youtube.com/@Corner_Citizen/videos + - id: UCv2Qft5mZrmA9XAwnl9PU-g + handle: ethan.caughey + name: Ethan Caughey + url: https://www.youtube.com/@ethan.caughey/videos + - id: UCMJCtS8jKouJ2d8UIYzW3vg + handle: MarcInTbilisi + name: Marc Jackson + url: https://www.youtube.com/@MarcInTbilisi/videos + - id: UCk9O91WwruXmgu1NQrKZZEw + handle: climbingmt.sophia + name: Climbing Mt Sophia + url: https://www.youtube.com/@climbingmt.sophia/videos + - id: UCUSyTPWW4JaG1YfUPddw47Q + handle: Skankenstein + name: Skankenstein + url: https://www.youtube.com/@Skankenstein/videos + - id: UCzw2FNI3IRphcAoVcUENOgQ + handle: UpCycleClub + name: UpCycleClub + url: https://www.youtube.com/@UpCycleClub/videos + - id: UCQ7rVoApmYIpcmU7fB9RPyw + handle: JessPurviance + name: Jesspurviance + url: https://www.youtube.com/@JessPurviance/videos + - id: UCrZyTWGMdRM9_P26RKPvh3A + handle: greyhamilton52 + name: Grey Hamilton + url: https://www.youtube.com/@greyhamilton52/videos + - id: UCDCfI162vhPvwdxW6X4nmiw + handle: paulrenenichols + name: Paul 
Rene Nichols + url: https://www.youtube.com/@paulrenenichols/videos + - id: UCFLovlJ8RFApfjrf2y157xg + handle: OfficialSecularKoranism + name: Secular Koranism + url: https://www.youtube.com/@OfficialSecularKoranism/videos + - id: UC_-YQbnPfBbIezMr1adZZiQ + handle: FromWhomAllBlessingsFlow + name: From Whom All Blessings Flow + url: https://www.youtube.com/@FromWhomAllBlessingsFlow/videos + - id: UCn5mf-fcpBmkepIpZ8eFRng + handle: FoodTruckEmily + name: Emily Rajeh + url: https://www.youtube.com/@FoodTruckEmily/videos + - id: UC6zHDj4D323xJkblnPTvY3Q + handle: O.G.Rose.Michelle.and.Daniel + name: OG Rose + url: https://www.youtube.com/@O.G.Rose.Michelle.and.Daniel/videos + - id: UC4GiA5Hnwy415uVRymxPK-w + handle: JonathanDumeer + name: Jonathan Dumeer + url: https://www.youtube.com/@JonathanDumeer/videos + - id: UCMzT-mdCqoyEv_-YZVtE7MQ + handle: JordanGreenhall + name: Jordan Hall + url: https://www.youtube.com/@JordanGreenhall/videos + - id: UC5goUoFM4LPim4eY4pwRXYw + handle: NechamaGluck + name: Nechama Gluck + url: https://www.youtube.com/@NechamaGluck/videos + - id: UCPUVeoQYyq8cndWwyczX6RA + handle: justinsmorningcoffee + name: Justinsmorningcoffee + url: https://www.youtube.com/@justinsmorningcoffee/videos + - id: UCB0C8DEIQlQzvSGuGriBxtA + handle: grahampardun + name: Grahampardun + url: https://www.youtube.com/@grahampardun/videos + - id: UCpLJJLVB_7v4Igq-9arja1A + handle: michaelmartin8681 + name: Michaelmartin8681 + url: https://www.youtube.com/@michaelmartin8681/videos + - id: UCxV18lwwh29DiWuooz7UCvg + handle: davidbusuttil9086 + name: Davidbusuttil9086 + url: https://www.youtube.com/@davidbusuttil9086/videos + - id: UCosBhpwwGh_ueYq4ZSi5dGw + handle: matthewparlato5626 + name: Matthewparlato5626 + url: https://www.youtube.com/@matthewparlato5626/videos + - id: UCwF5LWNOFou_50bT65bq4Bg + handle: lancecleaver227 + name: Lancecleaver227 + url: https://www.youtube.com/@lancecleaver227/videos + - id: UCaJ0CqiiMSTq4X0rycUOIjw + handle: theplebistocrat + 
name: the plebistocrat + url: https://www.youtube.com/@theplebistocrat/videos + - id: UCZA5mUAyYcCL1kYgxbeMNrA + handle: RightInChrist + name: Rightinchrist + url: https://www.youtube.com/@RightInChrist/videos + - id: UCDIPXp88qjAV3TiaR5Uo3iQ + handle: RafeKelley + name: Rafekelley + url: https://www.youtube.com/@RafeKelley/videos + - id: UCedgru6YCto3zyXjlbuQuqA + handle: WavesOfObsession + name: Wavesofobsession + url: https://www.youtube.com/@WavesOfObsession/videos diff --git a/config.py b/config.py index 4d7a61f..f109889 100644 --- a/config.py +++ b/config.py @@ -6,6 +6,9 @@ Environment Variables: ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials. ELASTIC_INDEX: Target index name (default: this_little_corner_py). LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata). + CHANNELS_PATH: Path to the canonical channel list (default: ./channels.yml). + RSS_FEED_URL: Public URL/path for the unified RSS feed (default: /rss/youtube-unified). + RSS_FEED_UPSTREAM: Base URL to proxy feed requests (default: http://localhost:8097). YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube. RATE_LIMIT_ENABLED: Toggle API rate limiting (default: 1). RATE_LIMIT_REQUESTS: Max requests per window per client (default: 60). 
@@ -74,6 +77,9 @@ class AppConfig: qdrant_vector_name: Optional[str] qdrant_vector_size: int qdrant_embed_model: str + channels_path: Path + rss_feed_url: str + rss_feed_upstream: str def _env(name: str, default: Optional[str] = None) -> Optional[str]: @@ -110,6 +116,11 @@ def load_config() -> AppConfig: requests=max(int(_env("RATE_LIMIT_REQUESTS", "60")), 0), window_seconds=max(int(_env("RATE_LIMIT_WINDOW_SECONDS", "60")), 1), ) + channels_path = Path( + _env("CHANNELS_PATH", str(Path(__file__).parent / "channels.yml")) + ).expanduser() + rss_feed_url = _env("RSS_FEED_URL", "/rss/youtube-unified") + rss_feed_upstream = _env("RSS_FEED_UPSTREAM", "http://localhost:8097") return AppConfig( elastic=elastic, data=data, @@ -120,6 +131,9 @@ def load_config() -> AppConfig: qdrant_vector_name=_env("QDRANT_VECTOR_NAME"), qdrant_vector_size=int(_env("QDRANT_VECTOR_SIZE", "1024")), qdrant_embed_model=_env("QDRANT_EMBED_MODEL", "BAAI/bge-large-en-v1.5"), + channels_path=channels_path, + rss_feed_url=rss_feed_url or "", + rss_feed_upstream=rss_feed_upstream or "", ) diff --git a/docker-compose.yml b/docker-compose.yml index ff41606..272213f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,47 @@ version: "3.9" -# Runs only the Flask app container, pointing to remote Elasticsearch/Qdrant. +# TLC Search + Feed Master - Complete YouTube content indexing & RSS aggregation # Provide ELASTIC_URL / QDRANT_URL (and related) via environment or a .env file. 
services: + # RSS Bridge - Converts YouTube channels to RSS feeds + rss-bridge: + image: rssbridge/rss-bridge:latest + container_name: tlc-rss-bridge + hostname: rss-bridge + restart: unless-stopped + logging: + driver: json-file + options: + max-size: "10m" + max-file: "5" + ports: + - "3001:80" + + # Feed Master - Aggregates multiple RSS feeds into unified feed + feed-master: + image: umputun/feed-master:latest + container_name: tlc-feed-master + hostname: feed-master + restart: unless-stopped + depends_on: + - rss-bridge + logging: + driver: json-file + options: + max-size: "10m" + max-file: "5" + environment: + - DEBUG=false + - FM_DB=/srv/var/feed-master.bdb + - FM_CONF=/srv/etc/fm.yml + volumes: + - ./feed-master-config:/srv/etc + - ./feed-master-config/var:/srv/var + - ./feed-master-config/images:/srv/images + ports: + - "8097:8080" + + # TLC Search - Flask app for searching YouTube transcripts app: build: context: . @@ -16,6 +55,9 @@ services: ELASTIC_PASSWORD: ${ELASTIC_PASSWORD:-} ELASTIC_API_KEY: ${ELASTIC_API_KEY:-} ELASTIC_VERIFY_CERTS: ${ELASTIC_VERIFY_CERTS:-0} + CHANNELS_PATH: ${CHANNELS_PATH:-/app/python_app/channels.yml} + RSS_FEED_URL: ${RSS_FEED_URL:-/rss/youtube-unified} + RSS_FEED_UPSTREAM: ${RSS_FEED_UPSTREAM:-http://feed-master:8080} QDRANT_URL: ${QDRANT_URL:?set QDRANT_URL to your remote Qdrant URL} QDRANT_COLLECTION: ${QDRANT_COLLECTION:-tlc-captions-full} QDRANT_VECTOR_NAME: ${QDRANT_VECTOR_NAME:-} @@ -23,4 +65,5 @@ services: QDRANT_EMBED_MODEL: ${QDRANT_EMBED_MODEL:-BAAI/bge-large-en-v1.5} LOCAL_DATA_DIR: ${LOCAL_DATA_DIR:-/app/data/video_metadata} volumes: + - ./channels.yml:/app/python_app/channels.yml:ro - ./data:/app/data:ro diff --git a/feed-master-config/fm.yml b/feed-master-config/fm.yml new file mode 100644 index 0000000..06e2c95 --- /dev/null +++ b/feed-master-config/fm.yml @@ -0,0 +1,166 @@ +# Feed Master Configuration +# Auto-generated from channels.yml +# Do not edit manually - regenerate using generate_feed_config_simple.py 
+ +feeds: + youtube-unified: + title: YouTube Unified Feed + description: Aggregated feed from all YouTube channels + link: https://youtube.com + language: "en-us" + sources: + - name: A Quality Existence + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6vg0HkKKlgsWk-3HfV-vnw&format=Mrss + - name: Adams Fall + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCbD1Pm0TOcRK2zaCrwgcTTg&format=Mrss + - name: Andrea with the Bangs + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCeWWxwzgLYUbfjWowXhVdYw&format=Mrss + - name: Aphrael Pilotson + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCGHuURJ1XFHzPSeokf6510A&format=Mrss + - name: Cassidy van der Kamp + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCM9Z05vuQhMEwsV03u6DrLA&format=Mrss + - name: Channel UCCebR16tXbv + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCCebR16tXbv5Ykk9_WtCCug&format=Mrss + - name: Channel UCiJmdXTb76i + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiJmdXTb76i8eIPXdJyf8ZQ&format=Mrss + - name: Charlie's Little Corner + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC952hDf_C4nYJdqwK7VzTxA&format=Mrss + - name: Chris Howard + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC704NVL2DyzYg3rMU9r1f7A&format=Mrss + - name: Christian Baxter + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCU5SNBfTo4umhjYz6M0Jsmg&format=Mrss + - name: Climbing Mt Sophia + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCk9O91WwruXmgu1NQrKZZEw&format=Mrss + - name: Corner Citizen + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCAXyF_HFeMgwS8nkGVeroAA&format=Mrss + - 
name: Davidbusuttil9086 + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCxV18lwwh29DiWuooz7UCvg&format=Mrss + - name: Ein Sof - Infinite Reflections + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC4Rmxg7saTfwIpvq3QEzylQ&format=Mrss + - name: Emily Rajeh + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCn5mf-fcpBmkepIpZ8eFRng&format=Mrss + - name: Eric Seitz + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCTdH4nh6JTcfKUAWvmnPoIQ&format=Mrss + - name: Ethan Caughey + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCv2Qft5mZrmA9XAwnl9PU-g&format=Mrss + - name: faturechi + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCprytROeCztMOMe8plyJRMg&format=Mrss + - name: Finding Ideas + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6Tvr9mBXNaAxLGRA_sUSRA&format=Mrss + - name: Free Rilian + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCT8Lq3ufaGEnCSS8WpFatqw&format=Mrss + - name: From Whom All Blessings Flow + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC_-YQbnPfBbIezMr1adZZiQ&format=Mrss + - name: Grahampardun + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCB0C8DEIQlQzvSGuGriBxtA&format=Mrss + - name: Grail Country + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCsi_x8c12NW9FR7LL01QXKA&format=Mrss + - name: Grey Hamilton + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCrZyTWGMdRM9_P26RKPvh3A&format=Mrss + - name: Grizwald Grim + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCAqTQ5yLHHH44XWwWXLkvHQ&format=Mrss + - name: Jesspurviance + url: 
http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCQ7rVoApmYIpcmU7fB9RPyw&format=Mrss + - name: John Vervaeke + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCpqDUjTsof-kTNpnyWper_Q&format=Mrss + - name: Jonathan Dumeer + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC4GiA5Hnwy415uVRymxPK-w&format=Mrss + - name: Jonathan Pageau + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCtCTSf3UwRU14nYWr_xm-dQ&format=Mrss + - name: Jordan B Peterson + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCL_f53ZEJxp8TtlOkHwMV9Q&format=Mrss + - name: Jordan Hall + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMzT-mdCqoyEv_-YZVtE7MQ&format=Mrss + - name: Joseph Lambrecht + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiOZYvBGHw1Y6wyzffwEp9g&format=Mrss + - name: Justinsmorningcoffee + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCPUVeoQYyq8cndWwyczX6RA&format=Mrss + - name: Kale Zelden + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiukuaNd_qzRDTW9qe2OC1w&format=Mrss + - name: Lancecleaver227 + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCwF5LWNOFou_50bT65bq4Bg&format=Mrss + - name: Lucas Vos + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCez1fzMRGctojfis2lfRYug&format=Mrss + - name: Luke Thompson + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC_dnk5D4tFCRYCrKIcQlcfw&format=Mrss + - name: Marc Jackson + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMJCtS8jKouJ2d8UIYzW3vg&format=Mrss + - name: Mark D Parker + url: 
http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCVdSgEf9bLXFMBGSMhn7x4Q&format=Mrss + - name: Mark S + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC977g6oGYIJDQnsZOGjQBBA&format=Mrss + - name: Mary Kochan + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC2leFZRD0ZlQDQxpR2Zd8oA&format=Mrss + - name: Matthewparlato5626 + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCosBhpwwGh_ueYq4ZSi5dGw&format=Mrss + - name: mcmosav + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCnojyPW0IgLWTQ0SaDQ1KBA&format=Mrss + - name: Michaelmartin8681 + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCpLJJLVB_7v4Igq-9arja1A&format=Mrss + - name: More Christ + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEPOn4cgvrrerg_-q_Ygw1A&format=Mrss + - name: Neal Daedalus + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC8mJqpS_EBbMcyuzZDF0TEw&format=Mrss + - name: Nechama Gluck + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5goUoFM4LPim4eY4pwRXYw&format=Mrss + - name: OG Rose + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6zHDj4D323xJkblnPTvY3Q&format=Mrss + - name: Paul Anleitner + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC2yCyOMUeem-cYwliC-tLJg&format=Mrss + - name: Paul Rene Nichols + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCDCfI162vhPvwdxW6X4nmiw&format=Mrss + - name: Paul VanderKlay + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCGsDIP_K6J6VSTqlq-9IPlg&format=Mrss + - name: President Foxman + url: 
http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMVG5eqpYFVEB-a9IqAOuHA&format=Mrss + - name: Rafekelley + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCDIPXp88qjAV3TiaR5Uo3iQ&format=Mrss + - name: Randos United + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEzWTLDYmL8soRdQec9Fsjw&format=Mrss + - name: Randos United 2 + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC1KgNsMdRoIA_njVmaDdHgA&format=Mrss + - name: Rebel Wisdom + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCFQ6Gptuq-sLflbJ4YY3Umw&format=Mrss + - name: Restoring Meaning + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCzX6R3ZLQh5Zma_5AsPcqPA&format=Mrss + - name: Rightinchrist + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCZA5mUAyYcCL1kYgxbeMNrA&format=Mrss + - name: Ron Copperman + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5yLuFQCms4nb9K2bGQLqIw&format=Mrss + - name: Sartori Studios + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC8SErJkYnDsYGh1HxoZkl-g&format=Mrss + - name: Secular Koranism + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCFLovlJ8RFApfjrf2y157xg&format=Mrss + - name: Shoulder Serf + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UChptV-kf8lnncGh7DA2m8Pw&format=Mrss + - name: Skankenstein + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCUSyTPWW4JaG1YfUPddw47Q&format=Mrss + - name: Strange Theology + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEY1vGNBPsC3dCatZyK3Jkw&format=Mrss + - name: The Anadromist + url: 
http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCIAtCuzdvgNJvSYILnHtdWA&format=Mrss + - name: The Chris Show + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UClIDP7_Kzv_7tDQjTv9EhrA&format=Mrss + - name: The Meaning Code + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCgp_r6WlBwDSJrP43Mz07GQ&format=Mrss + - name: the plebistocrat + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCaJ0CqiiMSTq4X0rycUOIjw&format=Mrss + - name: The Young Levite + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC1a4VtU_SMSfdRiwMJR33YQ&format=Mrss + - name: TheCommonToad + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC-QiBn6GsM3JZJAeAQpaGAA&format=Mrss + - name: TheScrollersPodcast + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5uv-BxzCrN93B_5qbOdRWw&format=Mrss + - name: Transfigured + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCg7Ed0lecvko58ibuX1XHng&format=Mrss + - name: UpCycleClub + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCzw2FNI3IRphcAoVcUENOgQ&format=Mrss + - name: Wavesofobsession + url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCedgru6YCto3zyXjlbuQuqA&format=Mrss + +system: + update: 5m + max_per_feed: 5 + max_total: 200 + max_keep: 1000 + base_url: http://localhost:8097 \ No newline at end of file diff --git a/generate_feed_config.py b/generate_feed_config.py new file mode 100644 index 0000000..89d5017 --- /dev/null +++ b/generate_feed_config.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +Generate feed-master configuration from channels.yml. +This ensures a single source of truth for the YouTube channels. 
+""" + +import sys +from pathlib import Path + +from .channel_config import build_rss_bridge_url, load_channel_entries + + +def generate_fm_config(channels_file, output_file, rss_bridge_host="rss-bridge"): + """Generate feed-master YAML configuration from channels.yml""" + + print(f"Reading channels from {channels_file}") + channels = load_channel_entries(Path(channels_file)) + print(f"Found {len(channels)} channels") + + # Generate feed configuration + config = [] + config.append("# Feed Master Configuration") + config.append("# Auto-generated from channels.yml") + config.append("# Do not edit manually - regenerate using generate_feed_config.py") + config.append("") + config.append("feeds:") + config.append(" youtube-unified:") + config.append(" title: YouTube Unified Feed") + config.append(" description: Aggregated feed from all YouTube channels") + config.append(" link: https://youtube.com") + config.append(' language: "en-us"') + config.append(" sources:") + + processed = 0 + skipped = 0 + + for channel in channels: + if not channel.get("rss_enabled", True): + skipped += 1 + continue + bridge_url = build_rss_bridge_url(channel, rss_bridge_host=rss_bridge_host) + if not bridge_url: + skipped += 1 + continue + name = channel.get("name", "Unknown") + config.append(f" - name: {name}") + config.append(f" url: {bridge_url}") + processed += 1 + + # Add system configuration + config.append("") + config.append("system:") + config.append(" update: 5m") + config.append(" max_per_feed: 5") + config.append(" max_total: 200") + config.append(" max_keep: 1000") + config.append(" base_url: http://localhost:8097") + + # Write output + print(f"\nProcessed {processed} channels, skipped {skipped}") + + with open(output_file, 'w') as f: + f.write('\n'.join(config)) + + print(f"Configuration written to {output_file}") + print(f"\nTo apply this configuration:") + print(f" 1. Copy {output_file} to feed-master/etc/fm.yml") + print(f" 2. 
Restart the feed-master service") + + +if __name__ == "__main__": + # Default paths + script_dir = Path(__file__).parent + channels_file = script_dir / "channels.yml" + output_file = script_dir / "feed-master-config" / "fm.yml" + + # Allow overriding via command line + if len(sys.argv) > 1: + channels_file = Path(sys.argv[1]) + if len(sys.argv) > 2: + output_file = Path(sys.argv[2]) + + if not channels_file.exists(): + print(f"Error: {channels_file} not found", file=sys.stderr) + print(f"\nUsage: {sys.argv[0]} [channels.yml] [output.yml]", file=sys.stderr) + sys.exit(1) + + # Ensure output directory exists + output_file.parent.mkdir(parents=True, exist_ok=True) + + generate_fm_config(channels_file, output_file) diff --git a/generate_feed_config_simple.py b/generate_feed_config_simple.py new file mode 100755 index 0000000..0714874 --- /dev/null +++ b/generate_feed_config_simple.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +""" +Generate feed-master configuration from channels.yml. +Simplified version that doesn't require RSS-Bridge to be running. 
+""" + +import sys +from pathlib import Path + +from .channel_config import build_rss_bridge_url, load_channel_entries + + +def generate_fm_config(channels_file, output_file, rss_bridge_host="rss-bridge"): + """Generate feed-master YAML configuration from channels.yml""" + + print(f"Reading channels from {channels_file}") + channels = load_channel_entries(Path(channels_file)) + print(f"Found {len(channels)} channels") + + # Generate feed configuration + config = [] + config.append("# Feed Master Configuration") + config.append("# Auto-generated from channels.yml") + config.append("# Do not edit manually - regenerate using generate_feed_config_simple.py") + config.append("") + config.append("feeds:") + config.append(" youtube-unified:") + config.append(" title: YouTube Unified Feed") + config.append(" description: Aggregated feed from all YouTube channels") + config.append(" link: https://youtube.com") + config.append(' language: "en-us"') + config.append(" sources:") + + processed = 0 + skipped = 0 + + for channel in channels: + if not channel.get("rss_enabled", True): + skipped += 1 + continue + bridge_url = build_rss_bridge_url(channel, rss_bridge_host=rss_bridge_host) + if not bridge_url: + skipped += 1 + continue + name = channel.get("name", "Unknown") + config.append(f" - name: {name}") + config.append(f" url: {bridge_url}") + processed += 1 + + # Add system configuration + config.append("") + config.append("system:") + config.append(" update: 5m") + config.append(" max_per_feed: 5") + config.append(" max_total: 200") + config.append(" max_keep: 1000") + config.append(" base_url: http://localhost:8097") + + # Write output + print(f"\nProcessed {processed} channels, skipped {skipped}") + + with open(output_file, 'w') as f: + f.write('\n'.join(config)) + + print(f"Configuration written to {output_file}") + + +if __name__ == "__main__": + # Default paths + script_dir = Path(__file__).parent + channels_file = script_dir / "channels.yml" + output_file = script_dir 
/ "feed-master-config" / "fm.yml" + + # Allow overriding via command line + if len(sys.argv) > 1: + channels_file = Path(sys.argv[1]) + if len(sys.argv) > 2: + output_file = Path(sys.argv[2]) + + if not channels_file.exists(): + print(f"Error: {channels_file} not found", file=sys.stderr) + print(f"\nUsage: {sys.argv[0]} [channels.yml] [output.yml]", file=sys.stderr) + sys.exit(1) + + # Ensure output directory exists + output_file.parent.mkdir(parents=True, exist_ok=True) + + generate_fm_config(channels_file, output_file) diff --git a/search_app.py b/search_app.py index d4bc1d2..2a72579 100644 --- a/search_app.py +++ b/search_app.py @@ -5,6 +5,8 @@ Routes: GET / -> static HTML search page. GET /graph -> static reference graph UI. GET /api/channels -> channels aggregation. + GET /api/channel-list -> canonical channel list + feed URL. + GET /channels.txt -> raw channel URLs list. GET /api/search -> Elasticsearch keyword search. GET /api/graph -> reference graph API. GET /api/transcript -> transcript JSON payload. 
@@ -17,6 +19,8 @@ import json import logging import os import re +import urllib.parse +import xml.etree.ElementTree as ET from pathlib import Path from typing import Any, Deque, Dict, Iterable, List, Optional, Sequence, Set, Tuple @@ -25,9 +29,11 @@ from datetime import datetime from threading import Lock from time import monotonic +import requests from flask import Flask, jsonify, request, send_from_directory from .config import CONFIG, AppConfig +from .channel_config import load_channel_entries try: from elasticsearch import Elasticsearch # type: ignore @@ -45,6 +51,10 @@ DEFAULT_ELASTIC_TIMEOUT = int(os.environ.get("ELASTIC_TIMEOUT_SECONDS", "30")) _RATE_LIMIT_BUCKETS: Dict[str, Deque[float]] = defaultdict(deque) _RATE_LIMIT_LOCK = Lock() +_RSS_AUTHOR_CACHE: Dict[str, Tuple[str, float]] = {} +_RSS_AUTHOR_LOCK = Lock() +_RSS_AUTHOR_TTL_SECONDS = 60 * 60 * 24 +_RSS_OEMBED_LIMIT = 12 def _client_rate_key() -> str: @@ -101,6 +111,192 @@ def _ensure_client(config: AppConfig) -> "Elasticsearch": return Elasticsearch(config.elastic.url, **kwargs) +def _extract_video_id(url: str) -> Optional[str]: + if not url: + return None + try: + parsed = urllib.parse.urlparse(url.strip()) + except Exception: + return None + host = (parsed.netloc or "").lower() + path = parsed.path or "" + if host in {"youtu.be", "www.youtu.be"}: + return path.lstrip("/") or None + if host.endswith("youtube.com"): + if path == "/watch": + params = urllib.parse.parse_qs(parsed.query) + return (params.get("v") or [None])[0] + if path.startswith("/shorts/"): + return path.split("/", 2)[2] if len(path.split("/", 2)) > 2 else None + return None + + +def _lookup_channel_names( + client: "Elasticsearch", + index: str, + video_ids: Iterable[str], +) -> Dict[str, str]: + ids = [vid for vid in video_ids if vid] + if not ids: + return {} + + now = monotonic() + mapping: Dict[str, str] = {} + cached_hits = 0 + elastic_hits = 0 + remaining = [] + with _RSS_AUTHOR_LOCK: + for vid in ids: + cached = 
_RSS_AUTHOR_CACHE.get(vid) + if cached and (now - cached[1]) < _RSS_AUTHOR_TTL_SECONDS: + mapping[vid] = cached[0] + cached_hits += 1 + else: + remaining.append(vid) + + if remaining: + try: + response = client.mget(index=index, body={"ids": remaining}) + except Exception as exc: # pragma: no cover - elasticsearch handles errors + LOGGER.warning("RSS title lookup failed: %s", exc) + response = {} + for doc in response.get("docs", []): + if not doc.get("found"): + continue + source = doc.get("_source") or {} + name = source.get("channel_name") or source.get("channel_id") + if name: + vid = doc.get("_id", "") + mapping[vid] = str(name) + elastic_hits += 1 + with _RSS_AUTHOR_LOCK: + _RSS_AUTHOR_CACHE[vid] = (mapping[vid], now) + + missing = [vid for vid in remaining if vid not in mapping] + oembed_hits = 0 + oembed_attempts = 0 + if missing: + for vid in missing[:_RSS_OEMBED_LIMIT]: + oembed_attempts += 1 + video_url = f"https://www.youtube.com/watch?v={vid}" + oembed_url = ( + "https://www.youtube.com/oembed?format=json&url=" + + urllib.parse.quote(video_url, safe="") + ) + try: + response = requests.get(oembed_url, timeout=10) + if response.status_code != 200: + continue + data = response.json() + except Exception: + continue + author = data.get("author_name") + if not author: + continue + mapping[vid] = str(author) + oembed_hits += 1 + with _RSS_AUTHOR_LOCK: + _RSS_AUTHOR_CACHE[vid] = (mapping[vid], now) + + missing_count = max(len(ids) - cached_hits - elastic_hits - oembed_hits, 0) + if missing_count or oembed_attempts: + LOGGER.info( + "RSS title lookup: total=%d cached=%d elastic=%d oembed=%d missing=%d", + len(ids), + cached_hits, + elastic_hits, + oembed_hits, + missing_count, + ) + else: + LOGGER.debug( + "RSS title lookup: total=%d cached=%d elastic=%d", + len(ids), + cached_hits, + elastic_hits, + ) + + return mapping + + +def _rewrite_rss_payload( + content: bytes, + client: "Elasticsearch", + index: str, + feed_name: str, +) -> bytes: + try: + root = 
ET.fromstring(content) + except ET.ParseError: + LOGGER.warning("RSS rewrite skipped (invalid XML) for %s", feed_name) + return content + + channel = root.find("channel") + if channel is None: + LOGGER.warning("RSS rewrite skipped (missing channel) for %s", feed_name) + return content + + items = channel.findall("item") + total_items = len(items) + removed_errors = 0 + video_ids: Set[str] = set() + for item in list(items): + title_el = item.find("title") + title_text = (title_el.text or "").strip() if title_el is not None else "" + if "Bridge returned error" in title_text: + channel.remove(item) + removed_errors += 1 + continue + link_el = item.find("link") + guid_el = item.find("guid") + video_id = _extract_video_id((link_el.text or "") if link_el is not None else "") + if not video_id: + video_id = _extract_video_id((guid_el.text or "") if guid_el is not None else "") + if video_id: + video_ids.add(video_id) + + channel_name_map = _lookup_channel_names(client, index, video_ids) + if not channel_name_map: + LOGGER.info( + "RSS rewrite: feed=%s items=%d removed_errors=%d resolved=0", + feed_name, + total_items, + removed_errors, + ) + return ET.tostring(root, encoding="utf-8", xml_declaration=True) + + prefixed = 0 + for item in channel.findall("item"): + title_el = item.find("title") + if title_el is None or not title_el.text: + continue + link_el = item.find("link") + guid_el = item.find("guid") + video_id = _extract_video_id((link_el.text or "") if link_el is not None else "") + if not video_id: + video_id = _extract_video_id((guid_el.text or "") if guid_el is not None else "") + if not video_id: + continue + channel_name = channel_name_map.get(video_id) + if not channel_name: + continue + prefix = f"{channel_name} - " + if title_el.text.startswith(prefix): + continue + title_el.text = f"{channel_name} - {title_el.text}" + prefixed += 1 + + LOGGER.info( + "RSS rewrite: feed=%s items=%d removed_errors=%d prefixed=%d resolved=%d", + feed_name, + total_items, + 
removed_errors, + prefixed, + len(channel_name_map), + ) + return ET.tostring(root, encoding="utf-8", xml_declaration=True) + + def metrics_payload(data_root: Path, include_external: bool = True) -> Dict[str, Any]: total_items = 0 channel_counter: Counter = Counter() @@ -1077,6 +1273,72 @@ def create_app(config: AppConfig = CONFIG) -> Flask: data.sort(key=lambda item: item["Name"].lower()) return jsonify(data) + @app.route("/api/channel-list") + def channel_list(): + payload = { + "channels": [], + "rss_feed_url": config.rss_feed_url, + "source": str(config.channels_path), + } + try: + payload["channels"] = load_channel_entries(config.channels_path) + except FileNotFoundError: + LOGGER.warning("Channel list not found: %s", config.channels_path) + payload["error"] = "channels_not_found" + except Exception as exc: + LOGGER.exception("Failed to load channel list: %s", exc) + payload["error"] = "channels_load_failed" + return jsonify(payload) + + @app.route("/channels.txt") + def channel_urls(): + try: + channels = load_channel_entries(config.channels_path) + except FileNotFoundError: + LOGGER.warning("Channel list not found: %s", config.channels_path) + return jsonify({"error": "channels_not_found"}), 404 + except Exception as exc: + LOGGER.exception("Failed to load channel list: %s", exc) + return jsonify({"error": "channels_load_failed"}), 500 + + urls = [channel["url"] for channel in channels if channel.get("url")] + body = "\n".join(urls) + ("\n" if urls else "") + return (body, 200, {"Content-Type": "text/plain; charset=utf-8"}) + + def _rss_target(feed_name: str) -> str: + name = (feed_name or "").strip("/") + if not name: + name = "youtube-unified" + return f"{config.rss_feed_upstream.rstrip('/')}/rss/{name}" + + @app.route("/rss") + @app.route("/rss/") + def rss_proxy(feed_name: str = ""): + target = _rss_target(feed_name) + try: + upstream = requests.get(target, timeout=30) + except requests.RequestException as exc: + LOGGER.warning("RSS upstream error for 
%s: %s", target, exc) + return jsonify({"error": "rss_unavailable"}), 502 + + payload = _rewrite_rss_payload(upstream.content, client, index, feed_name or "rss") + headers = { + "Content-Type": upstream.headers.get( + "Content-Type", "application/xml; charset=UTF-8" + ) + } + cache_header = upstream.headers.get("Cache-Control") + if cache_header: + headers["Cache-Control"] = cache_header + etag = upstream.headers.get("ETag") + if etag: + headers["ETag"] = etag + last_modified = upstream.headers.get("Last-Modified") + if last_modified: + headers["Last-Modified"] = last_modified + + return (payload, upstream.status_code, headers) + @app.route("/api/graph") def graph_api(): video_id = (request.args.get("video_id") or "").strip() diff --git a/static/app.js b/static/app.js index 69e9205..4472945 100644 --- a/static/app.js +++ b/static/app.js @@ -45,6 +45,10 @@ const aboutBtn = document.getElementById("aboutBtn"); const aboutPanel = document.getElementById("aboutPanel"); const aboutCloseBtn = document.getElementById("aboutCloseBtn"); + const rssButton = document.getElementById("rssButton"); + const rssFeedLink = document.getElementById("rssFeedLink"); + const channelListLink = document.getElementById("channelListLink"); + const channelCount = document.getElementById("channelCount"); const resultsDiv = document.getElementById("results"); const metaDiv = document.getElementById("meta"); const metricsContainer = document.getElementById("metrics"); @@ -406,6 +410,57 @@ } } + async function loadChannelListInfo() { + if (!rssFeedLink && !channelListLink && !channelCount) return; + try { + const res = await fetch("/api/channel-list"); + const payload = await res.json(); + if (rssFeedLink) { + const feedUrl = payload.rss_feed_url || ""; + if (feedUrl) { + rssFeedLink.href = feedUrl; + rssFeedLink.textContent = feedUrl; + } else { + rssFeedLink.textContent = "Unavailable"; + rssFeedLink.removeAttribute("href"); + } + } + if (rssButton) { + const feedUrl = payload.rss_feed_url || 
""; + if (feedUrl) { + rssButton.href = feedUrl; + rssButton.classList.remove("is-disabled"); + rssButton.removeAttribute("aria-disabled"); + } else { + rssButton.removeAttribute("href"); + rssButton.classList.add("is-disabled"); + rssButton.setAttribute("aria-disabled", "true"); + } + } + if (channelCount) { + const count = Array.isArray(payload.channels) ? payload.channels.length : 0; + channelCount.textContent = count ? `${count} channels` : "No channels loaded"; + } + if (channelListLink && payload.error) { + channelListLink.textContent = "Channel list unavailable"; + } + } catch (err) { + console.error("Failed to load channel list", err); + if (rssFeedLink) { + rssFeedLink.textContent = "Unavailable"; + rssFeedLink.removeAttribute("href"); + } + if (rssButton) { + rssButton.removeAttribute("href"); + rssButton.classList.add("is-disabled"); + rssButton.setAttribute("aria-disabled", "true"); + } + if (channelCount) { + channelCount.textContent = "Channel list unavailable"; + } + } + } + function updateUrl(q) { const next = new URL(window.location.href); if (q) { @@ -1732,6 +1787,7 @@ window.addEventListener("popstate", () => { setFromQuery(); loadMetrics(); loadYears(); + loadChannelListInfo(); loadChannels().then(() => runSearch(currentPage)); })(); diff --git a/static/index.html b/static/index.html index 37b3be5..9e954f6 100644 --- a/static/index.html +++ b/static/index.html @@ -21,6 +21,22 @@
+

Enter a phrase to query title, description, and transcript text.

@@ -129,6 +145,15 @@

Use the toggles to choose exact, fuzzy, or phrase matching. Query string mode accepts raw Lucene syntax.

Results are ranked by your chosen sort order; the timeline summarizes the same query.

You can download transcripts, copy MLA citations, or explore references via the graph button.

+
+
Unified RSS feed
+ Loading… +
+
+
Channel list
+ View JSON +
+
diff --git a/static/style.css b/static/style.css index dd33f9f..48bdf62 100644 --- a/static/style.css +++ b/static/style.css @@ -510,6 +510,22 @@ body.modal-open { color: #000; } +.about-panel__section { + margin-top: 8px; + padding-top: 6px; + border-top: 1px solid #c0c0c0; +} + +.about-panel__label { + font-weight: bold; + margin-bottom: 2px; +} + +.about-panel__meta { + font-size: 10px; + color: #555; +} + .about-panel__header button { border: none; background: transparent; @@ -549,6 +565,50 @@ body.modal-open { box-sizing: border-box; } +.window-actions { + display: flex; + justify-content: flex-end; + margin-bottom: 6px; +} + +.rss-button { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 2px 6px; + border: 1px solid; + border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight; + background: ButtonFace; + color: #000; + text-decoration: none; + font-size: 11px; + cursor: pointer; +} + +.rss-button:hover { + background: #f3f3f3; +} + +.rss-button:active { + border-color: ButtonShadow ButtonHighlight ButtonHighlight ButtonShadow; +} + +.rss-button.is-disabled { + opacity: 0.5; + cursor: default; + pointer-events: none; +} + +.rss-button__icon { + width: 14px; + height: 14px; + fill: #f38b00; +} + +.rss-button__label { + font-weight: bold; +} + /* Badges */ .badge-row { margin-top: 6px;