Add unified channel feed

This commit is contained in:
knight 2026-01-08 22:53:30 -05:00
parent 63fe922860
commit 30503628b5
14 changed files with 1319 additions and 1 deletions

View File

@ -9,3 +9,5 @@ node_modules
data data
videos videos
*.log *.log
feed-master-config/var
feed-master-config/images

4
.gitignore vendored
View File

@ -51,6 +51,10 @@ Thumbs.db
# Logs # Logs
*.log *.log
# Feed Master runtime cache
feed-master-config/var/
feed-master-config/images/
# Testing # Testing
.pytest_cache/ .pytest_cache/
.coverage .coverage

87
Makefile Normal file
View File

@ -0,0 +1,87 @@
# Makefile for TLC Search + Feed Master
# Every target in this file is a command, not a file to build; declare all of
# them phony so a stray file named e.g. "logs-feed" cannot shadow the target.
.PHONY: help config up down restart logs logs-feed logs-bridge logs-app status restart-feed update-channels
help:
@echo "TLC Search + Feed Master - Management Commands"
@echo ""
@echo "Configuration:"
@echo " make config - Regenerate feed-master configuration from channels.yml"
@echo ""
@echo "Service Management:"
@echo " make up - Start all services"
@echo " make down - Stop all services"
@echo " make restart - Restart all services"
@echo " make logs - View all service logs"
@echo " make status - Check service status"
@echo ""
@echo "Updates:"
@echo " make update-channels - Regenerate config and restart feed-master"
@echo ""
@echo "Individual Services:"
@echo " make logs-feed - View feed-master logs"
@echo " make logs-bridge - View rss-bridge logs"
@echo " make logs-app - View TLC Search logs"
@echo " make restart-feed - Restart feed-master only"
# Generate feed-master configuration from channels.yml
config:
@echo "Generating feed-master configuration..."
python3 -m python_app.generate_feed_config_simple
@echo "Configuration updated!"
# Start all services
up:
docker compose up -d
@echo ""
@echo "Services started!"
@echo " - RSS Bridge: http://localhost:3001"
@echo " - Feed Master: http://localhost:8097/rss/youtube-unified"
@echo " - TLC Search: http://localhost:8080"
# Stop all services
down:
docker compose down
# Restart all services
restart:
docker compose restart
# View all logs
logs:
docker compose logs -f
# View feed-master logs
logs-feed:
docker compose logs -f feed-master
# View rss-bridge logs
logs-bridge:
docker compose logs -f rss-bridge
# View TLC Search logs
logs-app:
docker compose logs -f app
# Check service status
status:
@docker compose ps
@echo ""
@echo "Endpoints:"
@echo " - RSS Bridge: http://localhost:3001"
@echo " - Feed Master: http://localhost:8097/rss/youtube-unified"
@echo " - TLC Search: http://localhost:8080"
# Restart only feed-master
restart-feed:
docker compose restart feed-master
# Regenerate feed-master configuration from channels.yml and restart feed-master
update-channels:
@echo "Regenerating feed-master configuration..."
python3 -m python_app.generate_feed_config_simple
@echo ""
@echo "Restarting feed-master..."
docker compose restart feed-master
@echo ""
@echo "Update complete!"

162
channel_config.py Normal file
View File

@ -0,0 +1,162 @@
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
# Matches "youtube.com/channel/<id>" (scheme and "www." optional) and captures
# <id>, i.e. everything up to the next "/", "?" or "#".
_CHANNEL_ID_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/channel/([^/?#]+)")
# Matches "youtube.com/@<handle>" (scheme and "www." optional) and captures
# <handle> without the leading "@", up to the next "/", "?" or "#".
_HANDLE_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/@([^/?#]+)")
def _strip_quotes(value: str) -> str:
    """Remove one pair of matching surrounding quotes from *value*.

    The string is returned unchanged when it is shorter than two characters
    or when its first and last characters are not the same quote character
    (single or double).
    """
    if len(value) < 2:
        return value
    first, last = value[0], value[-1]
    if first == last and first in ("'", '"'):
        return value[1:-1]
    return value
def _parse_yaml_channels(text: str) -> List[Dict[str, str]]:
    """Parse the flat ``channels:`` list format into a list of dicts.

    This is a minimal line-oriented reader, not a general YAML parser:

    * blank lines, lines starting with ``#`` and the ``channels:`` header
      are ignored;
    * a line starting with ``- `` begins a new entry (and may carry the
      entry's first ``key: value`` pair);
    * every other line containing ``:`` is split on the first colon and
      stored on the current entry, with surrounding quotes removed from
      the value;
    * lines without a colon are ignored.

    Entries that end up with no keys are not emitted.
    """
    parsed: List[Dict[str, str]] = []
    entry: Dict[str, str] = {}

    for raw in text.splitlines():
        stripped = raw.strip()
        if stripped == "" or stripped[0] == "#" or stripped == "channels:":
            continue

        if stripped[:2] == "- ":
            # List marker: flush the entry collected so far and start anew.
            if entry:
                parsed.append(entry)
            entry = {}
            stripped = stripped[2:].strip()

        # Split on the first colon only so URL values keep their "://".
        field, separator, remainder = stripped.partition(":")
        if not separator:
            continue
        entry[field.strip()] = _strip_quotes(remainder.strip())

    if entry:
        parsed.append(entry)
    return parsed
def _extract_from_url(url: str) -> Dict[str, Optional[str]]:
    """Pull the channel ID and/or handle out of a YouTube URL.

    Returns a dict with the keys ``"id"`` and ``"handle"``. Each value is
    the first capture of the corresponding module-level pattern, or
    ``None`` when that pattern does not occur anywhere in *url*.
    """
    id_hit = _CHANNEL_ID_PATTERN.search(url)
    handle_hit = _HANDLE_PATTERN.search(url)
    return {
        "id": id_hit.group(1) if id_hit else None,
        "handle": handle_hit.group(1) if handle_hit else None,
    }
def _normalize_handle(handle: Optional[str]) -> Optional[str]:
    """Return *handle* without leading ``@`` characters or surrounding whitespace.

    Returns ``None`` when *handle* is ``None``, empty, or contains nothing
    but ``@`` characters and whitespace.
    """
    if not handle:
        return None
    # Trim whitespace first: otherwise a value such as " @name" keeps its "@"
    # because lstrip("@") stops at the leading space.
    cleaned = handle.strip().lstrip("@").strip()
    return cleaned or None
def _parse_bool(value: Optional[object]) -> Optional[bool]:
    """Interpret *value* as a boolean flag.

    Real ``bool`` values are returned as-is and ``None`` stays ``None``.
    Anything else is converted with ``str()``, trimmed and lower-cased:
    ``1/true/yes/y`` give ``True``, ``0/false/no/n`` give ``False`` and any
    other text gives ``None`` (meaning "not specified / not understood").
    """
    if value is None:
        return None
    if isinstance(value, bool):
        return value

    recognised = {
        "1": True,
        "true": True,
        "yes": True,
        "y": True,
        "0": False,
        "false": False,
        "no": False,
        "n": False,
    }
    return recognised.get(str(value).strip().lower())
def _normalize_entry(entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert one raw channel record into the canonical entry shape.

    Recognised input keys (first alternative wins):

    * ``id`` / ``channel_id``
    * ``handle`` / ``username``
    * ``url``
    * ``name``
    * ``rss_enabled`` / ``rss`` / ``include_in_feed``

    Missing pieces are filled in from each other: an id or handle is
    extracted from ``url`` when not given explicitly, a ``url`` is built from
    the id (preferred) or handle when absent, and ``name`` falls back to the
    handle or id.

    Returns:
        A dict with the keys ``id``, ``handle``, ``name``, ``url`` (strings,
        ``""`` for an unknown id/handle) and ``rss_enabled`` (bool, ``True``
        unless the flag is present and parses as false), or ``None`` when
        neither a name nor a URL can be determined.
    """
    channel_id = entry.get("id") or entry.get("channel_id")
    handle = _normalize_handle(entry.get("handle") or entry.get("username"))
    url = entry.get("url")
    name = entry.get("name")

    # Take the first flag key that is actually set. An `a or b or c` chain
    # would skip an explicit boolean False (e.g. JSON "rss_enabled": false)
    # and silently leave the channel enabled.
    rss_flag: Optional[bool] = None
    for flag_key in ("rss_enabled", "rss", "include_in_feed"):
        candidate = entry.get(flag_key)
        if candidate is None or candidate == "":
            continue
        rss_flag = _parse_bool(candidate)
        break

    if url:
        extracted = _extract_from_url(url)
        channel_id = channel_id or extracted.get("id")
        handle = handle or extracted.get("handle")
    elif channel_id:
        url = f"https://www.youtube.com/channel/{channel_id}"
    elif handle:
        url = f"https://www.youtube.com/@{handle}"

    if not name:
        name = handle or channel_id
    if not name or not url:
        return None

    return {
        "id": channel_id or "",
        "handle": handle or "",
        "name": name,
        "url": url,
        # An absent or unparseable flag means "include in the feed".
        "rss_enabled": True if rss_flag is None else rss_flag,
    }
def load_channel_entries(path: Path) -> List[Dict[str, str]]:
    """Load, normalize and sort the channel list stored at *path*.

    A ``.json`` file may hold either a list of channel objects or an object
    with a ``channels`` list; any other suffix is read with the module's
    minimal ``channels:`` YAML reader. Each record is passed through
    ``_normalize_entry``; records that are not dicts or cannot be normalized
    are dropped. ``None`` values are omitted, booleans are kept, and all
    other values are converted to stripped strings before normalization.

    Note that, despite the annotation, each returned entry also carries a
    boolean ``rss_enabled`` value.

    Args:
        path: Location of the channel list file.

    Returns:
        Normalized entries sorted case-insensitively by ``name``.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() == ".json":
        payload = json.loads(text)
        raw_entries = payload.get("channels") if isinstance(payload, dict) else payload
    else:
        raw_entries = _parse_yaml_channels(text)

    # A JSON document with `"channels": null`, or a scalar at the top level,
    # has no entries; treat it like a missing "channels" key rather than
    # failing with TypeError while iterating.
    if not isinstance(raw_entries, list):
        raw_entries = []

    entries: List[Dict[str, str]] = []
    for raw in raw_entries:
        if not isinstance(raw, dict):
            continue
        raw_payload: Dict[str, Any] = {
            str(key).strip(): value if isinstance(value, bool) else str(value).strip()
            for key, value in raw.items()
            if value is not None
        }
        normalized = _normalize_entry(raw_payload)
        if normalized:
            entries.append(normalized)

    entries.sort(key=lambda item: item["name"].lower())
    return entries
def build_rss_bridge_url(entry: Dict[str, str], rss_bridge_host: str = "rss-bridge") -> Optional[str]:
    """Build the RSS-Bridge YoutubeBridge feed URL (Mrss format) for *entry*.

    The channel ``id`` is preferred (``By channel id`` context); the
    ``handle`` is used only when no id is present (``By username`` context).

    Args:
        entry: Channel entry; the ``id`` and ``handle`` keys are read.
        rss_bridge_host: Host (optionally ``host:port``) of the RSS-Bridge
            instance, inserted verbatim after ``http://``.

    Returns:
        The feed URL, or ``None`` when the entry has neither an id nor a
        usable handle.
    """
    # Local import keeps this block self-contained; the module does not
    # otherwise need urllib.
    from urllib.parse import quote

    base = f"http://{rss_bridge_host}/?action=display&bridge=YoutubeBridge"

    channel_id = entry.get("id") or ""
    if channel_id:
        # Percent-encode so characters such as "&", "=" or spaces cannot
        # break out of the query parameter. Plain ids are left unchanged.
        encoded_id = quote(str(channel_id), safe="")
        return f"{base}&context=By+channel+id&c={encoded_id}&format=Mrss"

    handle = _normalize_handle(entry.get("handle"))
    if handle:
        encoded_handle = quote(handle, safe="")
        return f"{base}&context=By+username&u={encoded_handle}&format=Mrss"

    return None

258
channels.yml Normal file
View File

@ -0,0 +1,258 @@
# Shared YouTube Channel Configuration
# Used by both TLC Search (transcript collection) and Feed Master (RSS aggregation)
channels:
- id: UCCebR16tXbv5Ykk9_WtCCug
name: Channel UCCebR16tXbv
url: https://www.youtube.com/channel/UCCebR16tXbv5Ykk9_WtCCug/videos
- id: UC6vg0HkKKlgsWk-3HfV-vnw
name: A Quality Existence
url: https://www.youtube.com/channel/UC6vg0HkKKlgsWk-3HfV-vnw/videos
- id: UCeWWxwzgLYUbfjWowXhVdYw
name: Andrea with the Bangs
url: https://www.youtube.com/channel/UCeWWxwzgLYUbfjWowXhVdYw/videos
- id: UC952hDf_C4nYJdqwK7VzTxA
name: Charlie's Little Corner
url: https://www.youtube.com/channel/UC952hDf_C4nYJdqwK7VzTxA/videos
- id: UCU5SNBfTo4umhjYz6M0Jsmg
name: Christian Baxter
url: https://www.youtube.com/channel/UCU5SNBfTo4umhjYz6M0Jsmg/videos
- id: UC6Tvr9mBXNaAxLGRA_sUSRA
name: Finding Ideas
url: https://www.youtube.com/channel/UC6Tvr9mBXNaAxLGRA_sUSRA/videos
- id: UC4Rmxg7saTfwIpvq3QEzylQ
name: Ein Sof - Infinite Reflections
url: https://www.youtube.com/channel/UC4Rmxg7saTfwIpvq3QEzylQ/videos
- id: UCTdH4nh6JTcfKUAWvmnPoIQ
name: Eric Seitz
url: https://www.youtube.com/channel/UCTdH4nh6JTcfKUAWvmnPoIQ/videos
- id: UCsi_x8c12NW9FR7LL01QXKA
name: Grail Country
url: https://www.youtube.com/channel/UCsi_x8c12NW9FR7LL01QXKA/videos
- id: UCAqTQ5yLHHH44XWwWXLkvHQ
name: Grizwald Grim
url: https://www.youtube.com/channel/UCAqTQ5yLHHH44XWwWXLkvHQ/videos
- id: UCprytROeCztMOMe8plyJRMg
name: faturechi
url: https://www.youtube.com/channel/UCprytROeCztMOMe8plyJRMg/videos
- id: UCpqDUjTsof-kTNpnyWper_Q
name: John Vervaeke
url: https://www.youtube.com/channel/UCpqDUjTsof-kTNpnyWper_Q/videos
- id: UCL_f53ZEJxp8TtlOkHwMV9Q
name: Jordan B Peterson
url: https://www.youtube.com/channel/UCL_f53ZEJxp8TtlOkHwMV9Q/videos
- id: UCez1fzMRGctojfis2lfRYug
name: Lucas Vos
url: https://www.youtube.com/channel/UCez1fzMRGctojfis2lfRYug/videos
- id: UC2leFZRD0ZlQDQxpR2Zd8oA
name: Mary Kochan
url: https://www.youtube.com/channel/UC2leFZRD0ZlQDQxpR2Zd8oA/videos
- id: UC8SErJkYnDsYGh1HxoZkl-g
name: Sartori Studios
url: https://www.youtube.com/channel/UC8SErJkYnDsYGh1HxoZkl-g/videos
- id: UCEPOn4cgvrrerg_-q_Ygw1A
name: More Christ
url: https://www.youtube.com/channel/UCEPOn4cgvrrerg_-q_Ygw1A/videos
- id: UC2yCyOMUeem-cYwliC-tLJg
name: Paul Anleitner
url: https://www.youtube.com/channel/UC2yCyOMUeem-cYwliC-tLJg/videos
- id: UCGsDIP_K6J6VSTqlq-9IPlg
name: Paul VanderKlay
url: https://www.youtube.com/channel/UCGsDIP_K6J6VSTqlq-9IPlg/videos
- id: UCEzWTLDYmL8soRdQec9Fsjw
name: Randos United
url: https://www.youtube.com/channel/UCEzWTLDYmL8soRdQec9Fsjw/videos
- id: UC1KgNsMdRoIA_njVmaDdHgA
name: Randos United 2
url: https://www.youtube.com/channel/UC1KgNsMdRoIA_njVmaDdHgA/videos
- id: UCFQ6Gptuq-sLflbJ4YY3Umw
name: Rebel Wisdom
url: https://www.youtube.com/channel/UCFQ6Gptuq-sLflbJ4YY3Umw/videos
- id: UCEY1vGNBPsC3dCatZyK3Jkw
name: Strange Theology
url: https://www.youtube.com/channel/UCEY1vGNBPsC3dCatZyK3Jkw/videos
- id: UCIAtCuzdvgNJvSYILnHtdWA
name: The Anadromist
url: https://www.youtube.com/channel/UCIAtCuzdvgNJvSYILnHtdWA/videos
- id: UClIDP7_Kzv_7tDQjTv9EhrA
name: The Chris Show
url: https://www.youtube.com/channel/UClIDP7_Kzv_7tDQjTv9EhrA/videos
- id: UC-QiBn6GsM3JZJAeAQpaGAA
name: TheCommonToad
url: https://www.youtube.com/channel/UC-QiBn6GsM3JZJAeAQpaGAA/videos
- id: UCiJmdXTb76i8eIPXdJyf8ZQ
name: Channel UCiJmdXTb76i
url: https://www.youtube.com/channel/UCiJmdXTb76i8eIPXdJyf8ZQ/videos
- id: UCM9Z05vuQhMEwsV03u6DrLA
name: Cassidy van der Kamp
url: https://www.youtube.com/channel/UCM9Z05vuQhMEwsV03u6DrLA/videos
- id: UCgp_r6WlBwDSJrP43Mz07GQ
name: The Meaning Code
url: https://www.youtube.com/channel/UCgp_r6WlBwDSJrP43Mz07GQ/videos
- id: UC5uv-BxzCrN93B_5qbOdRWw
name: TheScrollersPodcast
url: https://www.youtube.com/channel/UC5uv-BxzCrN93B_5qbOdRWw/videos
- id: UCtCTSf3UwRU14nYWr_xm-dQ
name: Jonathan Pageau
url: https://www.youtube.com/channel/UCtCTSf3UwRU14nYWr_xm-dQ/videos
- id: UC1a4VtU_SMSfdRiwMJR33YQ
name: The Young Levite
url: https://www.youtube.com/channel/UC1a4VtU_SMSfdRiwMJR33YQ/videos
- id: UCg7Ed0lecvko58ibuX1XHng
name: Transfigured
url: https://www.youtube.com/channel/UCg7Ed0lecvko58ibuX1XHng/videos
- id: UCMVG5eqpYFVEB-a9IqAOuHA
name: President Foxman
url: https://www.youtube.com/channel/UCMVG5eqpYFVEB-a9IqAOuHA/videos
- id: UC8mJqpS_EBbMcyuzZDF0TEw
name: Neal Daedalus
url: https://www.youtube.com/channel/UC8mJqpS_EBbMcyuzZDF0TEw/videos
- id: UCGHuURJ1XFHzPSeokf6510A
name: Aphrael Pilotson
url: https://www.youtube.com/channel/UCGHuURJ1XFHzPSeokf6510A/videos
- id: UC704NVL2DyzYg3rMU9r1f7A
handle: chrishoward8473
name: Chris Howard
url: https://www.youtube.com/@chrishoward8473/videos
- id: UChptV-kf8lnncGh7DA2m8Pw
name: Shoulder Serf
url: https://www.youtube.com/channel/UChptV-kf8lnncGh7DA2m8Pw/videos
- id: UCzX6R3ZLQh5Zma_5AsPcqPA
name: Restoring Meaning
url: https://www.youtube.com/channel/UCzX6R3ZLQh5Zma_5AsPcqPA/videos
- id: UCiukuaNd_qzRDTW9qe2OC1w
name: Kale Zelden
url: https://www.youtube.com/channel/UCiukuaNd_qzRDTW9qe2OC1w/videos
- id: UC5yLuFQCms4nb9K2bGQLqIw
name: Ron Copperman
url: https://www.youtube.com/channel/UC5yLuFQCms4nb9K2bGQLqIw/videos
- id: UCVdSgEf9bLXFMBGSMhn7x4Q
name: Mark D Parker
url: https://www.youtube.com/channel/UCVdSgEf9bLXFMBGSMhn7x4Q/videos
- id: UC_dnk5D4tFCRYCrKIcQlcfw
name: Luke Thompson
url: https://www.youtube.com/channel/UC_dnk5D4tFCRYCrKIcQlcfw/videos
- id: UCT8Lq3ufaGEnCSS8WpFatqw
handle: Freerilian
name: Free Rilian
url: https://www.youtube.com/@Freerilian/videos
- id: UC977g6oGYIJDQnsZOGjQBBA
handle: marks.-ry7bm
name: Mark S
url: https://www.youtube.com/@marks.-ry7bm/videos
- id: UCbD1Pm0TOcRK2zaCrwgcTTg
handle: Adams-Fall
name: Adams Fall
url: https://www.youtube.com/@Adams-Fall/videos
- id: UCnojyPW0IgLWTQ0SaDQ1KBA
handle: mcmosav
name: mcmosav
url: https://www.youtube.com/@mcmosav/videos
- id: UCiOZYvBGHw1Y6wyzffwEp9g
handle: Landbeorht
name: Joseph Lambrecht
url: https://www.youtube.com/@Landbeorht/videos
- id: UCAXyF_HFeMgwS8nkGVeroAA
handle: Corner_Citizen
name: Corner Citizen
url: https://www.youtube.com/@Corner_Citizen/videos
- id: UCv2Qft5mZrmA9XAwnl9PU-g
handle: ethan.caughey
name: Ethan Caughey
url: https://www.youtube.com/@ethan.caughey/videos
- id: UCMJCtS8jKouJ2d8UIYzW3vg
handle: MarcInTbilisi
name: Marc Jackson
url: https://www.youtube.com/@MarcInTbilisi/videos
- id: UCk9O91WwruXmgu1NQrKZZEw
handle: climbingmt.sophia
name: Climbing Mt Sophia
url: https://www.youtube.com/@climbingmt.sophia/videos
- id: UCUSyTPWW4JaG1YfUPddw47Q
handle: Skankenstein
name: Skankenstein
url: https://www.youtube.com/@Skankenstein/videos
- id: UCzw2FNI3IRphcAoVcUENOgQ
handle: UpCycleClub
name: UpCycleClub
url: https://www.youtube.com/@UpCycleClub/videos
- id: UCQ7rVoApmYIpcmU7fB9RPyw
handle: JessPurviance
name: Jesspurviance
url: https://www.youtube.com/@JessPurviance/videos
- id: UCrZyTWGMdRM9_P26RKPvh3A
handle: greyhamilton52
name: Grey Hamilton
url: https://www.youtube.com/@greyhamilton52/videos
- id: UCDCfI162vhPvwdxW6X4nmiw
handle: paulrenenichols
name: Paul Rene Nichols
url: https://www.youtube.com/@paulrenenichols/videos
- id: UCFLovlJ8RFApfjrf2y157xg
handle: OfficialSecularKoranism
name: Secular Koranism
url: https://www.youtube.com/@OfficialSecularKoranism/videos
- id: UC_-YQbnPfBbIezMr1adZZiQ
handle: FromWhomAllBlessingsFlow
name: From Whom All Blessings Flow
url: https://www.youtube.com/@FromWhomAllBlessingsFlow/videos
- id: UCn5mf-fcpBmkepIpZ8eFRng
handle: FoodTruckEmily
name: Emily Rajeh
url: https://www.youtube.com/@FoodTruckEmily/videos
- id: UC6zHDj4D323xJkblnPTvY3Q
handle: O.G.Rose.Michelle.and.Daniel
name: OG Rose
url: https://www.youtube.com/@O.G.Rose.Michelle.and.Daniel/videos
- id: UC4GiA5Hnwy415uVRymxPK-w
handle: JonathanDumeer
name: Jonathan Dumeer
url: https://www.youtube.com/@JonathanDumeer/videos
- id: UCMzT-mdCqoyEv_-YZVtE7MQ
handle: JordanGreenhall
name: Jordan Hall
url: https://www.youtube.com/@JordanGreenhall/videos
- id: UC5goUoFM4LPim4eY4pwRXYw
handle: NechamaGluck
name: Nechama Gluck
url: https://www.youtube.com/@NechamaGluck/videos
- id: UCPUVeoQYyq8cndWwyczX6RA
handle: justinsmorningcoffee
name: Justinsmorningcoffee
url: https://www.youtube.com/@justinsmorningcoffee/videos
- id: UCB0C8DEIQlQzvSGuGriBxtA
handle: grahampardun
name: Grahampardun
url: https://www.youtube.com/@grahampardun/videos
- id: UCpLJJLVB_7v4Igq-9arja1A
handle: michaelmartin8681
name: Michaelmartin8681
url: https://www.youtube.com/@michaelmartin8681/videos
- id: UCxV18lwwh29DiWuooz7UCvg
handle: davidbusuttil9086
name: Davidbusuttil9086
url: https://www.youtube.com/@davidbusuttil9086/videos
- id: UCosBhpwwGh_ueYq4ZSi5dGw
handle: matthewparlato5626
name: Matthewparlato5626
url: https://www.youtube.com/@matthewparlato5626/videos
- id: UCwF5LWNOFou_50bT65bq4Bg
handle: lancecleaver227
name: Lancecleaver227
url: https://www.youtube.com/@lancecleaver227/videos
- id: UCaJ0CqiiMSTq4X0rycUOIjw
handle: theplebistocrat
name: the plebistocrat
url: https://www.youtube.com/@theplebistocrat/videos
- id: UCZA5mUAyYcCL1kYgxbeMNrA
handle: RightInChrist
name: Rightinchrist
url: https://www.youtube.com/@RightInChrist/videos
- id: UCDIPXp88qjAV3TiaR5Uo3iQ
handle: RafeKelley
name: Rafekelley
url: https://www.youtube.com/@RafeKelley/videos
- id: UCedgru6YCto3zyXjlbuQuqA
handle: WavesOfObsession
name: Wavesofobsession
url: https://www.youtube.com/@WavesOfObsession/videos

View File

@ -6,6 +6,9 @@ Environment Variables:
ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials. ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials.
ELASTIC_INDEX: Target index name (default: this_little_corner_py). ELASTIC_INDEX: Target index name (default: this_little_corner_py).
LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata). LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata).
CHANNELS_PATH: Path to the canonical channel list (default: ./channels.yml).
RSS_FEED_URL: Public URL/path for the unified RSS feed (default: /rss/youtube-unified).
RSS_FEED_UPSTREAM: Base URL to proxy feed requests (default: http://localhost:8097).
YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube. YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube.
RATE_LIMIT_ENABLED: Toggle API rate limiting (default: 1). RATE_LIMIT_ENABLED: Toggle API rate limiting (default: 1).
RATE_LIMIT_REQUESTS: Max requests per window per client (default: 60). RATE_LIMIT_REQUESTS: Max requests per window per client (default: 60).
@ -74,6 +77,9 @@ class AppConfig:
qdrant_vector_name: Optional[str] qdrant_vector_name: Optional[str]
qdrant_vector_size: int qdrant_vector_size: int
qdrant_embed_model: str qdrant_embed_model: str
channels_path: Path
rss_feed_url: str
rss_feed_upstream: str
def _env(name: str, default: Optional[str] = None) -> Optional[str]: def _env(name: str, default: Optional[str] = None) -> Optional[str]:
@ -110,6 +116,11 @@ def load_config() -> AppConfig:
requests=max(int(_env("RATE_LIMIT_REQUESTS", "60")), 0), requests=max(int(_env("RATE_LIMIT_REQUESTS", "60")), 0),
window_seconds=max(int(_env("RATE_LIMIT_WINDOW_SECONDS", "60")), 1), window_seconds=max(int(_env("RATE_LIMIT_WINDOW_SECONDS", "60")), 1),
) )
channels_path = Path(
_env("CHANNELS_PATH", str(Path(__file__).parent / "channels.yml"))
).expanduser()
rss_feed_url = _env("RSS_FEED_URL", "/rss/youtube-unified")
rss_feed_upstream = _env("RSS_FEED_UPSTREAM", "http://localhost:8097")
return AppConfig( return AppConfig(
elastic=elastic, elastic=elastic,
data=data, data=data,
@ -120,6 +131,9 @@ def load_config() -> AppConfig:
qdrant_vector_name=_env("QDRANT_VECTOR_NAME"), qdrant_vector_name=_env("QDRANT_VECTOR_NAME"),
qdrant_vector_size=int(_env("QDRANT_VECTOR_SIZE", "1024")), qdrant_vector_size=int(_env("QDRANT_VECTOR_SIZE", "1024")),
qdrant_embed_model=_env("QDRANT_EMBED_MODEL", "BAAI/bge-large-en-v1.5"), qdrant_embed_model=_env("QDRANT_EMBED_MODEL", "BAAI/bge-large-en-v1.5"),
channels_path=channels_path,
rss_feed_url=rss_feed_url or "",
rss_feed_upstream=rss_feed_upstream or "",
) )

View File

@ -1,8 +1,47 @@
version: "3.9" version: "3.9"
# Runs only the Flask app container, pointing to remote Elasticsearch/Qdrant. # TLC Search + Feed Master - Complete YouTube content indexing & RSS aggregation
# Provide ELASTIC_URL / QDRANT_URL (and related) via environment or a .env file. # Provide ELASTIC_URL / QDRANT_URL (and related) via environment or a .env file.
services: services:
# RSS Bridge - Converts YouTube channels to RSS feeds
rss-bridge:
image: rssbridge/rss-bridge:latest
container_name: tlc-rss-bridge
hostname: rss-bridge
restart: unless-stopped
logging:
driver: json-file
options:
max-size: "10m"
max-file: "5"
ports:
- "3001:80"
# Feed Master - Aggregates multiple RSS feeds into unified feed
feed-master:
image: umputun/feed-master:latest
container_name: tlc-feed-master
hostname: feed-master
restart: unless-stopped
depends_on:
- rss-bridge
logging:
driver: json-file
options:
max-size: "10m"
max-file: "5"
environment:
- DEBUG=false
- FM_DB=/srv/var/feed-master.bdb
- FM_CONF=/srv/etc/fm.yml
volumes:
- ./feed-master-config:/srv/etc
- ./feed-master-config/var:/srv/var
- ./feed-master-config/images:/srv/images
ports:
- "8097:8080"
# TLC Search - Flask app for searching YouTube transcripts
app: app:
build: build:
context: . context: .
@ -16,6 +55,9 @@ services:
ELASTIC_PASSWORD: ${ELASTIC_PASSWORD:-} ELASTIC_PASSWORD: ${ELASTIC_PASSWORD:-}
ELASTIC_API_KEY: ${ELASTIC_API_KEY:-} ELASTIC_API_KEY: ${ELASTIC_API_KEY:-}
ELASTIC_VERIFY_CERTS: ${ELASTIC_VERIFY_CERTS:-0} ELASTIC_VERIFY_CERTS: ${ELASTIC_VERIFY_CERTS:-0}
CHANNELS_PATH: ${CHANNELS_PATH:-/app/python_app/channels.yml}
RSS_FEED_URL: ${RSS_FEED_URL:-/rss/youtube-unified}
RSS_FEED_UPSTREAM: ${RSS_FEED_UPSTREAM:-http://feed-master:8080}
QDRANT_URL: ${QDRANT_URL:?set QDRANT_URL to your remote Qdrant URL} QDRANT_URL: ${QDRANT_URL:?set QDRANT_URL to your remote Qdrant URL}
QDRANT_COLLECTION: ${QDRANT_COLLECTION:-tlc-captions-full} QDRANT_COLLECTION: ${QDRANT_COLLECTION:-tlc-captions-full}
QDRANT_VECTOR_NAME: ${QDRANT_VECTOR_NAME:-} QDRANT_VECTOR_NAME: ${QDRANT_VECTOR_NAME:-}
@ -23,4 +65,5 @@ services:
QDRANT_EMBED_MODEL: ${QDRANT_EMBED_MODEL:-BAAI/bge-large-en-v1.5} QDRANT_EMBED_MODEL: ${QDRANT_EMBED_MODEL:-BAAI/bge-large-en-v1.5}
LOCAL_DATA_DIR: ${LOCAL_DATA_DIR:-/app/data/video_metadata} LOCAL_DATA_DIR: ${LOCAL_DATA_DIR:-/app/data/video_metadata}
volumes: volumes:
- ./channels.yml:/app/python_app/channels.yml:ro
- ./data:/app/data:ro - ./data:/app/data:ro

166
feed-master-config/fm.yml Normal file
View File

@ -0,0 +1,166 @@
# Feed Master Configuration
# Auto-generated from channels.yml
# Do not edit manually - regenerate using generate_feed_config_simple.py
feeds:
youtube-unified:
title: YouTube Unified Feed
description: Aggregated feed from all YouTube channels
link: https://youtube.com
language: "en-us"
sources:
- name: A Quality Existence
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6vg0HkKKlgsWk-3HfV-vnw&format=Mrss
- name: Adams Fall
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCbD1Pm0TOcRK2zaCrwgcTTg&format=Mrss
- name: Andrea with the Bangs
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCeWWxwzgLYUbfjWowXhVdYw&format=Mrss
- name: Aphrael Pilotson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCGHuURJ1XFHzPSeokf6510A&format=Mrss
- name: Cassidy van der Kamp
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCM9Z05vuQhMEwsV03u6DrLA&format=Mrss
- name: Channel UCCebR16tXbv
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCCebR16tXbv5Ykk9_WtCCug&format=Mrss
- name: Channel UCiJmdXTb76i
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiJmdXTb76i8eIPXdJyf8ZQ&format=Mrss
- name: Charlie's Little Corner
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC952hDf_C4nYJdqwK7VzTxA&format=Mrss
- name: Chris Howard
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC704NVL2DyzYg3rMU9r1f7A&format=Mrss
- name: Christian Baxter
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCU5SNBfTo4umhjYz6M0Jsmg&format=Mrss
- name: Climbing Mt Sophia
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCk9O91WwruXmgu1NQrKZZEw&format=Mrss
- name: Corner Citizen
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCAXyF_HFeMgwS8nkGVeroAA&format=Mrss
- name: Davidbusuttil9086
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCxV18lwwh29DiWuooz7UCvg&format=Mrss
- name: Ein Sof - Infinite Reflections
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC4Rmxg7saTfwIpvq3QEzylQ&format=Mrss
- name: Emily Rajeh
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCn5mf-fcpBmkepIpZ8eFRng&format=Mrss
- name: Eric Seitz
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCTdH4nh6JTcfKUAWvmnPoIQ&format=Mrss
- name: Ethan Caughey
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCv2Qft5mZrmA9XAwnl9PU-g&format=Mrss
- name: faturechi
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCprytROeCztMOMe8plyJRMg&format=Mrss
- name: Finding Ideas
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6Tvr9mBXNaAxLGRA_sUSRA&format=Mrss
- name: Free Rilian
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCT8Lq3ufaGEnCSS8WpFatqw&format=Mrss
- name: From Whom All Blessings Flow
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC_-YQbnPfBbIezMr1adZZiQ&format=Mrss
- name: Grahampardun
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCB0C8DEIQlQzvSGuGriBxtA&format=Mrss
- name: Grail Country
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCsi_x8c12NW9FR7LL01QXKA&format=Mrss
- name: Grey Hamilton
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCrZyTWGMdRM9_P26RKPvh3A&format=Mrss
- name: Grizwald Grim
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCAqTQ5yLHHH44XWwWXLkvHQ&format=Mrss
- name: Jesspurviance
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCQ7rVoApmYIpcmU7fB9RPyw&format=Mrss
- name: John Vervaeke
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCpqDUjTsof-kTNpnyWper_Q&format=Mrss
- name: Jonathan Dumeer
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC4GiA5Hnwy415uVRymxPK-w&format=Mrss
- name: Jonathan Pageau
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCtCTSf3UwRU14nYWr_xm-dQ&format=Mrss
- name: Jordan B Peterson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCL_f53ZEJxp8TtlOkHwMV9Q&format=Mrss
- name: Jordan Hall
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMzT-mdCqoyEv_-YZVtE7MQ&format=Mrss
- name: Joseph Lambrecht
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiOZYvBGHw1Y6wyzffwEp9g&format=Mrss
- name: Justinsmorningcoffee
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCPUVeoQYyq8cndWwyczX6RA&format=Mrss
- name: Kale Zelden
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCiukuaNd_qzRDTW9qe2OC1w&format=Mrss
- name: Lancecleaver227
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCwF5LWNOFou_50bT65bq4Bg&format=Mrss
- name: Lucas Vos
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCez1fzMRGctojfis2lfRYug&format=Mrss
- name: Luke Thompson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC_dnk5D4tFCRYCrKIcQlcfw&format=Mrss
- name: Marc Jackson
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMJCtS8jKouJ2d8UIYzW3vg&format=Mrss
- name: Mark D Parker
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCVdSgEf9bLXFMBGSMhn7x4Q&format=Mrss
- name: Mark S
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC977g6oGYIJDQnsZOGjQBBA&format=Mrss
- name: Mary Kochan
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC2leFZRD0ZlQDQxpR2Zd8oA&format=Mrss
- name: Matthewparlato5626
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCosBhpwwGh_ueYq4ZSi5dGw&format=Mrss
- name: mcmosav
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCnojyPW0IgLWTQ0SaDQ1KBA&format=Mrss
- name: Michaelmartin8681
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCpLJJLVB_7v4Igq-9arja1A&format=Mrss
- name: More Christ
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEPOn4cgvrrerg_-q_Ygw1A&format=Mrss
- name: Neal Daedalus
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC8mJqpS_EBbMcyuzZDF0TEw&format=Mrss
- name: Nechama Gluck
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5goUoFM4LPim4eY4pwRXYw&format=Mrss
- name: OG Rose
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC6zHDj4D323xJkblnPTvY3Q&format=Mrss
- name: Paul Anleitner
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC2yCyOMUeem-cYwliC-tLJg&format=Mrss
- name: Paul Rene Nichols
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCDCfI162vhPvwdxW6X4nmiw&format=Mrss
- name: Paul VanderKlay
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCGsDIP_K6J6VSTqlq-9IPlg&format=Mrss
- name: President Foxman
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCMVG5eqpYFVEB-a9IqAOuHA&format=Mrss
- name: Rafekelley
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCDIPXp88qjAV3TiaR5Uo3iQ&format=Mrss
- name: Randos United
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEzWTLDYmL8soRdQec9Fsjw&format=Mrss
- name: Randos United 2
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC1KgNsMdRoIA_njVmaDdHgA&format=Mrss
- name: Rebel Wisdom
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCFQ6Gptuq-sLflbJ4YY3Umw&format=Mrss
- name: Restoring Meaning
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCzX6R3ZLQh5Zma_5AsPcqPA&format=Mrss
- name: Rightinchrist
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCZA5mUAyYcCL1kYgxbeMNrA&format=Mrss
- name: Ron Copperman
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5yLuFQCms4nb9K2bGQLqIw&format=Mrss
- name: Sartori Studios
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC8SErJkYnDsYGh1HxoZkl-g&format=Mrss
- name: Secular Koranism
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCFLovlJ8RFApfjrf2y157xg&format=Mrss
- name: Shoulder Serf
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UChptV-kf8lnncGh7DA2m8Pw&format=Mrss
- name: Skankenstein
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCUSyTPWW4JaG1YfUPddw47Q&format=Mrss
- name: Strange Theology
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCEY1vGNBPsC3dCatZyK3Jkw&format=Mrss
- name: The Anadromist
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCIAtCuzdvgNJvSYILnHtdWA&format=Mrss
- name: The Chris Show
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UClIDP7_Kzv_7tDQjTv9EhrA&format=Mrss
- name: The Meaning Code
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCgp_r6WlBwDSJrP43Mz07GQ&format=Mrss
- name: the plebistocrat
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCaJ0CqiiMSTq4X0rycUOIjw&format=Mrss
- name: The Young Levite
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC1a4VtU_SMSfdRiwMJR33YQ&format=Mrss
- name: TheCommonToad
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC-QiBn6GsM3JZJAeAQpaGAA&format=Mrss
- name: TheScrollersPodcast
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UC5uv-BxzCrN93B_5qbOdRWw&format=Mrss
- name: Transfigured
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCg7Ed0lecvko58ibuX1XHng&format=Mrss
- name: UpCycleClub
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCzw2FNI3IRphcAoVcUENOgQ&format=Mrss
- name: Wavesofobsession
url: http://rss-bridge/?action=display&bridge=YoutubeBridge&context=By+channel+id&c=UCedgru6YCto3zyXjlbuQuqA&format=Mrss
system:
update: 5m
max_per_feed: 5
max_total: 200
max_keep: 1000
base_url: http://localhost:8097

91
generate_feed_config.py Normal file
View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""
Generate feed-master configuration from channels.yml.
This ensures a single source of truth for the YouTube channels.
"""
import sys
from pathlib import Path
from .channel_config import build_rss_bridge_url, load_channel_entries
def generate_fm_config(channels_file, output_file, rss_bridge_host="rss-bridge"):
    """Generate the feed-master YAML configuration from channels.yml.

    Args:
        channels_file: Path of the channels.yml file to read.
        output_file: Path of the feed-master configuration file to write.
        rss_bridge_host: Host name forwarded to ``build_rss_bridge_url``.

    Channels whose ``rss_enabled`` value is falsy, or for which
    ``build_rss_bridge_url`` returns nothing, are skipped; both counts are
    printed in the summary.
    """
    # Function-scope import so this fix does not depend on the file header.
    import json

    def yaml_scalar(value):
        # A JSON string is a valid YAML double-quoted scalar, so names that
        # contain ": ", " #", quotes or leading indicators cannot break the
        # generated document.
        return json.dumps(str(value), ensure_ascii=False)

    print(f"Reading channels from {channels_file}")
    channels = load_channel_entries(Path(channels_file))
    print(f"Found {len(channels)} channels")

    # Header and static feed metadata.
    config = [
        "# Feed Master Configuration",
        "# Auto-generated from channels.yml",
        "# Do not edit manually - regenerate using generate_feed_config.py",
        "",
        "feeds:",
        "  youtube-unified:",
        "    title: YouTube Unified Feed",
        "    description: Aggregated feed from all YouTube channels",
        "    link: https://youtube.com",
        '    language: "en-us"',
        "    sources:",
    ]

    processed = 0
    skipped = 0
    for channel in channels:
        if not channel.get("rss_enabled", True):
            skipped += 1
            continue
        bridge_url = build_rss_bridge_url(channel, rss_bridge_host=rss_bridge_host)
        if not bridge_url:
            skipped += 1
            continue
        # `or` also covers an explicit `name: null` entry.
        name = channel.get("name") or "Unknown"
        config.append(f"      - name: {yaml_scalar(name)}")
        config.append(f"        url: {yaml_scalar(bridge_url)}")
        processed += 1

    # System configuration.
    config.extend(
        [
            "",
            "system:",
            "  update: 5m",
            "  max_per_feed: 5",
            "  max_total: 200",
            "  max_keep: 1000",
            "  base_url: http://localhost:8097",
        ]
    )

    print(f"\nProcessed {processed} channels, skipped {skipped}")
    # Explicit UTF-8 so non-ASCII channel names do not depend on the locale;
    # the file ends with a newline.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(config) + "\n")

    print(f"Configuration written to {output_file}")
    print("\nTo apply this configuration:")
    print(f"  1. Copy {output_file} to feed-master/etc/fm.yml")
    print("  2. Restart the feed-master service")
if __name__ == "__main__":
    # Defaults sit next to this script; argv[1] and argv[2] override the
    # channels file and the output file respectively.
    script_dir = Path(__file__).parent
    cli_args = sys.argv[1:]
    channels_file = (
        Path(cli_args[0]) if len(cli_args) >= 1 else script_dir / "channels.yml"
    )
    output_file = (
        Path(cli_args[1])
        if len(cli_args) >= 2
        else script_dir / "feed-master-config" / "fm.yml"
    )

    # Bail out with a usage hint when there is nothing to read.
    if not channels_file.exists():
        print(f"Error: {channels_file} not found", file=sys.stderr)
        print(f"\nUsage: {sys.argv[0]} [channels.yml] [output.yml]", file=sys.stderr)
        sys.exit(1)

    # Create the destination directory on demand before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    generate_fm_config(channels_file, output_file)

88
generate_feed_config_simple.py Executable file
View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""
Generate feed-master configuration from channels.yml.
Simplified version that doesn't require RSS-Bridge to be running.
"""
import sys
from pathlib import Path
from .channel_config import build_rss_bridge_url, load_channel_entries
def generate_fm_config(channels_file, output_file, rss_bridge_host="rss-bridge"):
    """Generate the feed-master YAML configuration from channels.yml.

    Args:
        channels_file: Path of the channels.yml file to read.
        output_file: Path of the feed-master configuration file to write.
        rss_bridge_host: Host name forwarded to ``build_rss_bridge_url``.

    Channels whose ``rss_enabled`` value is falsy, or for which
    ``build_rss_bridge_url`` returns nothing, are skipped; both counts are
    printed in the summary.
    """
    # Function-scope import so this fix does not depend on the file header.
    import json

    def yaml_scalar(value):
        # A JSON string is a valid YAML double-quoted scalar, so names that
        # contain ": ", " #", quotes or leading indicators cannot break the
        # generated document.
        return json.dumps(str(value), ensure_ascii=False)

    print(f"Reading channels from {channels_file}")
    channels = load_channel_entries(Path(channels_file))
    print(f"Found {len(channels)} channels")

    # Header and static feed metadata.
    config = [
        "# Feed Master Configuration",
        "# Auto-generated from channels.yml",
        "# Do not edit manually - regenerate using generate_feed_config_simple.py",
        "",
        "feeds:",
        "  youtube-unified:",
        "    title: YouTube Unified Feed",
        "    description: Aggregated feed from all YouTube channels",
        "    link: https://youtube.com",
        '    language: "en-us"',
        "    sources:",
    ]

    processed = 0
    skipped = 0
    for channel in channels:
        if not channel.get("rss_enabled", True):
            skipped += 1
            continue
        bridge_url = build_rss_bridge_url(channel, rss_bridge_host=rss_bridge_host)
        if not bridge_url:
            skipped += 1
            continue
        # `or` also covers an explicit `name: null` entry.
        name = channel.get("name") or "Unknown"
        config.append(f"      - name: {yaml_scalar(name)}")
        config.append(f"        url: {yaml_scalar(bridge_url)}")
        processed += 1

    # System configuration.
    config.extend(
        [
            "",
            "system:",
            "  update: 5m",
            "  max_per_feed: 5",
            "  max_total: 200",
            "  max_keep: 1000",
            "  base_url: http://localhost:8097",
        ]
    )

    print(f"\nProcessed {processed} channels, skipped {skipped}")
    # Explicit UTF-8 so non-ASCII channel names do not depend on the locale;
    # the file ends with a newline.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(config) + "\n")

    print(f"Configuration written to {output_file}")
if __name__ == "__main__":
    # Defaults sit next to this script; argv[1] and argv[2] override the
    # channels file and the output file respectively.
    script_dir = Path(__file__).parent
    cli_args = sys.argv[1:]
    channels_file = (
        Path(cli_args[0]) if len(cli_args) >= 1 else script_dir / "channels.yml"
    )
    output_file = (
        Path(cli_args[1])
        if len(cli_args) >= 2
        else script_dir / "feed-master-config" / "fm.yml"
    )

    # Bail out with a usage hint when there is nothing to read.
    if not channels_file.exists():
        print(f"Error: {channels_file} not found", file=sys.stderr)
        print(f"\nUsage: {sys.argv[0]} [channels.yml] [output.yml]", file=sys.stderr)
        sys.exit(1)

    # Create the destination directory on demand before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    generate_fm_config(channels_file, output_file)

View File

@ -5,6 +5,8 @@ Routes:
GET / -> static HTML search page. GET / -> static HTML search page.
GET /graph -> static reference graph UI. GET /graph -> static reference graph UI.
GET /api/channels -> channels aggregation. GET /api/channels -> channels aggregation.
GET /api/channel-list -> canonical channel list + feed URL.
GET /channels.txt -> raw channel URLs list.
GET /api/search -> Elasticsearch keyword search. GET /api/search -> Elasticsearch keyword search.
GET /api/graph -> reference graph API. GET /api/graph -> reference graph API.
GET /api/transcript -> transcript JSON payload. GET /api/transcript -> transcript JSON payload.
@ -17,6 +19,8 @@ import json
import logging import logging
import os import os
import re import re
import urllib.parse
import xml.etree.ElementTree as ET
from pathlib import Path from pathlib import Path
from typing import Any, Deque, Dict, Iterable, List, Optional, Sequence, Set, Tuple from typing import Any, Deque, Dict, Iterable, List, Optional, Sequence, Set, Tuple
@ -25,9 +29,11 @@ from datetime import datetime
from threading import Lock from threading import Lock
from time import monotonic from time import monotonic
import requests
from flask import Flask, jsonify, request, send_from_directory from flask import Flask, jsonify, request, send_from_directory
from .config import CONFIG, AppConfig from .config import CONFIG, AppConfig
from .channel_config import load_channel_entries
try: try:
from elasticsearch import Elasticsearch # type: ignore from elasticsearch import Elasticsearch # type: ignore
@ -45,6 +51,10 @@ DEFAULT_ELASTIC_TIMEOUT = int(os.environ.get("ELASTIC_TIMEOUT_SECONDS", "30"))
_RATE_LIMIT_BUCKETS: Dict[str, Deque[float]] = defaultdict(deque) _RATE_LIMIT_BUCKETS: Dict[str, Deque[float]] = defaultdict(deque)
_RATE_LIMIT_LOCK = Lock() _RATE_LIMIT_LOCK = Lock()
# Resolved display names keyed by video id; each value is
# (name, monotonic() timestamp taken when the name was resolved).
_RSS_AUTHOR_CACHE: Dict[str, Tuple[str, float]] = {}
# Held by _lookup_channel_names while it reads or writes _RSS_AUTHOR_CACHE.
_RSS_AUTHOR_LOCK = Lock()
# Cache entries this many seconds old (24 hours) or older are looked up again.
_RSS_AUTHOR_TTL_SECONDS = 60 * 60 * 24
# Maximum number of YouTube oEmbed requests issued per lookup call.
_RSS_OEMBED_LIMIT = 12
def _client_rate_key() -> str: def _client_rate_key() -> str:
@ -101,6 +111,192 @@ def _ensure_client(config: AppConfig) -> "Elasticsearch":
return Elasticsearch(config.elastic.url, **kwargs) return Elasticsearch(config.elastic.url, **kwargs)
def _extract_video_id(url: str) -> Optional[str]:
    """Return the YouTube video id embedded in *url*, or None.

    Recognised forms:
      * ``https://youtu.be/<id>``
      * ``https://<sub>.youtube.com/watch?v=<id>``
      * ``https://<sub>.youtube.com/shorts/<id>``

    Only the first path segment after the marker is treated as the id, so
    trailing slashes or extra path segments never leak into the result.
    Any other URL, an empty value, or an unparseable value yields None.
    """
    if not url:
        return None
    try:
        parsed = urllib.parse.urlparse(url.strip())
        # `hostname` drops any port/userinfo that `netloc` would keep.
        host = (parsed.hostname or "").lower()
    except (AttributeError, ValueError):
        # Non-string input or a malformed authority (e.g. bad IPv6 literal).
        return None

    segments = [segment for segment in (parsed.path or "").split("/") if segment]

    if host in {"youtu.be", "www.youtu.be"}:
        return segments[0] if segments else None

    # Exact domain or a real subdomain only; a plain suffix test would also
    # accept look-alike hosts such as "notyoutube.com".
    if host == "youtube.com" or host.endswith(".youtube.com"):
        if segments == ["watch"]:
            params = urllib.parse.parse_qs(parsed.query)
            return (params.get("v") or [None])[0]
        if len(segments) >= 2 and segments[0] == "shorts":
            return segments[1]
    return None
def _lookup_channel_names(
    client: "Elasticsearch",
    index: str,
    video_ids: Iterable[str],
) -> Dict[str, str]:
    """Resolve a display name for each video id in *video_ids*.

    Sources are consulted in order: the in-process cache (entries younger
    than ``_RSS_AUTHOR_TTL_SECONDS``), an Elasticsearch ``mget`` on *index*
    (``channel_name``, falling back to ``channel_id``), and finally YouTube
    oEmbed (``author_name``) for at most ``_RSS_OEMBED_LIMIT`` of the ids
    still unresolved. Names found by the last two sources are cached.

    Returns:
        Mapping of video id to resolved name; ids that could not be
        resolved are absent.
    """
    wanted = [video_id for video_id in video_ids if video_id]
    if not wanted:
        return {}

    now = monotonic()
    resolved: Dict[str, str] = {}
    uncached = []
    from_cache = 0
    from_elastic = 0

    # Pass 1: serve whatever is still fresh in the cache.
    with _RSS_AUTHOR_LOCK:
        for video_id in wanted:
            entry = _RSS_AUTHOR_CACHE.get(video_id)
            if not entry or (now - entry[1]) >= _RSS_AUTHOR_TTL_SECONDS:
                uncached.append(video_id)
                continue
            resolved[video_id] = entry[0]
            from_cache += 1

    # Pass 2: ask Elasticsearch for everything the cache could not answer.
    if uncached:
        try:
            es_result = client.mget(index=index, body={"ids": uncached})
        except Exception as exc:  # pragma: no cover - elasticsearch handles errors
            LOGGER.warning("RSS title lookup failed: %s", exc)
            es_result = {}
        for hit in es_result.get("docs", []):
            if not hit.get("found"):
                continue
            fields = hit.get("_source") or {}
            label = fields.get("channel_name") or fields.get("channel_id")
            if not label:
                continue
            doc_id = hit.get("_id", "")
            resolved[doc_id] = str(label)
            from_elastic += 1
            with _RSS_AUTHOR_LOCK:
                _RSS_AUTHOR_CACHE[doc_id] = (resolved[doc_id], now)

    # Pass 3: bounded number of oEmbed requests for the leftovers.
    unresolved = [video_id for video_id in uncached if video_id not in resolved]
    from_oembed = 0
    oembed_tried = 0
    for video_id in unresolved[:_RSS_OEMBED_LIMIT]:
        oembed_tried += 1
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        lookup_url = (
            "https://www.youtube.com/oembed?format=json&url="
            + urllib.parse.quote(watch_url, safe="")
        )
        try:
            reply = requests.get(lookup_url, timeout=10)
            if reply.status_code != 200:
                continue
            info = reply.json()
        except Exception:
            # Best effort: a failed request or bad JSON leaves the id unresolved.
            continue
        author = info.get("author_name")
        if not author:
            continue
        resolved[video_id] = str(author)
        from_oembed += 1
        with _RSS_AUTHOR_LOCK:
            _RSS_AUTHOR_CACHE[video_id] = (resolved[video_id], now)

    still_missing = max(len(wanted) - from_cache - from_elastic - from_oembed, 0)
    if still_missing or oembed_tried:
        LOGGER.info(
            "RSS title lookup: total=%d cached=%d elastic=%d oembed=%d missing=%d",
            len(wanted),
            from_cache,
            from_elastic,
            from_oembed,
            still_missing,
        )
    else:
        LOGGER.debug(
            "RSS title lookup: total=%d cached=%d elastic=%d",
            len(wanted),
            from_cache,
            from_elastic,
        )
    return resolved
def _rewrite_rss_payload(
    content: bytes,
    client: "Elasticsearch",
    index: str,
    feed_name: str,
) -> bytes:
    """Clean up an RSS document and prefix item titles with channel names.

    Items whose title contains "Bridge returned error" are removed. For the
    remaining items a video id is taken from ``<link>`` (falling back to
    ``<guid>``), names are resolved through ``_lookup_channel_names``, and
    each resolved title becomes ``"<channel name> - <title>"`` unless it
    already starts with that prefix.

    Returns:
        The re-serialised XML, or *content* unchanged when it is not
        parseable XML or has no ``<channel>`` element.
    """

    def text_of(element) -> str:
        # Text of an optional element, never None.
        return (element.text or "") if element is not None else ""

    def video_id_for(entry) -> Optional[str]:
        # Prefer the link; fall back to the guid when the link has no id.
        return _extract_video_id(text_of(entry.find("link"))) or _extract_video_id(
            text_of(entry.find("guid"))
        )

    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        LOGGER.warning("RSS rewrite skipped (invalid XML) for %s", feed_name)
        return content

    channel = root.find("channel")
    if channel is None:
        LOGGER.warning("RSS rewrite skipped (missing channel) for %s", feed_name)
        return content

    entries = channel.findall("item")
    total_items = len(entries)
    removed_errors = 0
    video_ids: Set[str] = set()

    # First pass: drop bridge error items and collect the ids to resolve.
    for entry in entries:
        heading = entry.find("title")
        heading_text = (heading.text or "").strip() if heading is not None else ""
        if "Bridge returned error" in heading_text:
            channel.remove(entry)
            removed_errors += 1
            continue
        found_id = video_id_for(entry)
        if found_id:
            video_ids.add(found_id)

    channel_name_map = _lookup_channel_names(client, index, video_ids)
    if not channel_name_map:
        LOGGER.info(
            "RSS rewrite: feed=%s items=%d removed_errors=%d resolved=0",
            feed_name,
            total_items,
            removed_errors,
        )
        return ET.tostring(root, encoding="utf-8", xml_declaration=True)

    # Second pass: prefix the titles of the surviving items.
    prefixed = 0
    for entry in channel.findall("item"):
        heading = entry.find("title")
        if heading is None or not heading.text:
            continue
        found_id = video_id_for(entry)
        if not found_id:
            continue
        channel_name = channel_name_map.get(found_id)
        if not channel_name:
            continue
        prefix = f"{channel_name} - "
        if heading.text.startswith(prefix):
            continue
        heading.text = prefix + heading.text
        prefixed += 1

    LOGGER.info(
        "RSS rewrite: feed=%s items=%d removed_errors=%d prefixed=%d resolved=%d",
        feed_name,
        total_items,
        removed_errors,
        prefixed,
        len(channel_name_map),
    )
    return ET.tostring(root, encoding="utf-8", xml_declaration=True)
def metrics_payload(data_root: Path, include_external: bool = True) -> Dict[str, Any]: def metrics_payload(data_root: Path, include_external: bool = True) -> Dict[str, Any]:
total_items = 0 total_items = 0
channel_counter: Counter = Counter() channel_counter: Counter = Counter()
@ -1077,6 +1273,72 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
data.sort(key=lambda item: item["Name"].lower()) data.sort(key=lambda item: item["Name"].lower())
return jsonify(data) return jsonify(data)
@app.route("/api/channel-list")
def channel_list():
    """Return the channel entries, the RSS feed URL and the source path as JSON.

    Load failures are reported through an ``error`` key in the payload
    (with an empty ``channels`` list) rather than through the HTTP status.
    """
    channels = []
    error_code = None
    try:
        channels = load_channel_entries(config.channels_path)
    except FileNotFoundError:
        LOGGER.warning("Channel list not found: %s", config.channels_path)
        error_code = "channels_not_found"
    except Exception as exc:
        LOGGER.exception("Failed to load channel list: %s", exc)
        error_code = "channels_load_failed"

    payload = {
        "channels": channels,
        "rss_feed_url": config.rss_feed_url,
        "source": str(config.channels_path),
    }
    if error_code is not None:
        payload["error"] = error_code
    return jsonify(payload)
@app.route("/channels.txt")
def channel_urls():
    """Serve the channel URLs as plain text, one URL per line.

    Responds with a JSON error and status 404 when the channel file is
    missing, or status 500 when it cannot be loaded.
    """
    try:
        entries = load_channel_entries(config.channels_path)
    except FileNotFoundError:
        LOGGER.warning("Channel list not found: %s", config.channels_path)
        return jsonify({"error": "channels_not_found"}), 404
    except Exception as exc:
        LOGGER.exception("Failed to load channel list: %s", exc)
        return jsonify({"error": "channels_load_failed"}), 500

    # Entries without a URL are left out; every emitted line is
    # newline-terminated, so an empty list produces an empty body.
    lines = []
    for entry in entries:
        if entry.get("url"):
            lines.append(entry["url"])
    body = "".join(line + "\n" for line in lines)
    return (body, 200, {"Content-Type": "text/plain; charset=utf-8"})
def _rss_target(feed_name: str) -> str:
    """Build the upstream feed URL for *feed_name*.

    *feed_name* comes straight from the request path, so it is sanitised
    before being placed in the upstream URL: empty, "." and ".." segments
    are dropped (no climbing out of ``/rss/``), and the remainder is
    percent-encoded so decoded characters such as "?" or "#" cannot start
    a query string or fragment on the upstream request. An empty name
    falls back to "youtube-unified".
    """
    segments = [
        segment
        for segment in (feed_name or "").split("/")
        if segment not in ("", ".", "..")
    ]
    name = "/".join(segments) or "youtube-unified"
    # "/" stays literal so multi-segment names keep their shape.
    safe_name = urllib.parse.quote(name, safe="/")
    return f"{config.rss_feed_upstream.rstrip('/')}/rss/{safe_name}"
@app.route("/rss")
@app.route("/rss/<path:feed_name>")
def rss_proxy(feed_name: str = ""):
    """Fetch the upstream feed, rewrite it, and relay it to the client.

    The upstream status code is passed through together with the
    Content-Type and, when present, the Cache-Control, ETag and
    Last-Modified headers. A failed upstream request yields a JSON error
    with status 502.
    """
    target = _rss_target(feed_name)
    try:
        upstream = requests.get(target, timeout=30)
    except requests.RequestException as exc:
        LOGGER.warning("RSS upstream error for %s: %s", target, exc)
        return jsonify({"error": "rss_unavailable"}), 502

    body = _rewrite_rss_payload(upstream.content, client, index, feed_name or "rss")

    response_headers = {
        "Content-Type": upstream.headers.get(
            "Content-Type", "application/xml; charset=UTF-8"
        )
    }
    # Relay caching/validation headers only when upstream supplied them.
    for header_name in ("Cache-Control", "ETag", "Last-Modified"):
        header_value = upstream.headers.get(header_name)
        if header_value:
            response_headers[header_name] = header_value
    return (body, upstream.status_code, response_headers)
@app.route("/api/graph") @app.route("/api/graph")
def graph_api(): def graph_api():
video_id = (request.args.get("video_id") or "").strip() video_id = (request.args.get("video_id") or "").strip()

View File

@ -45,6 +45,10 @@
const aboutBtn = document.getElementById("aboutBtn"); const aboutBtn = document.getElementById("aboutBtn");
const aboutPanel = document.getElementById("aboutPanel"); const aboutPanel = document.getElementById("aboutPanel");
const aboutCloseBtn = document.getElementById("aboutCloseBtn"); const aboutCloseBtn = document.getElementById("aboutCloseBtn");
const rssButton = document.getElementById("rssButton");
const rssFeedLink = document.getElementById("rssFeedLink");
const channelListLink = document.getElementById("channelListLink");
const channelCount = document.getElementById("channelCount");
const resultsDiv = document.getElementById("results"); const resultsDiv = document.getElementById("results");
const metaDiv = document.getElementById("meta"); const metaDiv = document.getElementById("meta");
const metricsContainer = document.getElementById("metrics"); const metricsContainer = document.getElementById("metrics");
@ -406,6 +410,57 @@
} }
} }
// Populates the RSS button, the about-panel feed link and the channel count
// from /api/channel-list; on any failure the RSS controls are disabled.
async function loadChannelListInfo() {
  // Nothing on the page to populate.
  if (!(rssFeedLink || channelListLink || channelCount)) return;

  const markFeedLinkUnavailable = () => {
    rssFeedLink.textContent = "Unavailable";
    rssFeedLink.removeAttribute("href");
  };
  const disableRssButton = () => {
    rssButton.removeAttribute("href");
    rssButton.classList.add("is-disabled");
    rssButton.setAttribute("aria-disabled", "true");
  };

  try {
    const response = await fetch("/api/channel-list");
    const info = await response.json();
    const feedUrl = info.rss_feed_url || "";

    if (rssFeedLink) {
      if (!feedUrl) {
        markFeedLinkUnavailable();
      } else {
        rssFeedLink.href = feedUrl;
        rssFeedLink.textContent = feedUrl;
      }
    }

    if (rssButton) {
      if (!feedUrl) {
        disableRssButton();
      } else {
        rssButton.href = feedUrl;
        rssButton.classList.remove("is-disabled");
        rssButton.removeAttribute("aria-disabled");
      }
    }

    if (channelCount) {
      const total = Array.isArray(info.channels) ? info.channels.length : 0;
      channelCount.textContent = total ? `${total} channels` : "No channels loaded";
    }

    if (channelListLink && info.error) {
      channelListLink.textContent = "Channel list unavailable";
    }
  } catch (err) {
    console.error("Failed to load channel list", err);
    if (rssFeedLink) markFeedLinkUnavailable();
    if (rssButton) disableRssButton();
    if (channelCount) {
      channelCount.textContent = "Channel list unavailable";
    }
  }
}
function updateUrl(q) { function updateUrl(q) {
const next = new URL(window.location.href); const next = new URL(window.location.href);
if (q) { if (q) {
@ -1732,6 +1787,7 @@ window.addEventListener("popstate", () => {
setFromQuery(); setFromQuery();
loadMetrics(); loadMetrics();
loadYears(); loadYears();
loadChannelListInfo();
loadChannels().then(() => runSearch(currentPage)); loadChannels().then(() => runSearch(currentPage));
})(); })();

View File

@ -21,6 +21,22 @@
</div> </div>
</div> </div>
<div class="window-body"> <div class="window-body">
<!-- Action row holding the unified RSS feed button. The href defaults to /rss;
     loadChannelListInfo() replaces it with the URL reported by
     /api/channel-list, or removes it and adds "is-disabled" when none is
     available. -->
<div class="window-actions">
<a
id="rssButton"
class="rss-button"
href="/rss"
target="_blank"
rel="noopener"
title="Unified RSS feed"
aria-label="Unified RSS feed"
>
<svg class="rss-button__icon" viewBox="0 0 24 24" aria-hidden="true">
<path d="M6 18a2 2 0 1 0 0 4a2 2 0 0 0 0-4zm-4 6a4 4 0 0 1 4-4a4 4 0 0 1 4 4h-2a2 2 0 0 0-2-2a2 2 0 0 0-2 2zm0-8v-2c6.627 0 12 5.373 12 12h-2c0-5.523-4.477-10-10-10zm0-4V4c11.046 0 20 8.954 20 20h-2c0-9.941-8.059-18-18-18z"/>
</svg>
<span class="rss-button__label">RSS</span>
</a>
</div>
<p>Enter a phrase to query title, description, and transcript text.</p> <p>Enter a phrase to query title, description, and transcript text.</p>
<fieldset> <fieldset>
@ -129,6 +145,15 @@
<p>Use the toggles to choose exact, fuzzy, or phrase matching. Query string mode accepts raw Lucene syntax.</p> <p>Use the toggles to choose exact, fuzzy, or phrase matching. Query string mode accepts raw Lucene syntax.</p>
<p>Results are ranked by your chosen sort order; the timeline summarizes the same query.</p> <p>Results are ranked by your chosen sort order; the timeline summarizes the same query.</p>
<p>You can download transcripts, copy MLA citations, or explore references via the graph button.</p> <p>You can download transcripts, copy MLA citations, or explore references via the graph button.</p>
<!-- About-panel entries filled in by loadChannelListInfo(): the feed link's
     text and href, the channel count, and (on error) the channel list link
     text. -->
<div class="about-panel__section">
<div class="about-panel__label">Unified RSS feed</div>
<a id="rssFeedLink" href="#" target="_blank" rel="noopener">Loading…</a>
</div>
<div class="about-panel__section">
<div class="about-panel__label">Channel list</div>
<a id="channelListLink" href="/api/channel-list" target="_blank" rel="noopener">View JSON</a>
<div id="channelCount" class="about-panel__meta"></div>
</div>
</div> </div>
</div> </div>

View File

@ -510,6 +510,22 @@ body.modal-open {
color: #000; color: #000;
} }
/* About panel: one section per entry, separated from the content above by a
   thin top rule. */
.about-panel__section {
margin-top: 8px;
padding-top: 6px;
border-top: 1px solid #c0c0c0;
}
/* Bold heading of an about-panel section. */
.about-panel__label {
font-weight: bold;
margin-bottom: 2px;
}
/* Small, muted secondary text inside a section (used for the channel count). */
.about-panel__meta {
font-size: 10px;
color: #555;
}
.about-panel__header button { .about-panel__header button {
border: none; border: none;
background: transparent; background: transparent;
@ -549,6 +565,50 @@ body.modal-open {
box-sizing: border-box; box-sizing: border-box;
} }
/* Row that right-aligns the action buttons at the top of the window body. */
.window-actions {
display: flex;
justify-content: flex-end;
margin-bottom: 6px;
}
/* RSS link styled as a button using system button colors. */
.rss-button {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 2px 6px;
border: 1px solid;
border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight;
background: ButtonFace;
color: #000;
text-decoration: none;
font-size: 11px;
cursor: pointer;
}
.rss-button:hover {
background: #f3f3f3;
}
/* Highlight and shadow edges are swapped while the button is pressed. */
.rss-button:active {
border-color: ButtonShadow ButtonHighlight ButtonHighlight ButtonShadow;
}
/* Class toggled by loadChannelListInfo() when no feed URL is available;
   pointer-events: none makes the link unclickable. */
.rss-button.is-disabled {
opacity: 0.5;
cursor: default;
pointer-events: none;
}
/* Inline SVG feed icon. */
.rss-button__icon {
width: 14px;
height: 14px;
fill: #f38b00;
}
.rss-button__label {
font-weight: bold;
}
/* Badges */ /* Badges */
.badge-row { .badge-row {
margin-top: 6px; margin-top: 6px;