commit fcdc6ecb9bd962e6aa0366c22e68279a2f255781 Author: knight Date: Sun Nov 2 01:14:36 2025 -0400 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ac732eb --- /dev/null +++ b/.gitignore @@ -0,0 +1,60 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +.venv/ +venv/ +ENV/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Environment variables +.env +.env.local + +# Elasticsearch data +data/ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# mypy +.mypy_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..126e74b --- /dev/null +++ b/README.md @@ -0,0 +1,87 @@ +# Python Search Toolkit (Rough Draft) + +This minimal Python implementation covers three core needs: + +1. **Collect transcripts** from YouTube channels. +2. **Ingest transcripts/metadata** into Elasticsearch. +3. **Expose a simple Flask search UI** that queries Elasticsearch directly. + +The code lives alongside the existing C# stack so you can experiment without +touching production infrastructure. + +## Setup + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r python_app/requirements.txt +``` + +Configure your environment as needed: + +```bash +export ELASTIC_URL=http://localhost:9200 +export ELASTIC_INDEX=this_little_corner_py +export ELASTIC_USERNAME=elastic # optional +export ELASTIC_PASSWORD=secret # optional +export ELASTIC_API_KEY=XXXX # optional alternative auth +export ELASTIC_CA_CERT=/path/to/ca.pem # optional, for self-signed TLS +export ELASTIC_VERIFY_CERTS=1 # set to 0 to skip verification (dev only) +export ELASTIC_DEBUG=0 # set to 1 for verbose request/response logging +export LOCAL_DATA_DIR=./data/video_metadata # defaults to this +export YOUTUBE_API_KEY=AIza... # required for live collection +``` + +## 1. Collect Transcripts + +```bash +python -m python_app.transcript_collector \ + --channel UCxxxx \ + --output data/raw \ + --max-pages 2 +``` + +Each video becomes a JSON file containing metadata plus transcript segments +(`TranscriptSegment`). Downloads require both `google-api-python-client` and +`youtube-transcript-api`, as well as a valid `YOUTUBE_API_KEY`. + +> Already have cached JSON? You can skip this step and move straight to ingesting. + +## 2. Ingest Into Elasticsearch + +```bash +python -m python_app.ingest \ + --source data/video_metadata \ + --index this_little_corner_py +``` + +The script walks the source directory, builds `bulk` requests, and creates the +index with a lightweight mapping when needed. Authentication is handled via +`ELASTIC_USERNAME` / `ELASTIC_PASSWORD` if set. + +## 3. Serve the Search Frontend + +```bash +python -m python_app.search_app +``` + +Visit and you’ll see a barebones UI that: + +- Lists channels via a terms aggregation. +- Queries titles/descriptions/transcripts with toggleable exact, fuzzy, and phrase clauses plus optional date sorting. +- Surfaces transcript highlights. +- Lets you pull the full transcript for any result on demand. +- Shows a stacked-by-channel timeline for each search query (with `/frequency` offering a standalone explorer) powered by D3.js. +- Supports a query-string mode toggle so you can write advanced Lucene queries (e.g. 
`meaning OR purpose`, `meaning~2` for fuzzy matches, `title:(meaning crisis)`), while the default toggles stay AND-backed. + +## Integration Notes + +- All modules share configuration through `python_app.config.CONFIG`, so you can + fine-tune paths or credentials centrally. +- The ingest flow reuses existing JSON schema from `data/video_metadata`, so no + re-download is necessary if you already have the dumps. +- Everything is intentionally simple (no Celery, task queues, or custom auth) to + keep the draft approachable and easy to extend. + +Feel free to expand on this scaffold—add proper logging, schedule transcript +updates, or flesh out the UI—once you’re happy with the baseline behaviour. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..3358229 --- /dev/null +++ b/__init__.py @@ -0,0 +1,11 @@ +""" +Minimal Python toolkit for collecting YouTube transcripts, ingesting them into +Elasticsearch, and serving a lightweight search API/front-end. + +Modules: + config: shared configuration helpers (Elastic endpoint, data paths, etc.). + transcript_collector: fetches channel metadata and transcripts. + ingest: pushes transcript JSON into Elasticsearch. + search_app: Flask app exposing simple search and transcript endpoints. +""" + diff --git a/config.py b/config.py new file mode 100644 index 0000000..9e7eb26 --- /dev/null +++ b/config.py @@ -0,0 +1,81 @@ +""" +Centralised configuration helpers for the Python search toolkit. + +Environment Variables: + ELASTIC_URL: Base URL to the Elasticsearch node (default: http://localhost:9200). + ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials. + ELASTIC_INDEX: Target index name (default: this_little_corner_py). + LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata). + YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube. 
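+    ELASTIC_API_KEY: Optional API key (alternative to basic auth).
+    ELASTIC_CA_CERT: Optional CA certificate path for self-signed TLS.
+    ELASTIC_VERIFY_CERTS: Set to 0 to skip TLS verification (default: 1).
+    ELASTIC_DEBUG: Set to 1 for verbose request/response logging (default: 0).
+
+Usage:
+    from python_app.config import CONFIG
+    print(CONFIG.elastic.url)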
+""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass(frozen=True) +class ElasticSettings: + url: str + username: Optional[str] + password: Optional[str] + index: str + ca_cert: Optional[Path] + verify_certs: bool + api_key: Optional[str] + debug: bool + + +@dataclass(frozen=True) +class DataSettings: + root: Path + + +@dataclass(frozen=True) +class YoutubeSettings: + api_key: Optional[str] + + +@dataclass(frozen=True) +class AppConfig: + elastic: ElasticSettings + data: DataSettings + youtube: YoutubeSettings + + +def _env(name: str, default: Optional[str] = None) -> Optional[str]: + """Return an environment variable value with optional default.""" + value = os.environ.get(name) + if value is None: + return default + stripped = value.strip() + return stripped or default + + +def load_config() -> AppConfig: + """Collect configuration from environment variables.""" + elastic = ElasticSettings( + url=_env("ELASTIC_URL", "http://localhost:9200"), + username=_env("ELASTIC_USERNAME"), + password=_env("ELASTIC_PASSWORD"), + index=_env("ELASTIC_INDEX", "this_little_corner_py"), + ca_cert=Path(_env("ELASTIC_CA_CERT")).expanduser() if _env("ELASTIC_CA_CERT") else None, + verify_certs=_env("ELASTIC_VERIFY_CERTS", "1") not in {"0", "false", "False"}, + api_key=_env("ELASTIC_API_KEY"), + debug=_env("ELASTIC_DEBUG", "0") in {"1", "true", "True"}, + ) + data_root = Path( + _env( + "LOCAL_DATA_DIR", + Path(__file__).resolve().parents[1] / "data" / "video_metadata", + ) + ) + data = DataSettings(root=data_root) + youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY")) + return AppConfig(elastic=elastic, data=data, youtube=youtube) + + +CONFIG = load_config() diff --git a/ingest.py b/ingest.py new file mode 100644 index 0000000..ec61478 --- /dev/null +++ b/ingest.py @@ -0,0 +1,193 @@ +""" +Utilities for indexing transcript JSON documents into Elasticsearch. + +Usage: + python -m python_app.ingest --source data/video_metadata --index corner +""" + +from __future__ import annotations + +import argparse +import json +import logging +from pathlib import Path +from typing import Dict, Iterable, Iterator, Optional + +from .config import CONFIG, AppConfig + +try: + from elasticsearch import Elasticsearch, helpers # type: ignore +except ImportError: # pragma: no cover - dependency optional + Elasticsearch = None + helpers = None + + +LOGGER = logging.getLogger(__name__) + + +def _ensure_client(config: AppConfig) -> "Elasticsearch": + if Elasticsearch is None: + raise RuntimeError( + "elasticsearch package not installed. " + "Install elasticsearch>=7 to index documents." 
+ ) + kwargs = {} + if config.elastic.api_key: + kwargs["api_key"] = config.elastic.api_key + elif config.elastic.username and config.elastic.password: + kwargs["basic_auth"] = ( + config.elastic.username, + config.elastic.password, + ) + if config.elastic.ca_cert: + kwargs["ca_certs"] = str(config.elastic.ca_cert) + kwargs["verify_certs"] = config.elastic.verify_certs + return Elasticsearch(config.elastic.url, **kwargs) + + +def iter_json_documents(data_root: Path) -> Iterator[Dict]: + """Yield JSON objects from the provided directory tree.""" + if not data_root.exists(): + raise FileNotFoundError(f"{data_root} does not exist") + for path in sorted(data_root.rglob("*.json")): + try: + with path.open("r", encoding="utf-8") as handle: + doc = json.load(handle) + doc.setdefault("video_id", path.stem) + yield doc + except Exception as exc: + LOGGER.warning("Skipping %s: %s", path, exc) + + +def build_bulk_actions( + docs: Iterable[Dict], *, index: Optional[str] = None +) -> Iterator[Dict]: + """Translate raw JSON dictionaries into Elasticsearch bulk actions.""" + for doc in docs: + video_id = doc.get("video_id") + if not video_id: + continue + parts = doc.get("transcript_parts") or doc.get("transcript") or [] + transcript_full = doc.get("transcript_full") + if not transcript_full and isinstance(parts, list): + transcript_full = " ".join( + segment.get("text", "") if isinstance(segment, dict) else str(segment) + for segment in parts + ).strip() + yield { + "_id": video_id, + "_index": index or CONFIG.elastic.index, + "_op_type": "index", + "_source": { + "video_id": video_id, + "channel_id": doc.get("channel_id"), + "channel_name": doc.get("channel_name"), + "title": doc.get("title"), + "description": doc.get("description"), + "date": doc.get("date") or doc.get("published_at"), + "url": doc.get("url"), + "duration": doc.get("duration"), + "transcript_full": transcript_full, + "transcript_secondary_full": doc.get("transcript_secondary_full"), + "transcript_parts": parts, + }, + } + + +def ensure_index(client: "Elasticsearch", index: str) -> None: + """Create the target index with a minimal mapping if it is missing.""" + if client.indices.exists(index=index): + return + LOGGER.info("Creating index %s", index) + client.indices.create( + index=index, + mappings={ + "properties": { + "video_id": {"type": "keyword"}, + "channel_id": {"type": "keyword"}, + "channel_name": {"type": "keyword"}, + "title": {"type": "text"}, + "description": {"type": "text"}, + "date": {"type": "date", "format": "strict_date_optional_time"}, + "url": {"type": "keyword"}, + "duration": {"type": "float"}, + "transcript_full": {"type": "text"}, + "transcript_secondary_full": {"type": "text"}, + "transcript_parts": { + "type": "nested", + "properties": { + "start": {"type": "float"}, + "duration": {"type": "float"}, + "text": {"type": "text"}, + }, + }, + } + }, + ) + + +def ingest_directory( + data_root: Path, + *, + config: AppConfig = CONFIG, + index: Optional[str] = None, + batch_size: int = 500, + request_timeout: int = 120, +) -> None: + """Bulk index every JSON file in the directory tree.""" + client = _ensure_client(config) + target_index = index or config.elastic.index + ensure_index(client, target_index) + docs = iter_json_documents(data_root) + actions = build_bulk_actions(docs, index=target_index) + bulk_client = client.options(request_timeout=request_timeout) + helpers.bulk( + bulk_client, + actions, + chunk_size=batch_size, + ) + LOGGER.info("Ingestion complete for %s", target_index) + + +def _build_parser() -> 
argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Ingest transcript JSON files into Elasticsearch." + ) + parser.add_argument( + "--source", + type=Path, + default=CONFIG.data.root, + help="Directory containing per-video JSON files.", + ) + parser.add_argument( + "--index", + help="Override the Elasticsearch index name.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=500, + help="Bulk ingest batch size.", + ) + parser.add_argument( + "--timeout", + type=int, + default=120, + help="Request timeout (seconds) for bulk operations.", + ) + return parser + + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") + args = _build_parser().parse_args() + ingest_directory( + args.source, + index=args.index, + batch_size=args.batch_size, + request_timeout=args.timeout, + ) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c8bcc37 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +Flask>=2.3 +elasticsearch>=7.0.0,<9.0.0 +youtube-transcript-api>=0.6 +google-api-python-client>=2.0.0 diff --git a/search_app.py b/search_app.py new file mode 100644 index 0000000..5a88482 --- /dev/null +++ b/search_app.py @@ -0,0 +1,910 @@ +""" +Flask application exposing a minimal search API backed by Elasticsearch. + +Routes: + GET / -> Static HTML search page. + GET /api/channels -> List available channels (via terms aggregation). + GET /api/search -> Search index with pagination and simple highlighting. + GET /api/transcript -> Return full transcript for a given video_id. +""" + +from __future__ import annotations + +import copy +import json +import logging +import re +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set + +from collections import Counter +from datetime import datetime + +from flask import Flask, jsonify, request, send_from_directory + +from .config import CONFIG, AppConfig + +try: + from elasticsearch import Elasticsearch # type: ignore + from elasticsearch import BadRequestError # type: ignore +except ImportError: # pragma: no cover - dependency optional + Elasticsearch = None + BadRequestError = Exception # type: ignore + +LOGGER = logging.getLogger(__name__) + + +def _ensure_client(config: AppConfig) -> "Elasticsearch": + if Elasticsearch is None: + raise RuntimeError( + "elasticsearch package not installed. " + "Install elasticsearch>=7 to run the Flask search app." 
+ ) + kwargs = {} + if config.elastic.api_key: + kwargs["api_key"] = config.elastic.api_key + elif config.elastic.username and config.elastic.password: + kwargs["basic_auth"] = ( + config.elastic.username, + config.elastic.password, + ) + if config.elastic.ca_cert: + kwargs["ca_certs"] = str(config.elastic.ca_cert) + kwargs["verify_certs"] = config.elastic.verify_certs + return Elasticsearch(config.elastic.url, **kwargs) + + +def metrics_payload(data_root: Path) -> Dict[str, Any]: + total_items = 0 + channel_counter: Counter = Counter() + channel_name_map: Dict[str, str] = {} + year_counter: Counter = Counter() + month_counter: Counter = Counter() + + if not data_root.exists(): + LOGGER.warning("Data directory %s not found; metrics will be empty.", data_root) + return { + "totalItems": 0, + "totalChannels": 0, + "itemsPerChannel": [], + "yearHistogram": [], + "recentMonths": [], + } + + for path in data_root.rglob("*.json"): + try: + with path.open("r", encoding="utf-8") as handle: + doc = json.load(handle) + except Exception: + continue + + total_items += 1 + + channel_id = doc.get("channel_id") + channel_name = doc.get("channel_name") or channel_id + if channel_id: + channel_counter[channel_id] += 1 + if channel_name and channel_id not in channel_name_map: + channel_name_map[channel_id] = channel_name + + date_value = doc.get("date") or doc.get("published_at") + dt: Optional[datetime] = None + if isinstance(date_value, str): + for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"): + try: + dt = datetime.strptime(date_value[: len(fmt)], fmt) + break + except Exception: + continue + elif isinstance(date_value, (int, float)): + try: + dt = datetime.fromtimestamp(date_value) + except Exception: + dt = None + + if dt: + year_counter[str(dt.year)] += 1 + month_counter[dt.strftime("%Y-%m")] += 1 + + items_per_channel = [ + { + "label": channel_name_map.get(cid, cid), + "count": count, + } + for cid, count in channel_counter.most_common() + ] + + year_histogram = [ + {"bucket": year, "count": year_counter[year]} + for year in sorted(year_counter.keys()) + ] + + recent_months = sorted(month_counter.keys()) + recent_months = recent_months[-12:] + recent_months_payload = [ + {"bucket": month, "count": month_counter[month]} for month in recent_months + ] + + return { + "totalItems": total_items, + "totalChannels": len(channel_counter), + "itemsPerChannel": items_per_channel, + "yearHistogram": year_histogram, + "recentMonths": recent_months_payload, + } + + +def elastic_metrics_payload( + client: "Elasticsearch", + index: str, + *, + channel_field_candidates: Optional[List[str]] = None, + debug: bool = False, +) -> Dict[str, Any]: + if channel_field_candidates is None: + channel_field_candidates = ["channel_id.keyword", "channel_id"] + + base_body: Dict[str, Any] = { + "size": 0, + "track_total_hits": True, + "aggs": { + "channels": { + "terms": { + "field": "channel_id.keyword", + "size": 500, + "order": {"_count": "desc"}, + }, + "aggs": { + "name": { + "top_hits": { + "size": 1, + "_source": {"includes": ["channel_name"]}, + } + } + }, + }, + "year_histogram": { + "date_histogram": { + "field": "date", + "calendar_interval": "year", + "format": "yyyy", + } + }, + "month_histogram": { + "date_histogram": { + "field": "date", + "calendar_interval": "month", + "format": "yyyy-MM", + "order": {"_key": "asc"}, + } + }, + }, + } + + last_error: Optional[Exception] = None + response: Optional[Dict[str, Any]] = None + for candidate_field in channel_field_candidates: + body = 
json.loads(json.dumps(base_body)) + body["aggs"]["channels"]["terms"]["field"] = candidate_field + try: + if debug: + LOGGER.info( + "Elasticsearch metrics request: %s", + json.dumps({"index": index, "body": body}, indent=2), + ) + response = client.search(index=index, body=body) + break + except BadRequestError as exc: + last_error = exc + if debug: + LOGGER.warning( + "Metrics aggregation failed for field %s: %s", + candidate_field, + exc, + ) + if response is None: + raise last_error or RuntimeError("Unable to compute metrics from Elasticsearch.") + + hits = response.get("hits", {}) + total_items = hits.get("total", {}).get("value", 0) + + if debug: + LOGGER.info( + "Elasticsearch metrics response: %s", + json.dumps(response, indent=2, default=str), + ) + + aggregations = response.get("aggregations", {}) + channel_buckets = aggregations.get("channels", {}).get("buckets", []) + items_per_channel = [] + for bucket in channel_buckets: + key = bucket.get("key") + channel_name = key + top_hits = ( + bucket.get("name", {}) + .get("hits", {}) + .get("hits", []) + ) + if top_hits: + channel_name = ( + top_hits[0] + .get("_source", {}) + .get("channel_name", channel_name) + ) + items_per_channel.append( + {"label": channel_name or key, "count": bucket.get("doc_count", 0)} + ) + + year_buckets = aggregations.get("year_histogram", {}).get("buckets", []) + year_histogram = [ + { + "bucket": bucket.get("key_as_string") + or str(bucket.get("key")), + "count": bucket.get("doc_count", 0), + } + for bucket in year_buckets + ] + + month_buckets = aggregations.get("month_histogram", {}).get("buckets", []) + recent_months_entries = [ + { + "bucket": bucket.get("key_as_string") + or str(bucket.get("key")), + "count": bucket.get("doc_count", 0), + "_key": bucket.get("key"), + } + for bucket in month_buckets + ] + recent_months_entries.sort(key=lambda item: item.get("_key", 0)) + recent_months_payload = [ + {"bucket": entry["bucket"], "count": entry["count"]} + for entry in recent_months_entries[-12:] + ] + + return { + "totalItems": total_items, + "totalChannels": len(items_per_channel), + "itemsPerChannel": items_per_channel, + "yearHistogram": year_histogram, + "recentMonths": recent_months_payload, + } + + +def parse_channel_params(values: Iterable[Optional[str]]) -> List[str]: + seen: Set[str] = set() + channels: List[str] = [] + for value in values: + if not value: + continue + for part in str(value).split(","): + cleaned = part.strip() + if not cleaned or cleaned.lower() == "all": + continue + if cleaned not in seen: + seen.add(cleaned) + channels.append(cleaned) + return channels + + +def build_channel_filter(channels: Optional[Sequence[str]]) -> Optional[Dict]: + if not channels: + return None + per_channel_clauses: List[Dict[str, Any]] = [] + for value in channels: + if not value: + continue + per_channel_clauses.append( + { + "bool": { + "should": [ + {"term": {"channel_id.keyword": value}}, + {"term": {"channel_id": value}}, + ], + "minimum_should_match": 1, + } + } + ) + if not per_channel_clauses: + return None + if len(per_channel_clauses) == 1: + return per_channel_clauses[0] + return { + "bool": { + "should": per_channel_clauses, + "minimum_should_match": 1, + } + } + + +def build_query_payload( + query: str, + *, + channels: Optional[Sequence[str]] = None, + sort: str = "relevant", + use_exact: bool = True, + use_fuzzy: bool = True, + use_phrase: bool = True, + use_query_string: bool = False, +) -> Dict: + filters: List[Dict] = [] + should: List[Dict] = [] + + channel_filter = 
build_channel_filter(channels) + if channel_filter: + filters.append(channel_filter) + + if use_query_string: + base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"] + qs_query = (query or "").strip() or "*" + query_body: Dict[str, Any] = { + "query_string": { + "query": qs_query, + "default_operator": "AND", + "fields": base_fields, + } + } + if filters: + query_body = {"bool": {"must": query_body, "filter": filters}} + body: Dict = { + "query": query_body, + "highlight": { + "fields": { + "transcript_full": { + "fragment_size": 160, + "number_of_fragments": 5, + "fragmenter": "span", + }, + "transcript_secondary_full": { + "fragment_size": 160, + "number_of_fragments": 5, + "fragmenter": "span", + }, + "title": {"number_of_fragments": 0}, + "description": { + "fragment_size": 160, + "number_of_fragments": 1, + }, + }, + "require_field_match": False, + "pre_tags": [""], + "post_tags": [""], + "encoder": "html", + "max_analyzed_offset": 900000, + }, + } + if sort == "newer": + body["sort"] = [{"date": {"order": "desc"}}] + elif sort == "older": + body["sort"] = [{"date": {"order": "asc"}}] + return body + + if query: + base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"] + if use_phrase: + should.append( + { + "match_phrase": { + "transcript_full": { + "query": query, + "slop": 2, + "boost": 10.0, + } + } + } + ) + should.append( + { + "match_phrase": { + "transcript_secondary_full": { + "query": query, + "slop": 2, + "boost": 10.0, + } + } + } + ) + if use_fuzzy: + should.append( + { + "multi_match": { + "query": query, + "fields": base_fields, + "type": "best_fields", + "operator": "and", + "fuzziness": "AUTO", + "prefix_length": 1, + "max_expansions": 50, + "boost": 1.5, + } + } + ) + if use_exact: + should.append( + { + "multi_match": { + "query": query, + "fields": base_fields, + "type": "best_fields", + "operator": "and", + "boost": 3.0, + } + } + ) + + if should: + query_body: Dict = { + "bool": { + "should": should, + "minimum_should_match": 1, + } + } + if filters: + query_body["bool"]["filter"] = filters + elif filters: + query_body = {"bool": {"filter": filters}} + else: + query_body = {"match_all": {}} + + body: Dict = { + "query": query_body, + "highlight": { + "fields": { + "transcript_full": { + "fragment_size": 160, + "number_of_fragments": 5, + "fragmenter": "span", + }, + "transcript_secondary_full": { + "fragment_size": 160, + "number_of_fragments": 5, + "fragmenter": "span", + }, + "title": {"number_of_fragments": 0}, + "description": { + "fragment_size": 160, + "number_of_fragments": 1, + }, + }, + "require_field_match": False, + "pre_tags": [""], + "post_tags": [""], + "encoder": "html", + "max_analyzed_offset": 900000, + }, + } + if query_body.get("match_all") is None: + body["highlight"]["highlight_query"] = copy.deepcopy(query_body) + + if sort == "newer": + body["sort"] = [{"date": {"order": "desc"}}] + elif sort == "older": + body["sort"] = [{"date": {"order": "asc"}}] + return body + + +def create_app(config: AppConfig = CONFIG) -> Flask: + app = Flask(__name__, static_folder=str(Path(__file__).parent / "static")) + client = _ensure_client(config) + index = config.elastic.index + + @app.route("/") + def index_page(): + return send_from_directory(app.static_folder, "index.html") + + @app.route("/static/") + def static_files(filename: str): + return send_from_directory(app.static_folder, filename) + + @app.route("/api/channels") + def channels(): + base_channels_body = { + "size": 0, + 
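+            # Terms aggregation on the channel field; the helper below retries
+            # with plain "channel_id" if the ".keyword" sub-field is missing.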
"aggs": { + "channels": { + "terms": {"field": "channel_id", "size": 200}, + "aggs": { + "name": { + "top_hits": { + "size": 1, + "_source": {"includes": ["channel_name"]}, + } + } + }, + } + }, + } + + def run_channels_request(field_name: str): + body = json.loads(json.dumps(base_channels_body)) # deep copy + body["aggs"]["channels"]["terms"]["field"] = field_name + if config.elastic.debug: + LOGGER.info( + "Elasticsearch channels request: %s", + json.dumps({"index": index, "body": body}, indent=2), + ) + return client.search(index=index, body=body) + + response = None + last_error = None + for candidate_field in ("channel_id.keyword", "channel_id"): + try: + response = run_channels_request(candidate_field) + if config.elastic.debug: + LOGGER.info("Channels aggregation used field: %s", candidate_field) + break + except BadRequestError as exc: + last_error = exc + if config.elastic.debug: + LOGGER.warning( + "Channels aggregation failed for field %s: %s", + candidate_field, + exc, + ) + if response is None: + raise last_error or RuntimeError("Unable to aggregate channels.") + + if config.elastic.debug: + LOGGER.info( + "Elasticsearch channels response: %s", + json.dumps(response, indent=2, default=str), + ) + buckets = ( + response.get("aggregations", {}) + .get("channels", {}) + .get("buckets", []) + ) + data = [ + { + "Id": bucket.get("key"), + "Name": ( + bucket.get("name", {}) + .get("hits", {}) + .get("hits", [{}])[0] + .get("_source", {}) + .get("channel_name", bucket.get("key")) + ), + "Count": bucket.get("doc_count", 0), + } + for bucket in buckets + ] + data.sort(key=lambda item: item["Name"].lower()) + return jsonify(data) + + @app.route("/api/search") + def search(): + query = request.args.get("q", "", type=str) + raw_channels: List[Optional[str]] = request.args.getlist("channel_id") + legacy_channel = request.args.get("channel", type=str) + if legacy_channel: + raw_channels.append(legacy_channel) + channels = parse_channel_params(raw_channels) + sort = request.args.get("sort", "relevant", type=str) + page = max(request.args.get("page", 0, type=int), 0) + size = max(request.args.get("size", 10, type=int), 1) + + def parse_flag(name: str, default: bool = True) -> bool: + value = request.args.get(name) + if value is None: + return default + return value.lower() not in {"0", "false", "no"} + + use_exact = parse_flag("exact", True) + use_fuzzy = parse_flag("fuzzy", True) + use_phrase = parse_flag("phrase", True) + use_query_string = parse_flag("query_string", False) + if use_query_string: + use_exact = use_fuzzy = use_phrase = False + + payload = build_query_payload( + query, + channels=channels, + sort=sort, + use_exact=use_exact, + use_fuzzy=use_fuzzy, + use_phrase=use_phrase, + use_query_string=use_query_string, + ) + start = page * size + if config.elastic.debug: + LOGGER.info( + "Elasticsearch search request: %s", + json.dumps( + { + "index": index, + "from": start, + "size": size, + "body": payload, + "channels": channels, + "toggles": { + "exact": use_exact, + "fuzzy": use_fuzzy, + "phrase": use_phrase, + }, + }, + indent=2, + ), + ) + response = client.search( + index=index, + from_=start, + size=size, + body=payload, + ) + if config.elastic.debug: + LOGGER.info( + "Elasticsearch search response: %s", + json.dumps(response, indent=2, default=str), + ) + + hits = response.get("hits", {}) + total = hits.get("total", {}).get("value", 0) + documents = [] + for hit in hits.get("hits", []): + source = hit.get("_source", {}) + highlight_map = hit.get("highlight", {}) + 
transcript_highlight = ( + (highlight_map.get("transcript_full", []) or []) + + (highlight_map.get("transcript_secondary_full", []) or []) + ) + + title_html = ( + highlight_map.get("title") + or [source.get("title") or "Untitled"] + )[0] + description_html = ( + highlight_map.get("description") + or [source.get("description") or ""] + )[0] + documents.append( + { + "video_id": source.get("video_id"), + "channel_id": source.get("channel_id"), + "channel_name": source.get("channel_name"), + "title": source.get("title"), + "titleHtml": title_html, + "description": source.get("description"), + "descriptionHtml": description_html, + "date": source.get("date"), + "url": source.get("url"), + "toHighlight": transcript_highlight, + "highlightSource": { + "primary": bool(highlight_map.get("transcript_full")), + "secondary": bool(highlight_map.get("transcript_secondary_full")), + }, + } + ) + + return jsonify( + { + "items": documents, + "totalResults": total, + "totalPages": (total + size - 1) // size, + "currentPage": page, + } + ) + + @app.route("/api/metrics") + def metrics(): + try: + data = elastic_metrics_payload( + client, + index, + channel_field_candidates=["channel_id.keyword", "channel_id"], + debug=config.elastic.debug, + ) + except Exception: + LOGGER.exception( + "Falling back to local metrics payload due to Elasticsearch error.", + exc_info=True, + ) + data = metrics_payload(config.data.root) + return jsonify(data) + + @app.route("/api/frequency") + def frequency(): + raw_term = request.args.get("term", type=str) or "" + use_query_string = request.args.get("query_string", default="0", type=str) + use_query_string = (use_query_string or "").lower() in {"1", "true", "yes"} + term = raw_term.strip() + if not term and not use_query_string: + return ("term parameter is required", 400) + if use_query_string and not term: + term = "*" + + raw_channels: List[Optional[str]] = request.args.getlist("channel_id") + legacy_channel = request.args.get("channel", type=str) + if legacy_channel: + raw_channels.append(legacy_channel) + channels = parse_channel_params(raw_channels) + interval = (request.args.get("interval", "month") or "month").lower() + allowed_intervals = {"day", "week", "month", "quarter", "year"} + if interval not in allowed_intervals: + interval = "month" + start = request.args.get("start", type=str) + end = request.args.get("end", type=str) + + filters: List[Dict] = [] + channel_filter = build_channel_filter(channels) + if channel_filter: + filters.append(channel_filter) + if start or end: + range_filter: Dict[str, Dict[str, Dict[str, str]]] = {"range": {"date": {}}} + if start: + range_filter["range"]["date"]["gte"] = start + if end: + range_filter["range"]["date"]["lte"] = end + filters.append(range_filter) + + base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"] + if use_query_string: + qs_query = term or "*" + must_clause: List[Dict[str, Any]] = [ + { + "query_string": { + "query": qs_query, + "default_operator": "AND", + "fields": base_fields, + } + } + ] + else: + must_clause = [ + { + "multi_match": { + "query": term, + "fields": base_fields, + "type": "best_fields", + "operator": "and", + } + } + ] + + query: Dict[str, Any] = {"bool": {"must": must_clause}} + if filters: + query["bool"]["filter"] = filters + + histogram: Dict[str, Any] = { + "field": "date", + "calendar_interval": interval, + "min_doc_count": 0, + } + if start or end: + bounds: Dict[str, str] = {} + if start: + bounds["min"] = start + if end: + bounds["max"] = end + if 
bounds: + histogram["extended_bounds"] = bounds + + channel_terms_size = max(6, len(channels)) if channels else 6 + + body = { + "size": 0, + "query": query, + "aggs": { + "over_time": { + "date_histogram": histogram, + "aggs": { + "by_channel": { + "terms": { + "field": "channel_id.keyword", + "size": channel_terms_size, + "order": {"_count": "desc"}, + } + } + }, + } + }, + } + + if config.elastic.debug: + LOGGER.info( + "Elasticsearch frequency request: %s", + json.dumps( + { + "index": index, + "body": body, + "term": term, + "interval": interval, + "channels": channels, + "start": start, + "end": end, + "query_string": use_query_string, + }, + indent=2, + ), + ) + + response = client.search(index=index, body=body) + + if config.elastic.debug: + LOGGER.info( + "Elasticsearch frequency response: %s", + json.dumps(response, indent=2, default=str), + ) + + raw_buckets = ( + response.get("aggregations", {}) + .get("over_time", {}) + .get("buckets", []) + ) + + channel_totals: Dict[str, int] = {} + buckets: List[Dict[str, Any]] = [] + for bucket in raw_buckets: + date_str = bucket.get("key_as_string") + total = bucket.get("doc_count", 0) + channel_entries: List[Dict[str, Any]] = [] + for ch_bucket in bucket.get("by_channel", {}).get("buckets", []): + cid = ch_bucket.get("key") + count = ch_bucket.get("doc_count", 0) + if cid: + channel_entries.append({"id": cid, "count": count}) + channel_totals[cid] = channel_totals.get(cid, 0) + count + buckets.append( + {"date": date_str, "total": total, "channels": channel_entries} + ) + + ranked_channels = sorted( + [{"id": cid, "total": total} for cid, total in channel_totals.items()], + key=lambda item: item["total"], + reverse=True, + ) + + payload = { + "term": raw_term if not use_query_string else term, + "interval": interval, + "buckets": buckets, + "channels": ranked_channels, + "totalResults": response.get("hits", {}) + .get("total", {}) + .get("value", 0), + } + return jsonify(payload) + + @app.route("/frequency") + def frequency_page(): + return send_from_directory(app.static_folder, "frequency.html") + + @app.route("/api/transcript") + def transcript(): + video_id = request.args.get("video_id", type=str) + if not video_id: + return ("video_id not set", 400) + response = client.get(index=index, id=video_id, ignore=[404]) + if config.elastic.debug: + LOGGER.info( + "Elasticsearch transcript request: index=%s id=%s", index, video_id + ) + LOGGER.info( + "Elasticsearch transcript response: %s", + json.dumps(response, indent=2, default=str) + if response + else "None", + ) + if not response or not response.get("found"): + return ("not found", 404) + source = response["_source"] + return jsonify( + { + "video_id": source.get("video_id"), + "title": source.get("title"), + "transcript_parts": source.get("transcript_parts", []), + "transcript_full": source.get("transcript_full"), + "transcript_secondary_parts": source.get("transcript_secondary_parts", []), + "transcript_secondary_full": source.get("transcript_secondary_full"), + } + ) + + return app + + +def main() -> None: # pragma: no cover + logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") + app = create_app() + app.run(host="0.0.0.0", port=8080, debug=True) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/static/app.js b/static/app.js new file mode 100644 index 0000000..3324fb4 --- /dev/null +++ b/static/app.js @@ -0,0 +1,733 @@ +(() => { + let qs = new URLSearchParams(window.location.search); + const qInput = document.getElementById("q"); + 
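+  // Cache the search form controls and result containers once at startup.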
const channelDropdown = document.getElementById("channelDropdown"); + const channelSummary = document.getElementById("channelSummary"); + const channelOptions = document.getElementById("channelOptions"); + const sortSel = document.getElementById("sort"); + const sizeSel = document.getElementById("size"); + const exactToggle = document.getElementById("exactToggle"); + const fuzzyToggle = document.getElementById("fuzzyToggle"); + const phraseToggle = document.getElementById("phraseToggle"); + const queryToggle = document.getElementById("queryStringToggle"); + const searchBtn = document.getElementById("searchBtn"); + const resultsDiv = document.getElementById("results"); + const metaDiv = document.getElementById("meta"); + const metricsContainer = document.getElementById("metrics"); + const metricsStatus = document.getElementById("metricsStatus"); + const metricsContent = document.getElementById("metricsContent"); + const freqSummary = document.getElementById("frequencySummary"); + const freqChart = document.getElementById("frequencyChart"); + const channelMap = new Map(); + const selectedChannels = new Set(); + let pendingChannelSelection = []; + let channelsReady = false; + let suppressChannelChange = false; + let allChannelsCheckbox = null; + let previousToggleState = { exact: true, fuzzy: true, phrase: true }; + let currentPage = + parseInt(qs.get("page") || "0", 10) || + 0; + + function parseBoolParam(name, defaultValue) { + const raw = qs.get(name); + if (raw === null) return defaultValue; + const lowered = raw.toLowerCase(); + return !["0", "false", "no"].includes(lowered); + } + + function parseChannelParams(params) { + const collected = []; + if (!params) return collected; + const seen = new Set(); + const rawValues = params.getAll("channel_id"); + const legacy = params.get("channel"); + if (legacy) rawValues.push(legacy); + rawValues.forEach((value) => { + if (value == null) return; + String(value) + .split(",") + .map((part) => part.trim()) + .filter((part) => part && part.toLowerCase() !== "all") + .forEach((part) => { + if (!seen.has(part)) { + seen.add(part); + collected.push(part); + } + }); + }); + return collected; + } + + function getSelectedChannels() { + return Array.from(selectedChannels); + } + + function ensureAllCheckboxState() { + if (allChannelsCheckbox) { + allChannelsCheckbox.checked = selectedChannels.size === 0; + } + } + + function updateChannelSummary() { + if (!channelSummary) return; + if (!selectedChannels.size) { + channelSummary.textContent = "All Channels"; + return; + } + const names = Array.from(selectedChannels).map( + (id) => channelMap.get(id) || id + ); + if (names.length > 1) { + names.sort((a, b) => a.localeCompare(b, undefined, { sensitivity: "base" })); + } + let label = names.slice(0, 3).join(", "); + if (names.length > 3) { + label += ` +${names.length - 3} more`; + } + channelSummary.textContent = label; + } + + function applyChannelSelection(ids, { silent = false } = {}) { + selectedChannels.clear(); + ids.forEach((id) => selectedChannels.add(id)); + pendingChannelSelection = getSelectedChannels(); + ensureAllCheckboxState(); + if (channelOptions) { + suppressChannelChange = true; + const checkboxes = channelOptions.querySelectorAll( + 'input[type="checkbox"][data-channel="1"]' + ); + checkboxes.forEach((checkbox) => { + checkbox.checked = selectedChannels.has(checkbox.value); + }); + suppressChannelChange = false; + } + updateChannelSummary(); + if (!silent && channelsReady) { + runSearch(0); + } + } + + function setFromQuery() { + 
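+    // Restore form state (query, sort, size, channels, toggles) from the URL
+    // so searches are shareable and survive back/forward navigation.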
qInput.value = qs.get("q") || ""; + sortSel.value = qs.get("sort") || "relevant"; + sizeSel.value = qs.get("size") || "10"; + pendingChannelSelection = parseChannelParams(qs); + applyChannelSelection(pendingChannelSelection, { silent: true }); + exactToggle.checked = parseBoolParam("exact", true); + fuzzyToggle.checked = parseBoolParam("fuzzy", true); + phraseToggle.checked = parseBoolParam("phrase", true); + queryToggle.checked = parseBoolParam("query_string", false); + applyQueryMode(); + rememberToggleState(); + } + + function applyQueryMode() { + if (!queryToggle) return; + if (queryToggle.checked) { + if (!exactToggle.disabled) { + previousToggleState = { + exact: exactToggle.checked, + fuzzy: fuzzyToggle.checked, + phrase: phraseToggle.checked, + }; + } + exactToggle.checked = false; + fuzzyToggle.checked = false; + phraseToggle.checked = false; + exactToggle.disabled = true; + fuzzyToggle.disabled = true; + phraseToggle.disabled = true; + } else { + exactToggle.disabled = false; + fuzzyToggle.disabled = false; + phraseToggle.disabled = false; + exactToggle.checked = previousToggleState.exact; + fuzzyToggle.checked = previousToggleState.fuzzy; + phraseToggle.checked = previousToggleState.phrase; + } + } + + function rememberToggleState() { + if (queryToggle && !queryToggle.checked) { + previousToggleState = { + exact: !!exactToggle.checked, + fuzzy: !!fuzzyToggle.checked, + phrase: !!phraseToggle.checked, + }; + } + } + + if (channelOptions) { + channelOptions.addEventListener("change", (event) => { + const target = event.target; + if (!(target instanceof HTMLInputElement) || target.type !== "checkbox") { + return; + } + if (suppressChannelChange) { + return; + } + if (target.dataset.all === "1") { + if (!target.checked && !selectedChannels.size) { + suppressChannelChange = true; + target.checked = true; + suppressChannelChange = false; + return; + } + if (target.checked) { + selectedChannels.clear(); + pendingChannelSelection = []; + suppressChannelChange = true; + const others = channelOptions.querySelectorAll( + 'input[type="checkbox"][data-channel="1"]' + ); + others.forEach((checkbox) => { + checkbox.checked = false; + }); + suppressChannelChange = false; + ensureAllCheckboxState(); + updateChannelSummary(); + if (channelsReady) { + runSearch(0); + } + } + return; + } + + const id = target.value; + if (!id) return; + if (target.checked) { + selectedChannels.add(id); + } else { + selectedChannels.delete(id); + } + pendingChannelSelection = getSelectedChannels(); + ensureAllCheckboxState(); + updateChannelSummary(); + if (channelsReady) { + runSearch(0); + } + }); + } + + async function loadChannels() { + if (!channelOptions) { + channelsReady = true; + return; + } + try { + const res = await fetch("/api/channels"); + const data = await res.json(); + channelMap.clear(); + channelOptions.innerHTML = ""; + + const listFragment = document.createDocumentFragment(); + + const allLabel = document.createElement("label"); + allLabel.className = "channel-option"; + allChannelsCheckbox = document.createElement("input"); + allChannelsCheckbox.type = "checkbox"; + allChannelsCheckbox.dataset.all = "1"; + allChannelsCheckbox.checked = selectedChannels.size === 0; + const allText = document.createElement("span"); + allText.textContent = "All Channels"; + allLabel.appendChild(allChannelsCheckbox); + allLabel.appendChild(allText); + listFragment.appendChild(allLabel); + + data.forEach((item) => { + const label = document.createElement("label"); + label.className = "channel-option"; + const 
checkbox = document.createElement("input"); + checkbox.type = "checkbox"; + checkbox.value = item.Id; + checkbox.dataset.channel = "1"; + const text = document.createElement("span"); + text.textContent = `${item.Name} (${item.Count})`; + label.appendChild(checkbox); + label.appendChild(text); + listFragment.appendChild(label); + channelMap.set(item.Id, item.Name); + }); + + channelOptions.appendChild(listFragment); + + if (!data.length) { + const empty = document.createElement("div"); + empty.textContent = "No channels available."; + channelOptions.appendChild(empty); + } + + const initialSelection = pendingChannelSelection.length + ? pendingChannelSelection + : Array.from(selectedChannels); + applyChannelSelection(initialSelection, { silent: true }); + channelsReady = true; + updateChannelSummary(); + } catch (err) { + console.error("Failed to load channels", err); + channelOptions.innerHTML = "
<div class=\"muted\">Failed to load channels.</div>
"; + channelsReady = true; + ensureAllCheckboxState(); + updateChannelSummary(); + } + } + + function updateUrl(q, sort, channels, page, size, exact, fuzzy, phrase, queryMode) { + const next = new URL(window.location.href); + next.searchParams.set("q", q); + next.searchParams.set("sort", sort); + next.searchParams.delete("channel_id"); + next.searchParams.delete("channel"); + channels.forEach((id) => next.searchParams.append("channel_id", id)); + next.searchParams.set("page", page); + next.searchParams.set("size", size); + next.searchParams.set("exact", exact ? "1" : "0"); + next.searchParams.set("fuzzy", fuzzy ? "1" : "0"); + next.searchParams.set("phrase", phrase ? "1" : "0"); + next.searchParams.set("query_string", queryMode ? "1" : "0"); + history.pushState({}, "", next.toString()); + } + + function fmtDate(value) { + try { + return (value || "").split("T")[0]; + } catch { + return value; + } + } + + function fmtNumber(n) { + if (typeof n === "number") return n.toLocaleString(); + return n; + } + + + // Transcript viewer functionality removed. + + function renderMetrics(data) { + if (!metricsContent) return; + metricsContent.innerHTML = ""; + if (!data) return; + + if (metricsStatus) { + metricsStatus.textContent = ""; + } + + const summary = document.createElement("div"); + summary.innerHTML = `Entries: ${fmtNumber(data.totalItems)} • Channels: ${fmtNumber(data.totalChannels)}`; + metricsContent.appendChild(summary); + + if (Array.isArray(data.itemsPerChannel) && data.itemsPerChannel.length) { + const top = data.itemsPerChannel.slice(0, 5); + const channelHeader = document.createElement("div"); + channelHeader.style.marginTop = "8px"; + channelHeader.innerHTML = "Top Channels"; + metricsContent.appendChild(channelHeader); + + const channelList = document.createElement("div"); + channelList.className = "muted"; + top.forEach((entry) => { + const row = document.createElement("div"); + row.textContent = `${entry.label}: ${fmtNumber(entry.count)}`; + channelList.appendChild(row); + }); + metricsContent.appendChild(channelList); + } +} + +async function loadMetrics() { + if (!metricsContainer) return; + metricsContainer.dataset.loading = "1"; + if (!metricsContainer.dataset.loaded && metricsStatus) { + metricsStatus.textContent = "Loading metrics…"; + } + try { + const res = await fetch("/api/metrics"); + const data = await res.json(); + renderMetrics(data); + metricsContainer.dataset.loaded = "1"; + } catch (err) { + console.error("Failed to load metrics", err); + if (!metricsContainer.dataset.loaded && metricsStatus) { + metricsStatus.textContent = "Metrics unavailable."; + } + } finally { + delete metricsContainer.dataset.loading; + } +} + +function clearFrequency(message) { + if (freqSummary) { + freqSummary.textContent = message || ""; + } + if (freqChart) { + freqChart.innerHTML = ""; + } +} + +function renderFrequencyChart(buckets, channelTotals) { + if (!freqChart || typeof d3 === "undefined") { + return; + } + freqChart.innerHTML = ""; + if (!buckets.length) { + clearFrequency("No matches for this query."); + return; + } + + let channelsOrder = + (channelTotals && channelTotals.length + ? 
channelTotals.map((entry) => entry.id) + : []) || []; + if (!channelsOrder.length) { + const unique = new Set(); + buckets.forEach((bucket) => { + (bucket.channels || []).forEach((entry) => unique.add(entry.id)); + }); + channelsOrder = Array.from(unique); + } + channelsOrder = channelsOrder.slice(0, 6); + if (!channelsOrder.length) { + clearFrequency("No matches for this query."); + return; + } + + const dateKeyFormat = d3.timeFormat("%Y-%m-%d"); + const parsed = buckets + .map((bucket) => { + const parsedDate = d3.isoParse(bucket.date) || new Date(bucket.date); + if (!(parsedDate instanceof Date) || Number.isNaN(parsedDate.valueOf())) { + return null; + } + const counts = {}; + (bucket.channels || []).forEach((entry) => { + if (channelsOrder.includes(entry.id)) { + counts[entry.id] = entry.count || 0; + } + }); + return { + date: parsedDate, + dateKey: dateKeyFormat(parsedDate), + counts, + }; + }) + .filter(Boolean); + + if (!parsed.length) { + clearFrequency("Timeline unavailable."); + return; + } + + const margin = { top: 12, right: 12, bottom: 52, left: 56 }; + const fullWidth = freqChart.clientWidth || 360; + const fullHeight = 220; + const width = fullWidth - margin.left - margin.right; + const height = fullHeight - margin.top - margin.bottom; + + const svg = d3 + .select(freqChart) + .append("svg") + .attr("width", fullWidth) + .attr("height", fullHeight); + + const g = svg + .append("g") + .attr("transform", `translate(${margin.left},${margin.top})`); + + const x = d3 + .scaleBand() + .domain(parsed.map((entry) => entry.dateKey)) + .range([0, width]) + .padding(0.25); + + const yMax = d3.max(parsed, (entry) => + d3.sum(channelsOrder, (key) => entry.counts[key] || 0) + ); + + const y = d3 + .scaleLinear() + .domain([0, yMax || 0]) + .nice() + .range([height, 0]); + + const tickValues = + parsed.length <= 6 + ? parsed.map((entry) => entry.dateKey) + : parsed + .filter((_, index, arr) => index % Math.ceil(arr.length / 6) === 0) + .map((entry) => entry.dateKey); + + const xAxis = d3.axisBottom(x).tickValues(tickValues); + const yAxis = d3.axisLeft(y).ticks(5); + + g.append("g") + .attr("class", "axis") + .attr("transform", `translate(0,${height})`) + .call(xAxis) + .selectAll("text") + .attr("text-anchor", "end") + .attr("transform", "rotate(-35)") + .attr("dx", "-0.8em") + .attr("dy", "0.15em"); + + g.append("g").attr("class", "axis").call(yAxis); + + const stack = d3.stack().keys(channelsOrder).value((entry, key) => entry.counts[key] || 0); + const stacked = stack(parsed); + const color = d3.scaleOrdinal(channelsOrder, d3.schemeTableau10); + + const layers = g + .selectAll(".freq-layer") + .data(stacked) + .enter() + .append("g") + .attr("class", "freq-layer") + .attr("fill", (d) => color(d.key)); + + layers + .selectAll("rect") + .data((d) => d) + .enter() + .append("rect") + .attr("x", (d) => x(d.data.dateKey)) + .attr("width", x.bandwidth()) + .attr("y", (d) => y(d[1])) + .attr("height", (d) => y(d[0]) - y(d[1])) + .append("title") + .text(function (d) { + const group = this.parentNode ? this.parentNode.parentNode : null; + const key = group ? d3.select(group).datum().key : undefined; + const label = key ? channelMap.get(key) || key : key || ''; + return `${dateKeyFormat(d.data.date)}: ${d[1] - d[0]}${label ? 
" (" + label + ")" : ''}`; + }); + + const legend = document.createElement("div"); + legend.className = "freq-legend"; + channelsOrder.forEach((key) => { + const item = document.createElement("div"); + item.className = "freq-legend-item"; + const swatch = document.createElement("span"); + swatch.className = "freq-legend-swatch"; + swatch.style.backgroundColor = color(key); + const label = document.createElement("span"); + label.textContent = channelMap.get(key) || key; + item.appendChild(swatch); + item.appendChild(label); + legend.appendChild(item); + }); + freqChart.appendChild(legend); +} + +async function updateFrequencyChart(term, channels, queryMode) { + if (!freqChart || typeof d3 === "undefined") { + return; + } + let trimmed = term.trim(); + if (!trimmed) { + if (queryMode) { + trimmed = "*"; + } else { + clearFrequency("Enter a query to see timeline."); + return; + } + } + + const params = new URLSearchParams(); + params.set("term", trimmed); + params.set("interval", "month"); + (channels || []).forEach((id) => params.append("channel_id", id)); + if (queryMode) { + params.set("query_string", "1"); + } + + clearFrequency("Loading timeline…"); + try { + const res = await fetch(`/api/frequency?${params.toString()}`); + if (!res.ok) { + throw new Error(`Request failed with status ${res.status}`); + } + const payload = await res.json(); + const total = payload.totalResults || 0; + if (freqSummary) { + if (total === 0) { + freqSummary.textContent = "No matches for this query."; + } else if (queryMode) { + freqSummary.textContent = `Matches: ${total.toLocaleString()} • Interval: ${payload.interval || "month"} (query-string)`; + } else { + freqSummary.textContent = `Matches: ${total.toLocaleString()} • Interval: ${payload.interval || "month"}`; + } + } + if (total === 0) { + freqChart.innerHTML = ""; + return; + } + renderFrequencyChart(payload.buckets || [], payload.channels || []); + } catch (err) { + console.error(err); + clearFrequency("Timeline unavailable."); + } +} + + function renderResults(payload, page) { + resultsDiv.innerHTML = ""; + metaDiv.textContent = `Total: ${payload.totalResults} • Page ${ + page + 1 + } of ${payload.totalPages}`; + + (payload.items || []).forEach((item) => { + const el = document.createElement("div"); + el.className = "item"; + const titleHtml = + item.titleHtml || escapeHtml(item.title || "Untitled"); + const descriptionHtml = + item.descriptionHtml || escapeHtml(item.description || ""); + + const header = document.createElement("div"); + const badges = []; + if (item.highlightSource && item.highlightSource.primary) badges.push('primary transcript'); + if (item.highlightSource && item.highlightSource.secondary) badges.push('secondary transcript'); + const badgeHtml = badges.length + ? `
<div class="badges">${badges
+          .map((b) => `<span class="badge">${escapeHtml(b)}</span>`)
+          .join('')}</div>
` + : ''; + header.innerHTML = ` + ${titleHtml} +
<div class="muted">${escapeHtml(item.channel_name || "")} • ${fmtDate(
+          item.date
+        )}</div>
+ + ${badgeHtml} + `; + el.appendChild(header); + + if (descriptionHtml) { + const desc = document.createElement("div"); + desc.className = "muted"; + desc.innerHTML = descriptionHtml; + el.appendChild(desc); + } + + if (Array.isArray(item.toHighlight) && item.toHighlight.length) { + const highlights = document.createElement("div"); + highlights.className = "transcript highlight-list"; + item.toHighlight.forEach((entry) => { + const html = typeof entry === "string" ? entry : entry?.html; + if (!html) return; + const row = document.createElement("div"); + row.className = "highlight-row"; + row.innerHTML = html; + highlights.appendChild(row); + }); + if (highlights.childElementCount) { + el.appendChild(highlights); + } + } + + resultsDiv.appendChild(el); + }); + + const pager = document.createElement("div"); + pager.className = "pager"; + const prev = document.createElement("button"); + prev.textContent = "Prev"; + prev.disabled = page <= 0; + const next = document.createElement("button"); + next.textContent = "Next"; + next.disabled = page + 1 >= payload.totalPages; + prev.onclick = () => runSearch(page - 1); + next.onclick = () => runSearch(page + 1); + pager.appendChild(prev); + pager.appendChild(next); + resultsDiv.appendChild(pager); + } + + async function runSearch(pageOverride, pushState = true) { + const q = qInput.value.trim(); + const channels = getSelectedChannels(); + const sort = sortSel.value; + const size = parseInt(sizeSel.value, 10) || 10; + const queryMode = queryToggle && queryToggle.checked; + let exact = !!exactToggle.checked; + let fuzzy = !!fuzzyToggle.checked; + let phrase = !!phraseToggle.checked; + if (queryMode) { + exact = false; + fuzzy = false; + phrase = false; + } else { + previousToggleState = { + exact, + fuzzy, + phrase, + }; + } + const page = pageOverride != null ? pageOverride : currentPage; + currentPage = page; + + if (pushState) { + updateUrl(q, sort, channels, page, size, exact, fuzzy, phrase, queryMode); + } + + const params = new URLSearchParams(); + params.set("q", q); + params.set("sort", sort); + params.set("size", String(size)); + params.set("page", String(page)); + params.set("exact", exact ? "1" : "0"); + params.set("fuzzy", fuzzy ? "1" : "0"); + params.set("phrase", phrase ? "1" : "0"); + params.set("query_string", queryMode ? 
"1" : "0"); + channels.forEach((id) => params.append("channel_id", id)); + + const res = await fetch(`/api/search?${params.toString()}`); + const payload = await res.json(); + renderResults(payload, page); + updateFrequencyChart(q, channels, queryMode); + } + +searchBtn.addEventListener("click", () => runSearch(0)); + qInput.addEventListener("keypress", (e) => { + if (e.key === "Enter") runSearch(0); + }); + sortSel.addEventListener("change", () => runSearch(0)); + sizeSel.addEventListener("change", () => runSearch(0)); + exactToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); }); + fuzzyToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); }); + phraseToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); }); + if (queryToggle) { + queryToggle.addEventListener("change", () => { applyQueryMode(); runSearch(0); }); + } + +window.addEventListener("popstate", () => { + qs = new URLSearchParams(window.location.search); + setFromQuery(); + currentPage = parseInt(qs.get("page") || "0", 10) || 0; + runSearch(currentPage, false); + }); + + setFromQuery(); + loadMetrics(); + loadChannels().then(() => runSearch(currentPage)); +})(); + +function escapeHtml(str) { + return (str || "").replace(/[&<>"']/g, (ch) => { + switch (ch) { + case "&": + return "&"; + case "<": + return "<"; + case ">": + return ">"; + case '"': + return """; + case "'": + return "'"; + default: + return ch; + } + }); +} diff --git a/static/frequency.html b/static/frequency.html new file mode 100644 index 0000000..be24ec9 --- /dev/null +++ b/static/frequency.html @@ -0,0 +1,68 @@ + + + + + + Term Frequency Explorer + + + + + +
diff --git a/static/frequency.html b/static/frequency.html new file mode 100644 index 0000000..be24ec9 --- /dev/null +++ b/static/frequency.html @@ -0,0 +1,68 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Term Frequency Explorer</title>
+    <link rel="stylesheet" href="/static/style.css" />
+  </head>
+  <body>
+    <header>
+      <h1>Term Frequency Explorer</h1>
+    </header>
+    <div class="muted">
+      Pick a term to see how often it appears over time.
+      <a href="/">Back to search</a>
+    </div>
+    <div class="controls">
+      <input id="term" type="text" placeholder="Term" />
+      <select id="channel">
+        <option value="all">All Channels</option>
+      </select>
+      <select id="interval">
+        <option value="week">Week</option>
+        <option value="month" selected>Month</option>
+        <option value="year">Year</option>
+      </select>
+      <input id="start" type="date" />
+      <input id="end" type="date" />
+      <button id="runBtn">Run</button>
+    </div>
+    <div id="summary" class="muted"></div>
+    <div id="chart"></div>
+    <script src="https://d3js.org/d3.v7.min.js"></script>
+    <script src="/static/frequency.js"></script>
+  </body>
+</html>
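The page is a thin shell over the Flask `/api/frequency` endpoint; `frequency.js` below builds the query string from these controls and renders the buckets with D3. For reference, a hedged sketch of the same call from Python, assuming a local server on port 5000 (the `totalResults`/`buckets`/`interval` response keys are the ones `runFrequency` consumes):

```python
# Sketch: fetch the date-histogram buckets that the explorer charts.
# Assumes the Flask app is serving locally on port 5000.
import requests

params = {"term": "meaning", "interval": "month"}  # channel_id/start/end optional
res = requests.get("http://localhost:5000/api/frequency", params=params)
res.raise_for_status()
payload = res.json()

print(f"{payload.get('totalResults', 0)} matches at interval {payload.get('interval')}")
for bucket in payload.get("buckets", []):
    print(bucket["date"], bucket["count"])
```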
+ + + + + diff --git a/static/frequency.js b/static/frequency.js new file mode 100644 index 0000000..14bc799 --- /dev/null +++ b/static/frequency.js @@ -0,0 +1,222 @@ +(() => { + let qs = new URLSearchParams(window.location.search); + + const termInput = document.getElementById("term"); + const channelSel = document.getElementById("channel"); + const intervalSel = document.getElementById("interval"); + const startInput = document.getElementById("start"); + const endInput = document.getElementById("end"); + const runBtn = document.getElementById("runBtn"); + const summaryDiv = document.getElementById("summary"); + const chartDiv = document.getElementById("chart"); + + function parseParams() { + return { + term: qs.get("term") || "", + channel: qs.get("channel_id") || "all", + interval: qs.get("interval") || "month", + start: qs.get("start") || "", + end: qs.get("end") || "", + }; + } + + function setFormFromParams() { + const params = parseParams(); + termInput.value = params.term; + intervalSel.value = params.interval; + startInput.value = params.start; + endInput.value = params.end; + return params; + } + + function updateUrl(params) { + const url = new URL(window.location.href); + url.searchParams.set("term", params.term); + url.searchParams.set("channel_id", params.channel); + url.searchParams.set("interval", params.interval); + if (params.start) url.searchParams.set("start", params.start); + else url.searchParams.delete("start"); + if (params.end) url.searchParams.set("end", params.end); + else url.searchParams.delete("end"); + history.pushState({}, "", url.toString()); + qs = new URLSearchParams(url.search); + } + + async function loadChannels(initialValue) { + try { + const res = await fetch("/api/channels"); + const data = await res.json(); + data.forEach((item) => { + const opt = document.createElement("option"); + opt.value = item.Id; + opt.textContent = `${item.Name} (${item.Count})`; + channelSel.appendChild(opt); + }); + } catch (err) { + console.error("Failed to load channels", err); + } + channelSel.value = initialValue || "all"; + } + + function drawChart(data) { + chartDiv.innerHTML = ""; + if (!data.length) { + const msg = document.createElement("div"); + msg.className = "muted"; + msg.textContent = "No matching documents for this term."; + chartDiv.appendChild(msg); + return; + } + + const parsed = data + .map((d) => ({ + date: d3.isoParse(d.date) || new Date(d.date), + value: d.count, + })) + .filter((d) => d.date instanceof Date && !Number.isNaN(d.date.valueOf())); + + if (!parsed.length) { + const msg = document.createElement("div"); + msg.className = "muted"; + msg.textContent = "Unable to parse dates for this series."; + chartDiv.appendChild(msg); + return; + } + + const margin = { top: 20, right: 30, bottom: 40, left: 56 }; + const fullWidth = chartDiv.clientWidth || 900; + const fullHeight = 360; + const width = fullWidth - margin.left - margin.right; + const height = fullHeight - margin.top - margin.bottom; + + const svg = d3 + .select(chartDiv) + .append("svg") + .attr("width", fullWidth) + .attr("height", fullHeight); + + const g = svg + .append("g") + .attr("transform", `translate(${margin.left},${margin.top})`); + + const x = d3 + .scaleTime() + .domain(d3.extent(parsed, (d) => d.date)) + .range([0, width]); + + const y = d3 + .scaleLinear() + .domain([0, d3.max(parsed, (d) => d.value) || 0]) + .nice() + .range([height, 0]); + + const xAxis = d3.axisBottom(x).ticks(6).tickFormat(d3.timeFormat("%Y-%m-%d")); + const yAxis = d3.axisLeft(y).ticks(6); + + 
g.append("g") + .attr("class", "axis") + .attr("transform", `translate(0,${height})`) + .call(xAxis) + .selectAll("text") + .attr("text-anchor", "end") + .attr("transform", "rotate(-35)") + .attr("dx", "-0.8em") + .attr("dy", "0.15em"); + + g.append("g").attr("class", "axis").call(yAxis); + + const line = d3 + .line() + .x((d) => x(d.date)) + .y((d) => y(d.value)); + + g.append("path") + .datum(parsed) + .attr("class", "line") + .attr("d", line); + + g.selectAll(".dot") + .data(parsed) + .enter() + .append("circle") + .attr("class", "dot") + .attr("r", 3) + .attr("cx", (d) => x(d.date)) + .attr("cy", (d) => y(d.value)) + .append("title") + .text((d) => `${d3.timeFormat("%Y-%m-%d")(d.date)}: ${d.value}`); + } + + async function runFrequency(pushState = true) { + const term = termInput.value.trim(); + if (!term) { + summaryDiv.textContent = "Enter a term to begin."; + chartDiv.innerHTML = ""; + return; + } + + const params = { + term, + channel: channelSel.value, + interval: intervalSel.value, + start: startInput.value, + end: endInput.value, + }; + + if (pushState) updateUrl(params); + + const search = new URLSearchParams(); + search.set("term", term); + if (params.channel && params.channel !== "all") { + search.set("channel_id", params.channel); + } + search.set("interval", params.interval); + if (params.start) search.set("start", params.start); + if (params.end) search.set("end", params.end); + + summaryDiv.textContent = "Loading…"; + chartDiv.innerHTML = ""; + + try { + const res = await fetch(`/api/frequency?${search.toString()}`); + if (!res.ok) { + throw new Error(`Request failed: ${res.status}`); + } + const payload = await res.json(); + const total = payload.totalResults || 0; + summaryDiv.textContent = `Matches: ${total.toLocaleString()} • Buckets: ${ + (payload.buckets || []).length + } • Interval: ${payload.interval}`; + drawChart(payload.buckets || []); + } catch (err) { + console.error(err); + summaryDiv.textContent = "Failed to load data."; + } + } + + runBtn.addEventListener("click", () => runFrequency()); + termInput.addEventListener("keypress", (e) => { + if (e.key === "Enter") runFrequency(); + }); + intervalSel.addEventListener("change", () => runFrequency()); + channelSel.addEventListener("change", () => runFrequency()); + startInput.addEventListener("change", () => runFrequency()); + endInput.addEventListener("change", () => runFrequency()); + + window.addEventListener("popstate", () => { + qs = new URLSearchParams(window.location.search); + const params = setFormFromParams(); + channelSel.value = params.channel; + runFrequency(false); + }); + + const initialParams = setFormFromParams(); + loadChannels(initialParams.channel).then(() => { + if (initialParams.term) { + runFrequency(false); + } else { + summaryDiv.textContent = "Enter a term to begin."; + } + }); +})(); + diff --git a/static/index.html b/static/index.html new file mode 100644 index 0000000..822346e --- /dev/null +++ b/static/index.html @@ -0,0 +1,63 @@ + + + + + + This Little Corner (Python) + + + + +
+  <body>
+    <header>
+      <h1>This Little Corner — Elastic Search</h1>
+    </header>
+    <div class="muted">
+      Enter a phrase to query title, description, and transcript text.
+    </div>
+    <div class="controls">
+      <input id="q" type="text" placeholder="Search…" />
+      <details class="channel-dropdown">
+        <summary>All Channels</summary>
+        <div class="channel-options" id="channelOptions">Loading channels…</div>
+      </details>
+      <select id="sort">
+        <option value="relevance">Relevance</option>
+        <option value="date">Date</option>
+      </select>
+      <select id="size">
+        <option>10</option>
+        <option>25</option>
+        <option>50</option>
+      </select>
+      <label><input id="exact" type="checkbox" /> Exact</label>
+      <label><input id="fuzzy" type="checkbox" /> Fuzzy</label>
+      <label><input id="phrase" type="checkbox" /> Phrase</label>
+      <label><input id="queryToggle" type="checkbox" /> Query string</label>
+      <button id="searchBtn">Search</button>
+    </div>
+    <div id="results"></div>
+    <div class="summary-row">
+      <div class="summary-left">
+        <div id="metrics">
+          <div id="metricsStatus" class="muted"></div>
+          <div id="metricsContent"></div>
+        </div>
+      </div>
+      <div class="summary-right">
+        <div id="frequencyChart"></div>
+      </div>
+    </div>
+    <script src="https://d3js.org/d3.v7.min.js"></script>
+    <script src="/static/app.js"></script>
+  </body>
+</html>
+ + + + diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..e61aa61 --- /dev/null +++ b/static/style.css @@ -0,0 +1,225 @@ +body { + font-family: Arial, sans-serif; + margin: 24px; + color: #222; +} + +header { + margin-bottom: 16px; +} + +.controls { + display: flex; + flex-wrap: wrap; + gap: 8px; + align-items: center; + margin-bottom: 12px; +} + +.channel-dropdown { + position: relative; + min-width: 220px; + flex: 0 1 260px; +} + +.channel-dropdown summary { + list-style: none; + cursor: pointer; + border: 1px solid #ccc; + border-radius: 4px; + padding: 6px 8px; + background: #fff; + color: #222; + display: inline-flex; + align-items: center; + min-height: 32px; + max-width: 100%; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.channel-dropdown summary::-webkit-details-marker { + display: none; +} + +.channel-dropdown[open] summary { + border-bottom-left-radius: 0; + border-bottom-right-radius: 0; +} + +.channel-options { + margin-top: 4px; + padding: 8px; + border: 1px solid #ccc; + border-radius: 0 0 4px 4px; + background: #fff; + max-height: 240px; + overflow-y: auto; + box-shadow: 0 2px 6px rgba(0, 0, 0, 0.12); + min-width: 220px; + width: max(220px, 100%); +} + +.channel-option { + display: flex; + align-items: center; + gap: 6px; + margin-bottom: 6px; + font-size: 12px; +} + +.channel-option:last-child { + margin-bottom: 0; +} + +input, +select, +button { + padding: 6px 8px; +} + +.muted { + color: #666; + font-size: 12px; +} + +#results .item { + border-bottom: 1px solid #ddd; + padding: 12px 0; +} + +.summary-row { + display: flex; + gap: 16px; + flex-wrap: wrap; + align-items: flex-start; + margin-top: 12px; +} + +.summary-left { + flex: 0 1 280px; + max-width: 360px; +} + +.summary-right { + flex: 1 1 0%; + min-width: 0; + background: #f5f5f5; + padding: 12px; + border-radius: 8px; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.08); +} + +#metrics { + margin-top: 12px; + display: flex; + flex-direction: column; + gap: 8px; +} + +#metricsStatus { + min-height: 16px; +} + +#metricsContent { + display: flex; + flex-direction: column; + gap: 6px; +} + +#frequencyChart { + margin-top: 8px; +} + +#frequencyChart svg { + max-width: 100%; +} + +#frequencyChart .axis path, +#frequencyChart .axis line { + stroke: #ccc; +} + +#frequencyChart .freq-layer rect { + stroke: #fff; + stroke-width: 0.5px; +} + +.freq-legend { + margin-top: 8px; + display: flex; + flex-wrap: wrap; + gap: 8px; + font-size: 12px; + color: #444; +} + +.freq-legend-item { + display: flex; + align-items: center; + gap: 6px; +} + +.freq-legend-swatch { + width: 12px; + height: 12px; + border-radius: 2px; + display: inline-block; +} + +.transcript { + background: #fafafa; + padding: 8px; + margin-top: 6px; + max-height: 200px; + overflow-y: auto; +} + +.highlight-list { + display: flex; + flex-direction: column; + gap: 8px; + max-height: none; + overflow: visible; +} + +.highlight-row { + padding: 4px 0; + border-bottom: 1px solid #ececec; +} + +.highlight-row:last-child { + border-bottom: none; +} + +.transcript-wrapper { + margin-top: 8px; +} + +.pager { + margin-top: 12px; + display: flex; + gap: 8px; +} + +mark { + background: #ffe58a; + padding: 0 2px; +} + + +.badge-row { + margin-top: 6px; + display: flex; + gap: 4px; + flex-wrap: wrap; +} + +.badge { + background: #0b6efd; + color: #fff; + border-radius: 999px; + padding: 2px 8px; + font-size: 12px; +} diff --git a/transcript_collector.py b/transcript_collector.py new file mode 100644 index 
0000000..65e6617 --- /dev/null +++ b/transcript_collector.py @@ -0,0 +1,226 @@ +""" +Lightweight helpers for gathering video metadata and transcripts from YouTube. + +Usage: + python -m python_app.transcript_collector --channel UC123 --output data/raw + +Relies on: + - YouTube Data API v3 (requires YOUTUBE_API_KEY). + - youtube-transcript-api for transcript retrieval. +Both libraries are optional at import time so the module can still be referenced +when only working with existing JSON dumps. +""" + +from __future__ import annotations + +import argparse +import json +import logging +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Dict, Iterable, Iterator, List, Optional + +from .config import CONFIG + +try: + from googleapiclient.discovery import build as build_youtube # type: ignore +except ImportError: # pragma: no cover - library optional + build_youtube = None + +try: + from youtube_transcript_api import YouTubeTranscriptApi # type: ignore +except ImportError: # pragma: no cover - library optional + YouTubeTranscriptApi = None + + +LOGGER = logging.getLogger(__name__) + + +@dataclass +class TranscriptSegment: + start: float + duration: float + text: str + + +@dataclass +class VideoRecord: + video_id: str + channel_id: str + channel_title: str + title: str + description: str + published_at: str + url: str + transcript: List[TranscriptSegment] + + +def _ensure_youtube_client(api_key: Optional[str]): + if build_youtube is None: + raise RuntimeError( + "google-api-python-client not installed. " + "Install google-api-python-client to collect metadata." + ) + if not api_key: + raise RuntimeError( + "Set YOUTUBE_API_KEY to collect metadata from YouTube." + ) + return build_youtube("youtube", "v3", developerKey=api_key) + + +def _ensure_transcript_api(): + if YouTubeTranscriptApi is None: + raise RuntimeError( + "youtube-transcript-api not installed. " + "Install youtube-transcript-api to fetch transcripts." + ) + return YouTubeTranscriptApi() + + +def iter_channel_videos( + channel_id: str, + *, + api_key: Optional[str] = None, + max_pages: int = 10, +) -> Iterator[Dict]: + """ + Yield raw playlist items for the uploads playlist of the given channel. + + Args: + channel_id: Target YouTube channel ID. + api_key: Explicit API key (defaults to config value). + max_pages: Hard cap on paginated playlist fetches to keep things simple. 
+    """
+    client = _ensure_youtube_client(api_key or CONFIG.youtube.api_key)
+    channels = (
+        client.channels().list(id=channel_id, part="contentDetails").execute()
+    )
+    items = channels.get("items", [])
+    if not items:
+        raise ValueError(f"Channel {channel_id} not found.")
+    uploads_playlist = (
+        items[0]
+        .get("contentDetails", {})
+        .get("relatedPlaylists", {})
+        .get("uploads")
+    )
+    if not uploads_playlist:
+        raise ValueError(f"Channel {channel_id} missing uploads playlist.")
+
+    request = client.playlistItems().list(
+        playlistId=uploads_playlist, part="snippet", maxResults=50
+    )
+    page = 0
+    while request and page < max_pages:
+        response = request.execute()
+        for item in response.get("items", []):
+            yield item
+        page += 1
+        request = client.playlistItems().list_next(request, response)
+
+
+def fetch_transcript(
+    video_id: str, *, languages: Optional[Iterable[str]] = None
+) -> List[TranscriptSegment]:
+    """Return transcript segments for a video, if available."""
+    api = _ensure_transcript_api()
+    try:
+        if languages:
+            transcripts = api.get_transcript(video_id, languages=list(languages))
+        else:
+            # The library expects an iterable of language codes; passing None
+            # would raise, so fall back to its default preference instead.
+            transcripts = api.get_transcript(video_id)
+    except Exception as exc:  # broad catch keeps draft simple
+        LOGGER.warning("Transcript unavailable for %s: %s", video_id, exc)
+        return []
+    return [
+        TranscriptSegment(
+            start=entry.get("start", 0.0),
+            duration=entry.get("duration", 0.0),
+            text=entry.get("text", ""),
+        )
+        for entry in transcripts
+    ]
+
+
+def collect_channel(
+    channel_id: str,
+    output_dir: Path,
+    *,
+    api_key: Optional[str] = None,
+    max_pages: int = 2,
+    languages: Optional[List[str]] = None,
+) -> List[VideoRecord]:
+    """
+    Collect metadata + transcripts for a channel and store as JSON files.
+
+    Returns the in-memory list to make it easy to chain into ingestion.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    videos: List[VideoRecord] = []
+    for item in iter_channel_videos(
+        channel_id, api_key=api_key, max_pages=max_pages
+    ):
+        snippet = item.get("snippet", {})
+        video_id = snippet.get("resourceId", {}).get("videoId")
+        if not video_id:
+            continue
+        segments = fetch_transcript(video_id, languages=languages)
+        record = VideoRecord(
+            video_id=video_id,
+            channel_id=snippet.get("channelId", channel_id),
+            channel_title=snippet.get("channelTitle", ""),
+            title=snippet.get("title", ""),
+            description=snippet.get("description", ""),
+            published_at=snippet.get("publishedAt", ""),
+            url=f"https://www.youtube.com/watch?v={video_id}",
+            transcript=segments,
+        )
+        videos.append(record)
+        dest = output_dir / f"{video_id}.json"
+        with dest.open("w", encoding="utf-8") as handle:
+            json.dump(asdict(record), handle, ensure_ascii=False, indent=2)
+        LOGGER.info("Saved %s", dest)
+    return videos
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Collect channel transcripts into JSON files."
+    )
+    parser.add_argument(
+        "--channel",
+        required=True,
+        help="YouTube channel ID (e.g. UCXYZ).",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("data/raw"),
+        help="Directory to write per-video JSON files.",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=2,
+        help="Number of paginated channel pages to pull (50 videos per page).",
+    )
+    parser.add_argument(
+        "--language",
+        dest="languages",
+        action="append",
+        help="Preferred transcript languages (can be repeated).",
+    )
+    return parser
+
+
+def main() -> None:
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    args = _build_parser().parse_args()
+    collect_channel(
+        args.channel,
+        args.output,
+        max_pages=args.max_pages,
+        languages=args.languages,
+    )
+
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
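Since `collect_channel` returns the records it writes to disk, the collector can also be driven from a short Python script instead of the CLI. A sketch under the same assumptions as the CLI path (a valid `YOUTUBE_API_KEY` plus the two optional libraries installed; the channel ID is a placeholder):

```python
# Sketch: run the collector programmatically and inspect the results.
# Needs YOUTUBE_API_KEY plus google-api-python-client and
# youtube-transcript-api, just like the CLI entry point.
from pathlib import Path

from python_app.transcript_collector import collect_channel

records = collect_channel(
    "UCxxxx",             # placeholder channel ID
    Path("data/raw"),
    max_pages=1,          # one page covers up to 50 uploads
    languages=["en"],     # omit to accept the library's default preference
)

with_transcripts = [r for r in records if r.transcript]
print(f"Collected {len(records)} videos, {len(with_transcripts)} with transcripts.")
```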