Initial commit
Commit fcdc6ecb9b

.gitignore (vendored, new file, 60 lines)
@@ -0,0 +1,60 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.venv/
venv/
ENV/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Environment variables
.env
.env.local

# Elasticsearch data
data/

# OS
.DS_Store
Thumbs.db

# Logs
*.log

# Testing
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/

README.md (new file, 87 lines)
@@ -0,0 +1,87 @@
# Python Search Toolkit (Rough Draft)

This minimal Python implementation covers three core needs:

1. **Collect transcripts** from YouTube channels.
2. **Ingest transcripts/metadata** into Elasticsearch.
3. **Expose a simple Flask search UI** that queries Elasticsearch directly.

The code lives alongside the existing C# stack so you can experiment without
touching production infrastructure.

## Setup

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r python_app/requirements.txt
```

Configure your environment as needed:

```bash
export ELASTIC_URL=http://localhost:9200
export ELASTIC_INDEX=this_little_corner_py
export ELASTIC_USERNAME=elastic              # optional
export ELASTIC_PASSWORD=secret               # optional
export ELASTIC_API_KEY=XXXX                  # optional alternative auth
export ELASTIC_CA_CERT=/path/to/ca.pem       # optional, for self-signed TLS
export ELASTIC_VERIFY_CERTS=1                # set to 0 to skip verification (dev only)
export ELASTIC_DEBUG=0                       # set to 1 for verbose request/response logging
export LOCAL_DATA_DIR=./data/video_metadata  # defaults to this
export YOUTUBE_API_KEY=AIza...               # required for live collection
```

## 1. Collect Transcripts

```bash
python -m python_app.transcript_collector \
    --channel UCxxxx \
    --output data/raw \
    --max-pages 2
```

Each video becomes a JSON file containing metadata plus transcript segments
(`TranscriptSegment`). Downloads require both `google-api-python-client` and
`youtube-transcript-api`, as well as a valid `YOUTUBE_API_KEY`.

> Already have cached JSON? You can skip this step and move straight to ingesting.
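
For orientation, each collected file looks roughly like the sketch below. The field names mirror what `ingest.py` reads when building bulk actions; the values are invented:

```python
# Hypothetical shape of one collected per-video JSON file (values invented);
# ingest.py reads exactly these fields when building bulk actions.
sample_document = {
    "video_id": "abc123xyz",
    "channel_id": "UCxxxx",
    "channel_name": "Example Channel",
    "title": "Example Video",
    "description": "An example description.",
    "date": "2023-01-01T00:00:00Z",
    "url": "https://www.youtube.com/watch?v=abc123xyz",
    "duration": 1234.0,
    "transcript_parts": [
        {"start": 0.0, "duration": 4.2, "text": "hello and welcome"},
    ],
}
```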

## 2. Ingest Into Elasticsearch

```bash
python -m python_app.ingest \
    --source data/video_metadata \
    --index this_little_corner_py
```

The script walks the source directory, builds `bulk` requests, and creates the
index with a lightweight mapping when needed. Authentication is handled via
`ELASTIC_USERNAME` / `ELASTIC_PASSWORD` if set.
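
To sanity-check a run, a quick count against the target index works (a minimal sketch using the `elasticsearch` client already listed in `requirements.txt`):

```python
# Count documents in the target index after an ingest run.
from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")
print(client.count(index="this_little_corner_py")["count"])
```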

## 3. Serve the Search Frontend

```bash
python -m python_app.search_app
```

Visit <http://localhost:8080/> and you’ll see a barebones UI that:

- Lists channels via a terms aggregation.
- Queries titles/descriptions/transcripts with toggleable exact, fuzzy, and phrase clauses plus optional date sorting.
- Surfaces transcript highlights.
- Lets you pull the full transcript for any result on demand.
- Shows a stacked-by-channel timeline for each search query (with `/frequency` offering a standalone explorer) powered by D3.js.
- Supports a query-string mode toggle so you can write advanced Lucene queries (e.g. `meaning OR purpose`, `meaning~2` for fuzzy matches, `title:(meaning crisis)`), while the default toggles stay AND-backed.
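
The same endpoints back the UI, so you can script against them directly. A stdlib-only sketch (parameter names as defined in `search_app.py`):

```python
# Query the search API directly and print basic result info.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

params = urlencode({"q": "meaning crisis", "size": 5, "sort": "relevant"})
with urlopen(f"http://localhost:8080/api/search?{params}") as resp:
    results = json.load(resp)
print(results["totalResults"], "results")
for item in results["items"]:
    print(item["date"], item["title"])
```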

## Integration Notes

- All modules share configuration through `python_app.config.CONFIG`, so you can
  fine-tune paths or credentials centrally.
- The ingest flow reuses the existing JSON schema from `data/video_metadata`, so no
  re-download is necessary if you already have the dumps.
- Everything is intentionally simple (no Celery, task queues, or custom auth) to
  keep the draft approachable and easy to extend.

Feel free to expand on this scaffold—add proper logging, schedule transcript
updates, or flesh out the UI—once you’re happy with the baseline behaviour.

__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
"""
Minimal Python toolkit for collecting YouTube transcripts, ingesting them into
Elasticsearch, and serving a lightweight search API/front-end.

Modules:
    config: shared configuration helpers (Elastic endpoint, data paths, etc.).
    transcript_collector: fetches channel metadata and transcripts.
    ingest: pushes transcript JSON into Elasticsearch.
    search_app: Flask app exposing simple search and transcript endpoints.
"""

config.py (new file, 81 lines)
@@ -0,0 +1,81 @@
"""
Centralised configuration helpers for the Python search toolkit.

Environment Variables:
    ELASTIC_URL: Base URL to the Elasticsearch node (default: http://localhost:9200).
    ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials.
    ELASTIC_INDEX: Target index name (default: this_little_corner_py).
    LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata).
    YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube.
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass(frozen=True)
class ElasticSettings:
    url: str
    username: Optional[str]
    password: Optional[str]
    index: str
    ca_cert: Optional[Path]
    verify_certs: bool
    api_key: Optional[str]
    debug: bool


@dataclass(frozen=True)
class DataSettings:
    root: Path


@dataclass(frozen=True)
class YoutubeSettings:
    api_key: Optional[str]


@dataclass(frozen=True)
class AppConfig:
    elastic: ElasticSettings
    data: DataSettings
    youtube: YoutubeSettings


def _env(name: str, default: Optional[str] = None) -> Optional[str]:
    """Return an environment variable value with optional default."""
    value = os.environ.get(name)
    if value is None:
        return default
    stripped = value.strip()
    return stripped or default


def load_config() -> AppConfig:
    """Collect configuration from environment variables."""
    elastic = ElasticSettings(
        url=_env("ELASTIC_URL", "http://localhost:9200"),
        username=_env("ELASTIC_USERNAME"),
        password=_env("ELASTIC_PASSWORD"),
        index=_env("ELASTIC_INDEX", "this_little_corner_py"),
        ca_cert=Path(_env("ELASTIC_CA_CERT")).expanduser() if _env("ELASTIC_CA_CERT") else None,
        verify_certs=_env("ELASTIC_VERIFY_CERTS", "1") not in {"0", "false", "False"},
        api_key=_env("ELASTIC_API_KEY"),
        debug=_env("ELASTIC_DEBUG", "0") in {"1", "true", "True"},
    )
    data_root = Path(
        _env(
            "LOCAL_DATA_DIR",
            Path(__file__).resolve().parents[1] / "data" / "video_metadata",
        )
    )
    data = DataSettings(root=data_root)
    youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY"))
    return AppConfig(elastic=elastic, data=data, youtube=youtube)


CONFIG = load_config()
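
A quick smoke test of the module above (a sketch; run from the repository root so the package is importable as `python_app`):

```python
# Smoke-test the shared configuration; values come from the environment.
from python_app.config import CONFIG

print(CONFIG.elastic.url, CONFIG.elastic.index)
print(CONFIG.data.root)
```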

ingest.py (new file, 193 lines)
@@ -0,0 +1,193 @@
"""
Utilities for indexing transcript JSON documents into Elasticsearch.

Usage:
    python -m python_app.ingest --source data/video_metadata --index corner
"""

from __future__ import annotations

import argparse
import json
import logging
from pathlib import Path
from typing import Dict, Iterable, Iterator, Optional

from .config import CONFIG, AppConfig

try:
    from elasticsearch import Elasticsearch, helpers  # type: ignore
except ImportError:  # pragma: no cover - dependency optional
    Elasticsearch = None
    helpers = None


LOGGER = logging.getLogger(__name__)


def _ensure_client(config: AppConfig) -> "Elasticsearch":
    if Elasticsearch is None:
        raise RuntimeError(
            "elasticsearch package not installed. "
            "Install elasticsearch>=7 to index documents."
        )
    kwargs = {}
    if config.elastic.api_key:
        kwargs["api_key"] = config.elastic.api_key
    elif config.elastic.username and config.elastic.password:
        kwargs["basic_auth"] = (
            config.elastic.username,
            config.elastic.password,
        )
    if config.elastic.ca_cert:
        kwargs["ca_certs"] = str(config.elastic.ca_cert)
    kwargs["verify_certs"] = config.elastic.verify_certs
    return Elasticsearch(config.elastic.url, **kwargs)


def iter_json_documents(data_root: Path) -> Iterator[Dict]:
    """Yield JSON objects from the provided directory tree."""
    if not data_root.exists():
        raise FileNotFoundError(f"{data_root} does not exist")
    for path in sorted(data_root.rglob("*.json")):
        try:
            with path.open("r", encoding="utf-8") as handle:
                doc = json.load(handle)
            doc.setdefault("video_id", path.stem)
            yield doc
        except Exception as exc:
            LOGGER.warning("Skipping %s: %s", path, exc)


def build_bulk_actions(
    docs: Iterable[Dict], *, index: Optional[str] = None
) -> Iterator[Dict]:
    """Translate raw JSON dictionaries into Elasticsearch bulk actions."""
    for doc in docs:
        video_id = doc.get("video_id")
        if not video_id:
            continue
        parts = doc.get("transcript_parts") or doc.get("transcript") or []
        transcript_full = doc.get("transcript_full")
        if not transcript_full and isinstance(parts, list):
            transcript_full = " ".join(
                segment.get("text", "") if isinstance(segment, dict) else str(segment)
                for segment in parts
            ).strip()
        yield {
            "_id": video_id,
            "_index": index or CONFIG.elastic.index,
            "_op_type": "index",
            "_source": {
                "video_id": video_id,
                "channel_id": doc.get("channel_id"),
                "channel_name": doc.get("channel_name"),
                "title": doc.get("title"),
                "description": doc.get("description"),
                "date": doc.get("date") or doc.get("published_at"),
                "url": doc.get("url"),
                "duration": doc.get("duration"),
                "transcript_full": transcript_full,
                "transcript_secondary_full": doc.get("transcript_secondary_full"),
                "transcript_parts": parts,
            },
        }


def ensure_index(client: "Elasticsearch", index: str) -> None:
    """Create the target index with a minimal mapping if it is missing."""
    if client.indices.exists(index=index):
        return
    LOGGER.info("Creating index %s", index)
    client.indices.create(
        index=index,
        mappings={
            "properties": {
                "video_id": {"type": "keyword"},
                "channel_id": {"type": "keyword"},
                "channel_name": {"type": "keyword"},
                "title": {"type": "text"},
                "description": {"type": "text"},
                "date": {"type": "date", "format": "strict_date_optional_time"},
                "url": {"type": "keyword"},
                "duration": {"type": "float"},
                "transcript_full": {"type": "text"},
                "transcript_secondary_full": {"type": "text"},
                "transcript_parts": {
                    "type": "nested",
                    "properties": {
                        "start": {"type": "float"},
                        "duration": {"type": "float"},
                        "text": {"type": "text"},
                    },
                },
            }
        },
    )


def ingest_directory(
    data_root: Path,
    *,
    config: AppConfig = CONFIG,
    index: Optional[str] = None,
    batch_size: int = 500,
    request_timeout: int = 120,
) -> None:
    """Bulk index every JSON file in the directory tree."""
    client = _ensure_client(config)
    target_index = index or config.elastic.index
    ensure_index(client, target_index)
    docs = iter_json_documents(data_root)
    actions = build_bulk_actions(docs, index=target_index)
    bulk_client = client.options(request_timeout=request_timeout)
    helpers.bulk(
        bulk_client,
        actions,
        chunk_size=batch_size,
    )
    LOGGER.info("Ingestion complete for %s", target_index)


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Ingest transcript JSON files into Elasticsearch."
    )
    parser.add_argument(
        "--source",
        type=Path,
        default=CONFIG.data.root,
        help="Directory containing per-video JSON files.",
    )
    parser.add_argument(
        "--index",
        help="Override the Elasticsearch index name.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=500,
        help="Bulk ingest batch size.",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Request timeout (seconds) for bulk operations.",
    )
    return parser


def main() -> None:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = _build_parser().parse_args()
    ingest_directory(
        args.source,
        index=args.index,
        batch_size=args.batch_size,
        request_timeout=args.timeout,
    )


if __name__ == "__main__":  # pragma: no cover
    main()
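
Besides the CLI entry point, `ingest_directory` can be driven programmatically (a sketch using the defaults from `CONFIG`; the path is an example):

```python
# Programmatically ingest a directory tree of per-video JSON files.
from pathlib import Path

from python_app.ingest import ingest_directory

ingest_directory(Path("data/video_metadata"), batch_size=250)
```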

requirements.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
Flask>=2.3
elasticsearch>=7.0.0,<9.0.0
youtube-transcript-api>=0.6
google-api-python-client>=2.0.0

search_app.py (new file, 910 lines)
@@ -0,0 +1,910 @@
"""
Flask application exposing a minimal search API backed by Elasticsearch.

Routes:
    GET /                -> Static HTML search page.
    GET /api/channels    -> List available channels (via terms aggregation).
    GET /api/search      -> Search index with pagination and simple highlighting.
    GET /api/transcript  -> Return full transcript for a given video_id.
"""

from __future__ import annotations

import copy
import json
import logging
import re
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set

from collections import Counter
from datetime import datetime

from flask import Flask, jsonify, request, send_from_directory

from .config import CONFIG, AppConfig

try:
    from elasticsearch import Elasticsearch  # type: ignore
    from elasticsearch import BadRequestError  # type: ignore
except ImportError:  # pragma: no cover - dependency optional
    Elasticsearch = None
    BadRequestError = Exception  # type: ignore

LOGGER = logging.getLogger(__name__)


def _ensure_client(config: AppConfig) -> "Elasticsearch":
    if Elasticsearch is None:
        raise RuntimeError(
            "elasticsearch package not installed. "
            "Install elasticsearch>=7 to run the Flask search app."
        )
    kwargs = {}
    if config.elastic.api_key:
        kwargs["api_key"] = config.elastic.api_key
    elif config.elastic.username and config.elastic.password:
        kwargs["basic_auth"] = (
            config.elastic.username,
            config.elastic.password,
        )
    if config.elastic.ca_cert:
        kwargs["ca_certs"] = str(config.elastic.ca_cert)
    kwargs["verify_certs"] = config.elastic.verify_certs
    return Elasticsearch(config.elastic.url, **kwargs)


def metrics_payload(data_root: Path) -> Dict[str, Any]:
    total_items = 0
    channel_counter: Counter = Counter()
    channel_name_map: Dict[str, str] = {}
    year_counter: Counter = Counter()
    month_counter: Counter = Counter()

    if not data_root.exists():
        LOGGER.warning("Data directory %s not found; metrics will be empty.", data_root)
        return {
            "totalItems": 0,
            "totalChannels": 0,
            "itemsPerChannel": [],
            "yearHistogram": [],
            "recentMonths": [],
        }

    for path in data_root.rglob("*.json"):
        try:
            with path.open("r", encoding="utf-8") as handle:
                doc = json.load(handle)
        except Exception:
            continue

        total_items += 1

        channel_id = doc.get("channel_id")
        channel_name = doc.get("channel_name") or channel_id
        if channel_id:
            channel_counter[channel_id] += 1
            if channel_name and channel_id not in channel_name_map:
                channel_name_map[channel_id] = channel_name

        date_value = doc.get("date") or doc.get("published_at")
        dt: Optional[datetime] = None
        if isinstance(date_value, str):
            # Try full-string parses first, then fall back to a date-only prefix
            # (format length does not match string length, so prefix slicing by
            # len(fmt) would never parse).
            for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
                try:
                    dt = datetime.strptime(date_value, fmt)
                    break
                except ValueError:
                    continue
            if dt is None:
                try:
                    dt = datetime.strptime(date_value[:10], "%Y-%m-%d")
                except ValueError:
                    dt = None
        elif isinstance(date_value, (int, float)):
            try:
                dt = datetime.fromtimestamp(date_value)
            except Exception:
                dt = None

        if dt:
            year_counter[str(dt.year)] += 1
            month_counter[dt.strftime("%Y-%m")] += 1

    items_per_channel = [
        {
            "label": channel_name_map.get(cid, cid),
            "count": count,
        }
        for cid, count in channel_counter.most_common()
    ]

    year_histogram = [
        {"bucket": year, "count": year_counter[year]}
        for year in sorted(year_counter.keys())
    ]

    recent_months = sorted(month_counter.keys())
    recent_months = recent_months[-12:]
    recent_months_payload = [
        {"bucket": month, "count": month_counter[month]} for month in recent_months
    ]

    return {
        "totalItems": total_items,
        "totalChannels": len(channel_counter),
        "itemsPerChannel": items_per_channel,
        "yearHistogram": year_histogram,
        "recentMonths": recent_months_payload,
    }


def elastic_metrics_payload(
    client: "Elasticsearch",
    index: str,
    *,
    channel_field_candidates: Optional[List[str]] = None,
    debug: bool = False,
) -> Dict[str, Any]:
    if channel_field_candidates is None:
        channel_field_candidates = ["channel_id.keyword", "channel_id"]

    base_body: Dict[str, Any] = {
        "size": 0,
        "track_total_hits": True,
        "aggs": {
            "channels": {
                "terms": {
                    "field": "channel_id.keyword",
                    "size": 500,
                    "order": {"_count": "desc"},
                },
                "aggs": {
                    "name": {
                        "top_hits": {
                            "size": 1,
                            "_source": {"includes": ["channel_name"]},
                        }
                    }
                },
            },
            "year_histogram": {
                "date_histogram": {
                    "field": "date",
                    "calendar_interval": "year",
                    "format": "yyyy",
                }
            },
            "month_histogram": {
                "date_histogram": {
                    "field": "date",
                    "calendar_interval": "month",
                    "format": "yyyy-MM",
                    "order": {"_key": "asc"},
                }
            },
        },
    }

    last_error: Optional[Exception] = None
    response: Optional[Dict[str, Any]] = None
    for candidate_field in channel_field_candidates:
        body = json.loads(json.dumps(base_body))
        body["aggs"]["channels"]["terms"]["field"] = candidate_field
        try:
            if debug:
                LOGGER.info(
                    "Elasticsearch metrics request: %s",
                    json.dumps({"index": index, "body": body}, indent=2),
                )
            response = client.search(index=index, body=body)
            break
        except BadRequestError as exc:
            last_error = exc
            if debug:
                LOGGER.warning(
                    "Metrics aggregation failed for field %s: %s",
                    candidate_field,
                    exc,
                )
    if response is None:
        raise last_error or RuntimeError("Unable to compute metrics from Elasticsearch.")

    hits = response.get("hits", {})
    total_items = hits.get("total", {}).get("value", 0)

    if debug:
        LOGGER.info(
            "Elasticsearch metrics response: %s",
            json.dumps(response, indent=2, default=str),
        )

    aggregations = response.get("aggregations", {})
    channel_buckets = aggregations.get("channels", {}).get("buckets", [])
    items_per_channel = []
    for bucket in channel_buckets:
        key = bucket.get("key")
        channel_name = key
        top_hits = (
            bucket.get("name", {})
            .get("hits", {})
            .get("hits", [])
        )
        if top_hits:
            channel_name = (
                top_hits[0]
                .get("_source", {})
                .get("channel_name", channel_name)
            )
        items_per_channel.append(
            {"label": channel_name or key, "count": bucket.get("doc_count", 0)}
        )

    year_buckets = aggregations.get("year_histogram", {}).get("buckets", [])
    year_histogram = [
        {
            "bucket": bucket.get("key_as_string") or str(bucket.get("key")),
            "count": bucket.get("doc_count", 0),
        }
        for bucket in year_buckets
    ]

    month_buckets = aggregations.get("month_histogram", {}).get("buckets", [])
    recent_months_entries = [
        {
            "bucket": bucket.get("key_as_string") or str(bucket.get("key")),
            "count": bucket.get("doc_count", 0),
            "_key": bucket.get("key"),
        }
        for bucket in month_buckets
    ]
    recent_months_entries.sort(key=lambda item: item.get("_key", 0))
    recent_months_payload = [
        {"bucket": entry["bucket"], "count": entry["count"]}
        for entry in recent_months_entries[-12:]
    ]

    return {
        "totalItems": total_items,
        "totalChannels": len(items_per_channel),
        "itemsPerChannel": items_per_channel,
        "yearHistogram": year_histogram,
        "recentMonths": recent_months_payload,
    }


def parse_channel_params(values: Iterable[Optional[str]]) -> List[str]:
    seen: Set[str] = set()
    channels: List[str] = []
    for value in values:
        if not value:
            continue
        for part in str(value).split(","):
            cleaned = part.strip()
            if not cleaned or cleaned.lower() == "all":
                continue
            if cleaned not in seen:
                seen.add(cleaned)
                channels.append(cleaned)
    return channels


def build_channel_filter(channels: Optional[Sequence[str]]) -> Optional[Dict]:
    if not channels:
        return None
    per_channel_clauses: List[Dict[str, Any]] = []
    for value in channels:
        if not value:
            continue
        per_channel_clauses.append(
            {
                "bool": {
                    "should": [
                        {"term": {"channel_id.keyword": value}},
                        {"term": {"channel_id": value}},
                    ],
                    "minimum_should_match": 1,
                }
            }
        )
    if not per_channel_clauses:
        return None
    if len(per_channel_clauses) == 1:
        return per_channel_clauses[0]
    return {
        "bool": {
            "should": per_channel_clauses,
            "minimum_should_match": 1,
        }
    }


def build_query_payload(
    query: str,
    *,
    channels: Optional[Sequence[str]] = None,
    sort: str = "relevant",
    use_exact: bool = True,
    use_fuzzy: bool = True,
    use_phrase: bool = True,
    use_query_string: bool = False,
) -> Dict:
    filters: List[Dict] = []
    should: List[Dict] = []

    channel_filter = build_channel_filter(channels)
    if channel_filter:
        filters.append(channel_filter)

    if use_query_string:
        base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
        qs_query = (query or "").strip() or "*"
        query_body: Dict[str, Any] = {
            "query_string": {
                "query": qs_query,
                "default_operator": "AND",
                "fields": base_fields,
            }
        }
        if filters:
            query_body = {"bool": {"must": query_body, "filter": filters}}
        body: Dict = {
            "query": query_body,
            "highlight": {
                "fields": {
                    "transcript_full": {
                        "fragment_size": 160,
                        "number_of_fragments": 5,
                        "fragmenter": "span",
                    },
                    "transcript_secondary_full": {
                        "fragment_size": 160,
                        "number_of_fragments": 5,
                        "fragmenter": "span",
                    },
                    "title": {"number_of_fragments": 0},
                    "description": {
                        "fragment_size": 160,
                        "number_of_fragments": 1,
                    },
                },
                "require_field_match": False,
                "pre_tags": ["<mark>"],
                "post_tags": ["</mark>"],
                "encoder": "html",
                "max_analyzed_offset": 900000,
            },
        }
        if sort == "newer":
            body["sort"] = [{"date": {"order": "desc"}}]
        elif sort == "older":
            body["sort"] = [{"date": {"order": "asc"}}]
        return body

    if query:
        base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
        if use_phrase:
            should.append(
                {
                    "match_phrase": {
                        "transcript_full": {
                            "query": query,
                            "slop": 2,
                            "boost": 10.0,
                        }
                    }
                }
            )
            should.append(
                {
                    "match_phrase": {
                        "transcript_secondary_full": {
                            "query": query,
                            "slop": 2,
                            "boost": 10.0,
                        }
                    }
                }
            )
        if use_fuzzy:
            should.append(
                {
                    "multi_match": {
                        "query": query,
                        "fields": base_fields,
                        "type": "best_fields",
                        "operator": "and",
                        "fuzziness": "AUTO",
                        "prefix_length": 1,
                        "max_expansions": 50,
                        "boost": 1.5,
                    }
                }
            )
        if use_exact:
            should.append(
                {
                    "multi_match": {
                        "query": query,
                        "fields": base_fields,
                        "type": "best_fields",
                        "operator": "and",
                        "boost": 3.0,
                    }
                }
            )

    if should:
        query_body: Dict = {
            "bool": {
                "should": should,
                "minimum_should_match": 1,
            }
        }
        if filters:
            query_body["bool"]["filter"] = filters
    elif filters:
        query_body = {"bool": {"filter": filters}}
    else:
        query_body = {"match_all": {}}

    body: Dict = {
        "query": query_body,
        "highlight": {
            "fields": {
                "transcript_full": {
                    "fragment_size": 160,
                    "number_of_fragments": 5,
                    "fragmenter": "span",
                },
                "transcript_secondary_full": {
                    "fragment_size": 160,
                    "number_of_fragments": 5,
                    "fragmenter": "span",
                },
                "title": {"number_of_fragments": 0},
                "description": {
                    "fragment_size": 160,
                    "number_of_fragments": 1,
                },
            },
            "require_field_match": False,
            "pre_tags": ["<mark>"],
            "post_tags": ["</mark>"],
            "encoder": "html",
            "max_analyzed_offset": 900000,
        },
    }
    if query_body.get("match_all") is None:
        body["highlight"]["highlight_query"] = copy.deepcopy(query_body)

    if sort == "newer":
        body["sort"] = [{"date": {"order": "desc"}}]
    elif sort == "older":
        body["sort"] = [{"date": {"order": "asc"}}]
    return body


def create_app(config: AppConfig = CONFIG) -> Flask:
    app = Flask(__name__, static_folder=str(Path(__file__).parent / "static"))
    client = _ensure_client(config)
    index = config.elastic.index

    @app.route("/")
    def index_page():
        return send_from_directory(app.static_folder, "index.html")

    @app.route("/static/<path:filename>")
    def static_files(filename: str):
        return send_from_directory(app.static_folder, filename)

    @app.route("/api/channels")
    def channels():
        base_channels_body = {
            "size": 0,
            "aggs": {
                "channels": {
                    "terms": {"field": "channel_id", "size": 200},
                    "aggs": {
                        "name": {
                            "top_hits": {
                                "size": 1,
                                "_source": {"includes": ["channel_name"]},
                            }
                        }
                    },
                }
            },
        }

        def run_channels_request(field_name: str):
            body = json.loads(json.dumps(base_channels_body))  # deep copy
            body["aggs"]["channels"]["terms"]["field"] = field_name
            if config.elastic.debug:
                LOGGER.info(
                    "Elasticsearch channels request: %s",
                    json.dumps({"index": index, "body": body}, indent=2),
                )
            return client.search(index=index, body=body)

        response = None
        last_error = None
        for candidate_field in ("channel_id.keyword", "channel_id"):
            try:
                response = run_channels_request(candidate_field)
                if config.elastic.debug:
                    LOGGER.info("Channels aggregation used field: %s", candidate_field)
                break
            except BadRequestError as exc:
                last_error = exc
                if config.elastic.debug:
                    LOGGER.warning(
                        "Channels aggregation failed for field %s: %s",
                        candidate_field,
                        exc,
                    )
        if response is None:
            raise last_error or RuntimeError("Unable to aggregate channels.")

        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch channels response: %s",
                json.dumps(response, indent=2, default=str),
            )
        buckets = (
            response.get("aggregations", {})
            .get("channels", {})
            .get("buckets", [])
        )
        data = [
            {
                "Id": bucket.get("key"),
                "Name": (
                    bucket.get("name", {})
                    .get("hits", {})
                    .get("hits", [{}])[0]
                    .get("_source", {})
                    .get("channel_name", bucket.get("key"))
                ),
                "Count": bucket.get("doc_count", 0),
            }
            for bucket in buckets
        ]
        data.sort(key=lambda item: item["Name"].lower())
        return jsonify(data)

    @app.route("/api/search")
    def search():
        query = request.args.get("q", "", type=str)
        raw_channels: List[Optional[str]] = request.args.getlist("channel_id")
        legacy_channel = request.args.get("channel", type=str)
        if legacy_channel:
            raw_channels.append(legacy_channel)
        channels = parse_channel_params(raw_channels)
        sort = request.args.get("sort", "relevant", type=str)
        page = max(request.args.get("page", 0, type=int), 0)
        size = max(request.args.get("size", 10, type=int), 1)

        def parse_flag(name: str, default: bool = True) -> bool:
            value = request.args.get(name)
            if value is None:
                return default
            return value.lower() not in {"0", "false", "no"}

        use_exact = parse_flag("exact", True)
        use_fuzzy = parse_flag("fuzzy", True)
        use_phrase = parse_flag("phrase", True)
        use_query_string = parse_flag("query_string", False)
        if use_query_string:
            use_exact = use_fuzzy = use_phrase = False

        payload = build_query_payload(
            query,
            channels=channels,
            sort=sort,
            use_exact=use_exact,
            use_fuzzy=use_fuzzy,
            use_phrase=use_phrase,
            use_query_string=use_query_string,
        )
        start = page * size
        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch search request: %s",
                json.dumps(
                    {
                        "index": index,
                        "from": start,
                        "size": size,
                        "body": payload,
                        "channels": channels,
                        "toggles": {
                            "exact": use_exact,
                            "fuzzy": use_fuzzy,
                            "phrase": use_phrase,
                        },
                    },
                    indent=2,
                ),
            )
        response = client.search(
            index=index,
            from_=start,
            size=size,
            body=payload,
        )
        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch search response: %s",
                json.dumps(response, indent=2, default=str),
            )

        hits = response.get("hits", {})
        total = hits.get("total", {}).get("value", 0)
        documents = []
        for hit in hits.get("hits", []):
            source = hit.get("_source", {})
            highlight_map = hit.get("highlight", {})
            transcript_highlight = (
                (highlight_map.get("transcript_full", []) or [])
                + (highlight_map.get("transcript_secondary_full", []) or [])
            )

            title_html = (
                highlight_map.get("title")
                or [source.get("title") or "Untitled"]
            )[0]
            description_html = (
                highlight_map.get("description")
                or [source.get("description") or ""]
            )[0]
            documents.append(
                {
                    "video_id": source.get("video_id"),
                    "channel_id": source.get("channel_id"),
                    "channel_name": source.get("channel_name"),
                    "title": source.get("title"),
                    "titleHtml": title_html,
                    "description": source.get("description"),
                    "descriptionHtml": description_html,
                    "date": source.get("date"),
                    "url": source.get("url"),
                    "toHighlight": transcript_highlight,
                    "highlightSource": {
                        "primary": bool(highlight_map.get("transcript_full")),
                        "secondary": bool(highlight_map.get("transcript_secondary_full")),
                    },
                }
            )

        return jsonify(
            {
                "items": documents,
                "totalResults": total,
                "totalPages": (total + size - 1) // size,
                "currentPage": page,
            }
        )

    @app.route("/api/metrics")
    def metrics():
        try:
            data = elastic_metrics_payload(
                client,
                index,
                channel_field_candidates=["channel_id.keyword", "channel_id"],
                debug=config.elastic.debug,
            )
        except Exception:
            LOGGER.exception(
                "Falling back to local metrics payload due to Elasticsearch error."
            )
            data = metrics_payload(config.data.root)
        return jsonify(data)

    @app.route("/api/frequency")
    def frequency():
        raw_term = request.args.get("term", type=str) or ""
        use_query_string = request.args.get("query_string", default="0", type=str)
        use_query_string = (use_query_string or "").lower() in {"1", "true", "yes"}
        term = raw_term.strip()
        if not term and not use_query_string:
            return ("term parameter is required", 400)
        if use_query_string and not term:
            term = "*"

        raw_channels: List[Optional[str]] = request.args.getlist("channel_id")
        legacy_channel = request.args.get("channel", type=str)
        if legacy_channel:
            raw_channels.append(legacy_channel)
        channels = parse_channel_params(raw_channels)
        interval = (request.args.get("interval", "month") or "month").lower()
        allowed_intervals = {"day", "week", "month", "quarter", "year"}
        if interval not in allowed_intervals:
            interval = "month"
        start = request.args.get("start", type=str)
        end = request.args.get("end", type=str)

        filters: List[Dict] = []
        channel_filter = build_channel_filter(channels)
        if channel_filter:
            filters.append(channel_filter)
        if start or end:
            range_filter: Dict[str, Dict[str, Dict[str, str]]] = {"range": {"date": {}}}
            if start:
                range_filter["range"]["date"]["gte"] = start
            if end:
                range_filter["range"]["date"]["lte"] = end
            filters.append(range_filter)

        base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
        if use_query_string:
            qs_query = term or "*"
            must_clause: List[Dict[str, Any]] = [
                {
                    "query_string": {
                        "query": qs_query,
                        "default_operator": "AND",
                        "fields": base_fields,
                    }
                }
            ]
        else:
            must_clause = [
                {
                    "multi_match": {
                        "query": term,
                        "fields": base_fields,
                        "type": "best_fields",
                        "operator": "and",
                    }
                }
            ]

        query: Dict[str, Any] = {"bool": {"must": must_clause}}
        if filters:
            query["bool"]["filter"] = filters

        histogram: Dict[str, Any] = {
            "field": "date",
            "calendar_interval": interval,
            "min_doc_count": 0,
        }
        if start or end:
            bounds: Dict[str, str] = {}
            if start:
                bounds["min"] = start
            if end:
                bounds["max"] = end
            if bounds:
                histogram["extended_bounds"] = bounds

        channel_terms_size = max(6, len(channels)) if channels else 6

        body = {
            "size": 0,
            "query": query,
            "aggs": {
                "over_time": {
                    "date_histogram": histogram,
                    "aggs": {
                        "by_channel": {
                            "terms": {
                                "field": "channel_id.keyword",
                                "size": channel_terms_size,
                                "order": {"_count": "desc"},
                            }
                        }
                    },
                }
            },
        }

        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch frequency request: %s",
                json.dumps(
                    {
                        "index": index,
                        "body": body,
                        "term": term,
                        "interval": interval,
                        "channels": channels,
                        "start": start,
                        "end": end,
                        "query_string": use_query_string,
                    },
                    indent=2,
                ),
            )

        response = client.search(index=index, body=body)

        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch frequency response: %s",
                json.dumps(response, indent=2, default=str),
            )

        raw_buckets = (
            response.get("aggregations", {})
            .get("over_time", {})
            .get("buckets", [])
        )

        channel_totals: Dict[str, int] = {}
        buckets: List[Dict[str, Any]] = []
        for bucket in raw_buckets:
            date_str = bucket.get("key_as_string")
            total = bucket.get("doc_count", 0)
            channel_entries: List[Dict[str, Any]] = []
            for ch_bucket in bucket.get("by_channel", {}).get("buckets", []):
                cid = ch_bucket.get("key")
                count = ch_bucket.get("doc_count", 0)
                if cid:
                    channel_entries.append({"id": cid, "count": count})
                    channel_totals[cid] = channel_totals.get(cid, 0) + count
            buckets.append(
                {"date": date_str, "total": total, "channels": channel_entries}
            )

        ranked_channels = sorted(
            [{"id": cid, "total": total} for cid, total in channel_totals.items()],
            key=lambda item: item["total"],
            reverse=True,
        )

        payload = {
            "term": raw_term if not use_query_string else term,
            "interval": interval,
            "buckets": buckets,
            "channels": ranked_channels,
            "totalResults": response.get("hits", {})
            .get("total", {})
            .get("value", 0),
        }
        return jsonify(payload)

    @app.route("/frequency")
    def frequency_page():
        return send_from_directory(app.static_folder, "frequency.html")

    @app.route("/api/transcript")
    def transcript():
        video_id = request.args.get("video_id", type=str)
        if not video_id:
            return ("video_id not set", 400)
        response = client.get(index=index, id=video_id, ignore=[404])
        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch transcript request: index=%s id=%s", index, video_id
            )
            LOGGER.info(
                "Elasticsearch transcript response: %s",
                json.dumps(response, indent=2, default=str)
                if response
                else "None",
            )
        if not response or not response.get("found"):
            return ("not found", 404)
        source = response["_source"]
        return jsonify(
            {
                "video_id": source.get("video_id"),
                "title": source.get("title"),
                "transcript_parts": source.get("transcript_parts", []),
                "transcript_full": source.get("transcript_full"),
                "transcript_secondary_parts": source.get("transcript_secondary_parts", []),
                "transcript_secondary_full": source.get("transcript_secondary_full"),
            }
        )

    return app


def main() -> None:  # pragma: no cover
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    app = create_app()
    app.run(host="0.0.0.0", port=8080, debug=True)


if __name__ == "__main__":  # pragma: no cover
    main()
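
For quick checks without a browser, Flask's test client exercises the routes above in-process (a sketch; the request itself still needs a reachable Elasticsearch node):

```python
# Hit /api/search in-process via Flask's test client.
from python_app.search_app import create_app

app = create_app()
with app.test_client() as http:
    resp = http.get("/api/search?q=meaning&size=3")
    print(resp.status_code, resp.get_json()["totalResults"])
```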

static/app.js (new file, 733 lines)
@@ -0,0 +1,733 @@
|
|||||||
|
(() => {
|
||||||
|
let qs = new URLSearchParams(window.location.search);
|
||||||
|
const qInput = document.getElementById("q");
|
||||||
|
const channelDropdown = document.getElementById("channelDropdown");
|
||||||
|
const channelSummary = document.getElementById("channelSummary");
|
||||||
|
const channelOptions = document.getElementById("channelOptions");
|
||||||
|
const sortSel = document.getElementById("sort");
|
||||||
|
const sizeSel = document.getElementById("size");
|
||||||
|
const exactToggle = document.getElementById("exactToggle");
|
||||||
|
const fuzzyToggle = document.getElementById("fuzzyToggle");
|
||||||
|
const phraseToggle = document.getElementById("phraseToggle");
|
||||||
|
const queryToggle = document.getElementById("queryStringToggle");
|
||||||
|
const searchBtn = document.getElementById("searchBtn");
|
||||||
|
const resultsDiv = document.getElementById("results");
|
||||||
|
const metaDiv = document.getElementById("meta");
|
||||||
|
const metricsContainer = document.getElementById("metrics");
|
||||||
|
const metricsStatus = document.getElementById("metricsStatus");
|
||||||
|
const metricsContent = document.getElementById("metricsContent");
|
||||||
|
const freqSummary = document.getElementById("frequencySummary");
|
||||||
|
const freqChart = document.getElementById("frequencyChart");
|
||||||
|
const channelMap = new Map();
|
||||||
|
const selectedChannels = new Set();
|
||||||
|
let pendingChannelSelection = [];
|
||||||
|
let channelsReady = false;
|
||||||
|
let suppressChannelChange = false;
|
||||||
|
let allChannelsCheckbox = null;
|
||||||
|
let previousToggleState = { exact: true, fuzzy: true, phrase: true };
|
||||||
|
let currentPage =
|
||||||
|
parseInt(qs.get("page") || "0", 10) ||
|
||||||
|
0;
|
||||||
|
|
||||||
|
function parseBoolParam(name, defaultValue) {
|
||||||
|
const raw = qs.get(name);
|
||||||
|
if (raw === null) return defaultValue;
|
||||||
|
const lowered = raw.toLowerCase();
|
||||||
|
return !["0", "false", "no"].includes(lowered);
|
||||||
|
}
|
||||||
|
|
||||||
|
function parseChannelParams(params) {
|
||||||
|
const collected = [];
|
||||||
|
if (!params) return collected;
|
||||||
|
const seen = new Set();
|
||||||
|
const rawValues = params.getAll("channel_id");
|
||||||
|
const legacy = params.get("channel");
|
||||||
|
if (legacy) rawValues.push(legacy);
|
||||||
|
rawValues.forEach((value) => {
|
||||||
|
if (value == null) return;
|
||||||
|
String(value)
|
||||||
|
.split(",")
|
||||||
|
.map((part) => part.trim())
|
||||||
|
.filter((part) => part && part.toLowerCase() !== "all")
|
||||||
|
.forEach((part) => {
|
||||||
|
if (!seen.has(part)) {
|
||||||
|
seen.add(part);
|
||||||
|
collected.push(part);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
return collected;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getSelectedChannels() {
|
||||||
|
return Array.from(selectedChannels);
|
||||||
|
}
|
||||||
|
|
||||||
|
function ensureAllCheckboxState() {
|
||||||
|
if (allChannelsCheckbox) {
|
||||||
|
allChannelsCheckbox.checked = selectedChannels.size === 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateChannelSummary() {
|
||||||
|
if (!channelSummary) return;
|
||||||
|
if (!selectedChannels.size) {
|
||||||
|
channelSummary.textContent = "All Channels";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const names = Array.from(selectedChannels).map(
|
||||||
|
(id) => channelMap.get(id) || id
|
||||||
|
);
|
||||||
|
if (names.length > 1) {
|
||||||
|
names.sort((a, b) => a.localeCompare(b, undefined, { sensitivity: "base" }));
|
||||||
|
}
|
||||||
|
let label = names.slice(0, 3).join(", ");
|
||||||
|
if (names.length > 3) {
|
||||||
|
label += ` +${names.length - 3} more`;
|
||||||
|
}
|
||||||
|
channelSummary.textContent = label;
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyChannelSelection(ids, { silent = false } = {}) {
|
||||||
|
selectedChannels.clear();
|
||||||
|
ids.forEach((id) => selectedChannels.add(id));
|
||||||
|
pendingChannelSelection = getSelectedChannels();
|
||||||
|
ensureAllCheckboxState();
|
||||||
|
if (channelOptions) {
|
||||||
|
suppressChannelChange = true;
|
||||||
|
const checkboxes = channelOptions.querySelectorAll(
|
||||||
|
'input[type="checkbox"][data-channel="1"]'
|
||||||
|
);
|
||||||
|
checkboxes.forEach((checkbox) => {
|
||||||
|
checkbox.checked = selectedChannels.has(checkbox.value);
|
||||||
|
});
|
||||||
|
suppressChannelChange = false;
|
||||||
|
}
|
||||||
|
updateChannelSummary();
|
||||||
|
if (!silent && channelsReady) {
|
||||||
|
runSearch(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function setFromQuery() {
|
||||||
|
qInput.value = qs.get("q") || "";
|
||||||
|
sortSel.value = qs.get("sort") || "relevant";
|
||||||
|
sizeSel.value = qs.get("size") || "10";
|
||||||
|
pendingChannelSelection = parseChannelParams(qs);
|
||||||
|
applyChannelSelection(pendingChannelSelection, { silent: true });
|
||||||
|
exactToggle.checked = parseBoolParam("exact", true);
|
||||||
|
fuzzyToggle.checked = parseBoolParam("fuzzy", true);
|
||||||
|
phraseToggle.checked = parseBoolParam("phrase", true);
|
||||||
|
queryToggle.checked = parseBoolParam("query_string", false);
|
||||||
|
applyQueryMode();
|
||||||
|
rememberToggleState();
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyQueryMode() {
|
||||||
|
if (!queryToggle) return;
|
||||||
|
if (queryToggle.checked) {
|
||||||
|
if (!exactToggle.disabled) {
|
||||||
|
previousToggleState = {
|
||||||
|
exact: exactToggle.checked,
|
||||||
|
fuzzy: fuzzyToggle.checked,
|
||||||
|
phrase: phraseToggle.checked,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
exactToggle.checked = false;
|
||||||
|
fuzzyToggle.checked = false;
|
||||||
|
phraseToggle.checked = false;
|
||||||
|
exactToggle.disabled = true;
|
||||||
|
fuzzyToggle.disabled = true;
|
||||||
|
phraseToggle.disabled = true;
|
||||||
|
} else {
|
||||||
|
exactToggle.disabled = false;
|
||||||
|
fuzzyToggle.disabled = false;
|
||||||
|
phraseToggle.disabled = false;
|
||||||
|
exactToggle.checked = previousToggleState.exact;
|
||||||
|
fuzzyToggle.checked = previousToggleState.fuzzy;
|
||||||
|
phraseToggle.checked = previousToggleState.phrase;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function rememberToggleState() {
|
||||||
|
if (queryToggle && !queryToggle.checked) {
|
||||||
|
previousToggleState = {
|
||||||
|
exact: !!exactToggle.checked,
|
||||||
|
fuzzy: !!fuzzyToggle.checked,
|
||||||
|
phrase: !!phraseToggle.checked,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (channelOptions) {
|
||||||
|
channelOptions.addEventListener("change", (event) => {
|
||||||
|
const target = event.target;
|
||||||
|
if (!(target instanceof HTMLInputElement) || target.type !== "checkbox") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (suppressChannelChange) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (target.dataset.all === "1") {
|
||||||
|
if (!target.checked && !selectedChannels.size) {
|
||||||
|
suppressChannelChange = true;
|
||||||
|
target.checked = true;
|
||||||
|
suppressChannelChange = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (target.checked) {
|
||||||
|
selectedChannels.clear();
|
||||||
|
pendingChannelSelection = [];
|
||||||
|
suppressChannelChange = true;
|
||||||
|
const others = channelOptions.querySelectorAll(
|
||||||
|
'input[type="checkbox"][data-channel="1"]'
|
||||||
|
);
|
||||||
|
others.forEach((checkbox) => {
|
||||||
|
checkbox.checked = false;
|
||||||
|
});
|
||||||
|
suppressChannelChange = false;
|
||||||
|
ensureAllCheckboxState();
|
||||||
|
updateChannelSummary();
|
||||||
|
if (channelsReady) {
|
||||||
|
runSearch(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const id = target.value;
|
||||||
|
if (!id) return;
|
||||||
|
if (target.checked) {
|
||||||
|
selectedChannels.add(id);
|
||||||
|
} else {
|
||||||
|
selectedChannels.delete(id);
|
||||||
|
}
|
||||||
|
pendingChannelSelection = getSelectedChannels();
|
||||||
|
ensureAllCheckboxState();
|
||||||
|
updateChannelSummary();
|
||||||
|
if (channelsReady) {
|
||||||
|
runSearch(0);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
  async function loadChannels() {
    if (!channelOptions) {
      channelsReady = true;
      return;
    }
    try {
      const res = await fetch("/api/channels");
      const data = await res.json();
      channelMap.clear();
      channelOptions.innerHTML = "";

      const listFragment = document.createDocumentFragment();

      const allLabel = document.createElement("label");
      allLabel.className = "channel-option";
      allChannelsCheckbox = document.createElement("input");
      allChannelsCheckbox.type = "checkbox";
      allChannelsCheckbox.dataset.all = "1";
      allChannelsCheckbox.checked = selectedChannels.size === 0;
      const allText = document.createElement("span");
      allText.textContent = "All Channels";
      allLabel.appendChild(allChannelsCheckbox);
      allLabel.appendChild(allText);
      listFragment.appendChild(allLabel);

      data.forEach((item) => {
        const label = document.createElement("label");
        label.className = "channel-option";
        const checkbox = document.createElement("input");
        checkbox.type = "checkbox";
        checkbox.value = item.Id;
        checkbox.dataset.channel = "1";
        const text = document.createElement("span");
        text.textContent = `${item.Name} (${item.Count})`;
        label.appendChild(checkbox);
        label.appendChild(text);
        listFragment.appendChild(label);
        channelMap.set(item.Id, item.Name);
      });

      channelOptions.appendChild(listFragment);

      if (!data.length) {
        const empty = document.createElement("div");
        empty.textContent = "No channels available.";
        channelOptions.appendChild(empty);
      }

      const initialSelection = pendingChannelSelection.length
        ? pendingChannelSelection
        : Array.from(selectedChannels);
      applyChannelSelection(initialSelection, { silent: true });
      channelsReady = true;
      updateChannelSummary();
    } catch (err) {
      console.error("Failed to load channels", err);
      channelOptions.innerHTML = "<div>Failed to load channels.</div>";
      channelsReady = true;
      ensureAllCheckboxState();
      updateChannelSummary();
    }
  }
  function updateUrl(q, sort, channels, page, size, exact, fuzzy, phrase, queryMode) {
    const next = new URL(window.location.href);
    next.searchParams.set("q", q);
    next.searchParams.set("sort", sort);
    next.searchParams.delete("channel_id");
    next.searchParams.delete("channel");
    channels.forEach((id) => next.searchParams.append("channel_id", id));
    next.searchParams.set("page", page);
    next.searchParams.set("size", size);
    next.searchParams.set("exact", exact ? "1" : "0");
    next.searchParams.set("fuzzy", fuzzy ? "1" : "0");
    next.searchParams.set("phrase", phrase ? "1" : "0");
    next.searchParams.set("query_string", queryMode ? "1" : "0");
    history.pushState({}, "", next.toString());
  }

  function fmtDate(value) {
    try {
      return (value || "").split("T")[0];
    } catch {
      return value;
    }
  }

  function fmtNumber(n) {
    if (typeof n === "number") return n.toLocaleString();
    return n;
  }

  // Transcript viewer functionality removed.
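
  // Editor's note: updateUrl writes the same parameter names that runSearch
  // later sends to /api/search, so reload and back/forward navigation can
  // rebuild a query from the address bar alone. A sketch of the round-trip,
  // with illustrative values:
  //
  //   updateUrl("corner", "newer", ["UC123"], 0, 25, true, false, true, false);
  //   // -> ...?q=corner&sort=newer&channel_id=UC123&page=0&size=25
  //   //        &exact=1&fuzzy=0&phrase=1&query_string=0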
  function renderMetrics(data) {
    if (!metricsContent) return;
    metricsContent.innerHTML = "";
    if (!data) return;

    if (metricsStatus) {
      metricsStatus.textContent = "";
    }

    const summary = document.createElement("div");
    summary.innerHTML = `<strong>Entries:</strong> ${fmtNumber(data.totalItems)} • <strong>Channels:</strong> ${fmtNumber(data.totalChannels)}`;
    metricsContent.appendChild(summary);

    if (Array.isArray(data.itemsPerChannel) && data.itemsPerChannel.length) {
      const top = data.itemsPerChannel.slice(0, 5);
      const channelHeader = document.createElement("div");
      channelHeader.style.marginTop = "8px";
      channelHeader.innerHTML = "<strong>Top Channels</strong>";
      metricsContent.appendChild(channelHeader);

      const channelList = document.createElement("div");
      channelList.className = "muted";
      top.forEach((entry) => {
        const row = document.createElement("div");
        row.textContent = `${entry.label}: ${fmtNumber(entry.count)}`;
        channelList.appendChild(row);
      });
      metricsContent.appendChild(channelList);
    }
  }

  async function loadMetrics() {
    if (!metricsContainer) return;
    metricsContainer.dataset.loading = "1";
    if (!metricsContainer.dataset.loaded && metricsStatus) {
      metricsStatus.textContent = "Loading metrics…";
    }
    try {
      const res = await fetch("/api/metrics");
      const data = await res.json();
      renderMetrics(data);
      metricsContainer.dataset.loaded = "1";
    } catch (err) {
      console.error("Failed to load metrics", err);
      if (!metricsContainer.dataset.loaded && metricsStatus) {
        metricsStatus.textContent = "Metrics unavailable.";
      }
    } finally {
      delete metricsContainer.dataset.loading;
    }
  }
  function clearFrequency(message) {
    if (freqSummary) {
      freqSummary.textContent = message || "";
    }
    if (freqChart) {
      freqChart.innerHTML = "";
    }
  }

  function renderFrequencyChart(buckets, channelTotals) {
    if (!freqChart || typeof d3 === "undefined") {
      return;
    }
    freqChart.innerHTML = "";
    if (!buckets.length) {
      clearFrequency("No matches for this query.");
      return;
    }

    let channelsOrder =
      (channelTotals && channelTotals.length
        ? channelTotals.map((entry) => entry.id)
        : []) || [];
    if (!channelsOrder.length) {
      const unique = new Set();
      buckets.forEach((bucket) => {
        (bucket.channels || []).forEach((entry) => unique.add(entry.id));
      });
      channelsOrder = Array.from(unique);
    }
    channelsOrder = channelsOrder.slice(0, 6);
    if (!channelsOrder.length) {
      clearFrequency("No matches for this query.");
      return;
    }

    const dateKeyFormat = d3.timeFormat("%Y-%m-%d");
    const parsed = buckets
      .map((bucket) => {
        const parsedDate = d3.isoParse(bucket.date) || new Date(bucket.date);
        if (!(parsedDate instanceof Date) || Number.isNaN(parsedDate.valueOf())) {
          return null;
        }
        const counts = {};
        (bucket.channels || []).forEach((entry) => {
          if (channelsOrder.includes(entry.id)) {
            counts[entry.id] = entry.count || 0;
          }
        });
        return {
          date: parsedDate,
          dateKey: dateKeyFormat(parsedDate),
          counts,
        };
      })
      .filter(Boolean);

    if (!parsed.length) {
      clearFrequency("Timeline unavailable.");
      return;
    }

    const margin = { top: 12, right: 12, bottom: 52, left: 56 };
    const fullWidth = freqChart.clientWidth || 360;
    const fullHeight = 220;
    const width = fullWidth - margin.left - margin.right;
    const height = fullHeight - margin.top - margin.bottom;

    const svg = d3
      .select(freqChart)
      .append("svg")
      .attr("width", fullWidth)
      .attr("height", fullHeight);

    const g = svg
      .append("g")
      .attr("transform", `translate(${margin.left},${margin.top})`);

    const x = d3
      .scaleBand()
      .domain(parsed.map((entry) => entry.dateKey))
      .range([0, width])
      .padding(0.25);

    const yMax = d3.max(parsed, (entry) =>
      d3.sum(channelsOrder, (key) => entry.counts[key] || 0)
    );

    const y = d3
      .scaleLinear()
      .domain([0, yMax || 0])
      .nice()
      .range([height, 0]);

    const tickValues =
      parsed.length <= 6
        ? parsed.map((entry) => entry.dateKey)
        : parsed
            .filter((_, index, arr) => index % Math.ceil(arr.length / 6) === 0)
            .map((entry) => entry.dateKey);

    const xAxis = d3.axisBottom(x).tickValues(tickValues);
    const yAxis = d3.axisLeft(y).ticks(5);

    g.append("g")
      .attr("class", "axis")
      .attr("transform", `translate(0,${height})`)
      .call(xAxis)
      .selectAll("text")
      .attr("text-anchor", "end")
      .attr("transform", "rotate(-35)")
      .attr("dx", "-0.8em")
      .attr("dy", "0.15em");

    g.append("g").attr("class", "axis").call(yAxis);

    const stack = d3
      .stack()
      .keys(channelsOrder)
      .value((entry, key) => entry.counts[key] || 0);
    const stacked = stack(parsed);
    const color = d3.scaleOrdinal(channelsOrder, d3.schemeTableau10);

    const layers = g
      .selectAll(".freq-layer")
      .data(stacked)
      .enter()
      .append("g")
      .attr("class", "freq-layer")
      .attr("fill", (d) => color(d.key));

    layers
      .selectAll("rect")
      .data((d) => d)
      .enter()
      .append("rect")
      .attr("x", (d) => x(d.data.dateKey))
      .attr("width", x.bandwidth())
      .attr("y", (d) => y(d[1]))
      .attr("height", (d) => y(d[0]) - y(d[1]))
      .append("title")
      .text(function (d) {
        const group = this.parentNode ? this.parentNode.parentNode : null;
        const key = group ? d3.select(group).datum().key : undefined;
        const label = key ? channelMap.get(key) || key : key || "";
        return `${dateKeyFormat(d.data.date)}: ${d[1] - d[0]}${label ? " (" + label + ")" : ""}`;
      });

    const legend = document.createElement("div");
    legend.className = "freq-legend";
    channelsOrder.forEach((key) => {
      const item = document.createElement("div");
      item.className = "freq-legend-item";
      const swatch = document.createElement("span");
      swatch.className = "freq-legend-swatch";
      swatch.style.backgroundColor = color(key);
      const label = document.createElement("span");
      label.textContent = channelMap.get(key) || key;
      item.appendChild(swatch);
      item.appendChild(label);
      legend.appendChild(item);
    });
    freqChart.appendChild(legend);
  }
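
  // Editor's note: renderFrequencyChart expects buckets shaped roughly like
  //
  //   [{ date: "2021-06-01T00:00:00Z",
  //      channels: [{ id: "UC123", count: 4 }, ...] }, ...]
  //
  // and channelTotals as [{ id: "UC123", count: 120 }, ...], whose order
  // decides which six channels get stacked. This shape is inferred from the
  // property accesses above; the authoritative contract is the Flask
  // /api/frequency handler.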
  async function updateFrequencyChart(term, channels, queryMode) {
    if (!freqChart || typeof d3 === "undefined") {
      return;
    }
    let trimmed = term.trim();
    if (!trimmed) {
      if (queryMode) {
        trimmed = "*";
      } else {
        clearFrequency("Enter a query to see timeline.");
        return;
      }
    }

    const params = new URLSearchParams();
    params.set("term", trimmed);
    params.set("interval", "month");
    (channels || []).forEach((id) => params.append("channel_id", id));
    if (queryMode) {
      params.set("query_string", "1");
    }

    clearFrequency("Loading timeline…");
    try {
      const res = await fetch(`/api/frequency?${params.toString()}`);
      if (!res.ok) {
        throw new Error(`Request failed with status ${res.status}`);
      }
      const payload = await res.json();
      const total = payload.totalResults || 0;
      if (freqSummary) {
        if (total === 0) {
          freqSummary.textContent = "No matches for this query.";
        } else if (queryMode) {
          freqSummary.textContent = `Matches: ${total.toLocaleString()} • Interval: ${payload.interval || "month"} (query-string)`;
        } else {
          freqSummary.textContent = `Matches: ${total.toLocaleString()} • Interval: ${payload.interval || "month"}`;
        }
      }
      if (total === 0) {
        freqChart.innerHTML = "";
        return;
      }
      renderFrequencyChart(payload.buckets || [], payload.channels || []);
    } catch (err) {
      console.error(err);
      clearFrequency("Timeline unavailable.");
    }
  }
  function renderResults(payload, page) {
    resultsDiv.innerHTML = "";
    metaDiv.textContent = `Total: ${payload.totalResults} • Page ${
      page + 1
    } of ${payload.totalPages}`;

    (payload.items || []).forEach((item) => {
      const el = document.createElement("div");
      el.className = "item";
      const titleHtml =
        item.titleHtml || escapeHtml(item.title || "Untitled");
      const descriptionHtml =
        item.descriptionHtml || escapeHtml(item.description || "");

      const header = document.createElement("div");
      const badges = [];
      if (item.highlightSource && item.highlightSource.primary) badges.push("primary transcript");
      if (item.highlightSource && item.highlightSource.secondary) badges.push("secondary transcript");
      const badgeHtml = badges.length
        ? `<div class="badge-row">${badges
            .map((b) => `<span class="badge">${escapeHtml(b)}</span>`)
            .join("")}</div>`
        : "";
      header.innerHTML = `
        <strong>${titleHtml}</strong>
        <div class="muted">${escapeHtml(item.channel_name || "")} • ${fmtDate(
          item.date
        )}</div>
        <div class="muted"><a href="${item.url}" target="_blank" rel="noopener">Open on YouTube</a></div>
        ${badgeHtml}
      `;
      el.appendChild(header);

      if (descriptionHtml) {
        const desc = document.createElement("div");
        desc.className = "muted";
        desc.innerHTML = descriptionHtml;
        el.appendChild(desc);
      }

      if (Array.isArray(item.toHighlight) && item.toHighlight.length) {
        const highlights = document.createElement("div");
        highlights.className = "transcript highlight-list";
        item.toHighlight.forEach((entry) => {
          const html = typeof entry === "string" ? entry : entry?.html;
          if (!html) return;
          const row = document.createElement("div");
          row.className = "highlight-row";
          row.innerHTML = html;
          highlights.appendChild(row);
        });
        if (highlights.childElementCount) {
          el.appendChild(highlights);
        }
      }

      resultsDiv.appendChild(el);
    });

    const pager = document.createElement("div");
    pager.className = "pager";
    const prev = document.createElement("button");
    prev.textContent = "Prev";
    prev.disabled = page <= 0;
    const next = document.createElement("button");
    next.textContent = "Next";
    next.disabled = page + 1 >= payload.totalPages;
    prev.onclick = () => runSearch(page - 1);
    next.onclick = () => runSearch(page + 1);
    pager.appendChild(prev);
    pager.appendChild(next);
    resultsDiv.appendChild(pager);
  }
  async function runSearch(pageOverride, pushState = true) {
    const q = qInput.value.trim();
    const channels = getSelectedChannels();
    const sort = sortSel.value;
    const size = parseInt(sizeSel.value, 10) || 10;
    const queryMode = queryToggle && queryToggle.checked;
    let exact = !!exactToggle.checked;
    let fuzzy = !!fuzzyToggle.checked;
    let phrase = !!phraseToggle.checked;
    if (queryMode) {
      exact = false;
      fuzzy = false;
      phrase = false;
    } else {
      previousToggleState = {
        exact,
        fuzzy,
        phrase,
      };
    }
    const page = pageOverride != null ? pageOverride : currentPage;
    currentPage = page;

    if (pushState) {
      updateUrl(q, sort, channels, page, size, exact, fuzzy, phrase, queryMode);
    }

    const params = new URLSearchParams();
    params.set("q", q);
    params.set("sort", sort);
    params.set("size", String(size));
    params.set("page", String(page));
    params.set("exact", exact ? "1" : "0");
    params.set("fuzzy", fuzzy ? "1" : "0");
    params.set("phrase", phrase ? "1" : "0");
    params.set("query_string", queryMode ? "1" : "0");
    channels.forEach((id) => params.append("channel_id", id));

    const res = await fetch(`/api/search?${params.toString()}`);
    const payload = await res.json();
    renderResults(payload, page);
    updateFrequencyChart(q, channels, queryMode);
  }
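
  // Editor's note: runSearch issues GET /api/search with q, sort, size, page,
  // exact/fuzzy/phrase/query_string as "1"/"0" flags, and one channel_id per
  // selected channel; these are the same names updateUrl writes to the address
  // bar, which is what lets the popstate handler below restore state.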
  searchBtn.addEventListener("click", () => runSearch(0));
  qInput.addEventListener("keypress", (e) => {
    if (e.key === "Enter") runSearch(0);
  });
  sortSel.addEventListener("change", () => runSearch(0));
  sizeSel.addEventListener("change", () => runSearch(0));
  exactToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); });
  fuzzyToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); });
  phraseToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); });
  if (queryToggle) {
    queryToggle.addEventListener("change", () => { applyQueryMode(); runSearch(0); });
  }

  window.addEventListener("popstate", () => {
    qs = new URLSearchParams(window.location.search);
    setFromQuery();
    currentPage = parseInt(qs.get("page") || "0", 10) || 0;
    runSearch(currentPage, false);
  });

  setFromQuery();
  loadMetrics();
  loadChannels().then(() => runSearch(currentPage));
})();
function escapeHtml(str) {
  // Escape the characters that matter for innerHTML; the entity strings were
  // decoded by the diff viewer and are restored here.
  return (str || "").replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case "<":
        return "&lt;";
      case ">":
        return "&gt;";
      case '"':
        return "&quot;";
      case "'":
        return "&#39;";
      default:
        return ch;
    }
  });
}
68
static/frequency.html
Normal file
@ -0,0 +1,68 @@
<!doctype html>
<html>
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Term Frequency Explorer</title>
  <link rel="stylesheet" href="/static/style.css" />
  <style>
    #chart {
      margin-top: 24px;
    }
    svg {
      max-width: 100%;
    }
    .axis path,
    .axis line {
      stroke: #ccc;
    }
    .line {
      fill: none;
      stroke: #0b6efd;
      stroke-width: 2px;
    }
    .dot {
      fill: #0b6efd;
      stroke: white;
      stroke-width: 1px;
    }
    .controls label {
      display: flex;
      align-items: center;
      gap: 6px;
    }
  </style>
  <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
</head>
<body>
  <header>
    <h1>Term Frequency Explorer</h1>
    <p class="muted">
      Pick a term to see how often it appears over time. <a href="/">Back to search</a>
    </p>
  </header>

  <section class="controls">
    <input id="term" type="text" placeholder="Term (e.g. meaning)" size="28" />
    <select id="channel">
      <option value="all">All Channels</option>
    </select>
    <select id="interval">
      <option value="month">Per Month</option>
      <option value="week">Per Week</option>
      <option value="day">Per Day</option>
      <option value="quarter">Per Quarter</option>
      <option value="year">Per Year</option>
    </select>
    <input id="start" type="date" />
    <input id="end" type="date" />
    <button id="runBtn">Run</button>
  </section>

  <section id="summary" class="muted"></section>
  <section id="chart"></section>

  <script src="/static/frequency.js"></script>
</body>
</html>
222
static/frequency.js
Normal file
@ -0,0 +1,222 @@
(() => {
  let qs = new URLSearchParams(window.location.search);

  const termInput = document.getElementById("term");
  const channelSel = document.getElementById("channel");
  const intervalSel = document.getElementById("interval");
  const startInput = document.getElementById("start");
  const endInput = document.getElementById("end");
  const runBtn = document.getElementById("runBtn");
  const summaryDiv = document.getElementById("summary");
  const chartDiv = document.getElementById("chart");

  function parseParams() {
    return {
      term: qs.get("term") || "",
      channel: qs.get("channel_id") || "all",
      interval: qs.get("interval") || "month",
      start: qs.get("start") || "",
      end: qs.get("end") || "",
    };
  }

  function setFormFromParams() {
    const params = parseParams();
    termInput.value = params.term;
    intervalSel.value = params.interval;
    startInput.value = params.start;
    endInput.value = params.end;
    return params;
  }

  function updateUrl(params) {
    const url = new URL(window.location.href);
    url.searchParams.set("term", params.term);
    url.searchParams.set("channel_id", params.channel);
    url.searchParams.set("interval", params.interval);
    if (params.start) url.searchParams.set("start", params.start);
    else url.searchParams.delete("start");
    if (params.end) url.searchParams.set("end", params.end);
    else url.searchParams.delete("end");
    history.pushState({}, "", url.toString());
    qs = new URLSearchParams(url.search);
  }

  async function loadChannels(initialValue) {
    try {
      const res = await fetch("/api/channels");
      const data = await res.json();
      data.forEach((item) => {
        const opt = document.createElement("option");
        opt.value = item.Id;
        opt.textContent = `${item.Name} (${item.Count})`;
        channelSel.appendChild(opt);
      });
    } catch (err) {
      console.error("Failed to load channels", err);
    }
    channelSel.value = initialValue || "all";
  }
  function drawChart(data) {
    chartDiv.innerHTML = "";
    if (!data.length) {
      const msg = document.createElement("div");
      msg.className = "muted";
      msg.textContent = "No matching documents for this term.";
      chartDiv.appendChild(msg);
      return;
    }

    const parsed = data
      .map((d) => ({
        date: d3.isoParse(d.date) || new Date(d.date),
        value: d.count,
      }))
      .filter((d) => d.date instanceof Date && !Number.isNaN(d.date.valueOf()));

    if (!parsed.length) {
      const msg = document.createElement("div");
      msg.className = "muted";
      msg.textContent = "Unable to parse dates for this series.";
      chartDiv.appendChild(msg);
      return;
    }

    const margin = { top: 20, right: 30, bottom: 40, left: 56 };
    const fullWidth = chartDiv.clientWidth || 900;
    const fullHeight = 360;
    const width = fullWidth - margin.left - margin.right;
    const height = fullHeight - margin.top - margin.bottom;

    const svg = d3
      .select(chartDiv)
      .append("svg")
      .attr("width", fullWidth)
      .attr("height", fullHeight);

    const g = svg
      .append("g")
      .attr("transform", `translate(${margin.left},${margin.top})`);

    const x = d3
      .scaleTime()
      .domain(d3.extent(parsed, (d) => d.date))
      .range([0, width]);

    const y = d3
      .scaleLinear()
      .domain([0, d3.max(parsed, (d) => d.value) || 0])
      .nice()
      .range([height, 0]);

    const xAxis = d3.axisBottom(x).ticks(6).tickFormat(d3.timeFormat("%Y-%m-%d"));
    const yAxis = d3.axisLeft(y).ticks(6);

    g.append("g")
      .attr("class", "axis")
      .attr("transform", `translate(0,${height})`)
      .call(xAxis)
      .selectAll("text")
      .attr("text-anchor", "end")
      .attr("transform", "rotate(-35)")
      .attr("dx", "-0.8em")
      .attr("dy", "0.15em");

    g.append("g").attr("class", "axis").call(yAxis);

    const line = d3
      .line()
      .x((d) => x(d.date))
      .y((d) => y(d.value));

    g.append("path")
      .datum(parsed)
      .attr("class", "line")
      .attr("d", line);

    g.selectAll(".dot")
      .data(parsed)
      .enter()
      .append("circle")
      .attr("class", "dot")
      .attr("r", 3)
      .attr("cx", (d) => x(d.date))
      .attr("cy", (d) => y(d.value))
      .append("title")
      .text((d) => `${d3.timeFormat("%Y-%m-%d")(d.date)}: ${d.value}`);
  }
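
  // Editor's note: drawChart expects each bucket as
  //   { date: <ISO-8601 string>, count: <number> }
  // (inferred from the accessors above); buckets whose dates fail to parse
  // are dropped rather than plotted at the epoch.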
  async function runFrequency(pushState = true) {
    const term = termInput.value.trim();
    if (!term) {
      summaryDiv.textContent = "Enter a term to begin.";
      chartDiv.innerHTML = "";
      return;
    }

    const params = {
      term,
      channel: channelSel.value,
      interval: intervalSel.value,
      start: startInput.value,
      end: endInput.value,
    };

    if (pushState) updateUrl(params);

    const search = new URLSearchParams();
    search.set("term", term);
    if (params.channel && params.channel !== "all") {
      search.set("channel_id", params.channel);
    }
    search.set("interval", params.interval);
    if (params.start) search.set("start", params.start);
    if (params.end) search.set("end", params.end);

    summaryDiv.textContent = "Loading…";
    chartDiv.innerHTML = "";

    try {
      const res = await fetch(`/api/frequency?${search.toString()}`);
      if (!res.ok) {
        throw new Error(`Request failed: ${res.status}`);
      }
      const payload = await res.json();
      const total = payload.totalResults || 0;
      summaryDiv.textContent = `Matches: ${total.toLocaleString()} • Buckets: ${
        (payload.buckets || []).length
      } • Interval: ${payload.interval}`;
      drawChart(payload.buckets || []);
    } catch (err) {
      console.error(err);
      summaryDiv.textContent = "Failed to load data.";
    }
  }

  runBtn.addEventListener("click", () => runFrequency());
  termInput.addEventListener("keypress", (e) => {
    if (e.key === "Enter") runFrequency();
  });
  intervalSel.addEventListener("change", () => runFrequency());
  channelSel.addEventListener("change", () => runFrequency());
  startInput.addEventListener("change", () => runFrequency());
  endInput.addEventListener("change", () => runFrequency());

  window.addEventListener("popstate", () => {
    qs = new URLSearchParams(window.location.search);
    const params = setFormFromParams();
    channelSel.value = params.channel;
    runFrequency(false);
  });

  const initialParams = setFormFromParams();
  loadChannels(initialParams.channel).then(() => {
    if (initialParams.term) {
      runFrequency(false);
    } else {
      summaryDiv.textContent = "Enter a term to begin.";
    }
  });
})();
63
static/index.html
Normal file
@ -0,0 +1,63 @@
<!doctype html>
<html>
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>This Little Corner (Python)</title>
  <link rel="stylesheet" href="/static/style.css" />
  <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
</head>
<body>
  <header>
    <h1>This Little Corner — Elastic Search</h1>
    <p class="muted">
      Enter a phrase to query title, description, and transcript text.
    </p>
  </header>

  <section class="controls">
    <input id="q" type="text" placeholder="Search..." size="40" />
    <details id="channelDropdown" class="channel-dropdown">
      <summary id="channelSummary">All Channels</summary>
      <div id="channelOptions" class="channel-options muted">
        <div>Loading channels…</div>
      </div>
    </details>
    <select id="sort">
      <option value="relevant">Most relevant</option>
      <option value="newer">Newest first</option>
      <option value="older">Oldest first</option>
    </select>
    <select id="size">
      <option value="10">10</option>
      <option value="25">25</option>
      <option value="50">50</option>
    </select>
    <button id="searchBtn">Search</button>
  </section>
  <section class="controls muted">
    <label><input type="checkbox" id="exactToggle" checked /> Exact</label>
    <label><input type="checkbox" id="fuzzyToggle" checked /> Fuzzy</label>
    <label><input type="checkbox" id="phraseToggle" checked /> Phrase</label>
    <label><input type="checkbox" id="queryStringToggle" /> Query string mode</label>
  </section>

  <section class="summary-row">
    <div class="summary-left">
      <section id="meta" class="muted"></section>
      <section id="metrics">
        <div id="metricsStatus" class="muted"></div>
        <div id="metricsContent"></div>
      </section>
    </div>
    <div class="summary-right">
      <section id="frequencySummary" class="muted"></section>
      <div id="frequencyChart"></div>
    </div>
  </section>

  <section id="results"></section>

  <script src="/static/app.js"></script>
</body>
</html>
225
static/style.css
Normal file
@ -0,0 +1,225 @@
body {
  font-family: Arial, sans-serif;
  margin: 24px;
  color: #222;
}

header {
  margin-bottom: 16px;
}

.controls {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  align-items: center;
  margin-bottom: 12px;
}

.channel-dropdown {
  position: relative;
  min-width: 220px;
  flex: 0 1 260px;
}

.channel-dropdown summary {
  list-style: none;
  cursor: pointer;
  border: 1px solid #ccc;
  border-radius: 4px;
  padding: 6px 8px;
  background: #fff;
  color: #222;
  display: inline-flex;
  align-items: center;
  min-height: 32px;
  max-width: 100%;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.channel-dropdown summary::-webkit-details-marker {
  display: none;
}

.channel-dropdown[open] summary {
  border-bottom-left-radius: 0;
  border-bottom-right-radius: 0;
}

.channel-options {
  margin-top: 4px;
  padding: 8px;
  border: 1px solid #ccc;
  border-radius: 0 0 4px 4px;
  background: #fff;
  max-height: 240px;
  overflow-y: auto;
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.12);
  min-width: 220px;
  width: max(220px, 100%);
}

.channel-option {
  display: flex;
  align-items: center;
  gap: 6px;
  margin-bottom: 6px;
  font-size: 12px;
}

.channel-option:last-child {
  margin-bottom: 0;
}

input,
select,
button {
  padding: 6px 8px;
}
.muted {
  color: #666;
  font-size: 12px;
}

#results .item {
  border-bottom: 1px solid #ddd;
  padding: 12px 0;
}

.summary-row {
  display: flex;
  gap: 16px;
  flex-wrap: wrap;
  align-items: flex-start;
  margin-top: 12px;
}

.summary-left {
  flex: 0 1 280px;
  max-width: 360px;
}

.summary-right {
  flex: 1 1 0%;
  min-width: 0;
  background: #f5f5f5;
  padding: 12px;
  border-radius: 8px;
  box-shadow: 0 1px 2px rgba(0, 0, 0, 0.08);
}

#metrics {
  margin-top: 12px;
  display: flex;
  flex-direction: column;
  gap: 8px;
}

#metricsStatus {
  min-height: 16px;
}

#metricsContent {
  display: flex;
  flex-direction: column;
  gap: 6px;
}

#frequencyChart {
  margin-top: 8px;
}

#frequencyChart svg {
  max-width: 100%;
}

#frequencyChart .axis path,
#frequencyChart .axis line {
  stroke: #ccc;
}

#frequencyChart .freq-layer rect {
  stroke: #fff;
  stroke-width: 0.5px;
}

.freq-legend {
  margin-top: 8px;
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  font-size: 12px;
  color: #444;
}

.freq-legend-item {
  display: flex;
  align-items: center;
  gap: 6px;
}

.freq-legend-swatch {
  width: 12px;
  height: 12px;
  border-radius: 2px;
  display: inline-block;
}

.transcript {
  background: #fafafa;
  padding: 8px;
  margin-top: 6px;
  max-height: 200px;
  overflow-y: auto;
}

.highlight-list {
  display: flex;
  flex-direction: column;
  gap: 8px;
  max-height: none;
  overflow: visible;
}

.highlight-row {
  padding: 4px 0;
  border-bottom: 1px solid #ececec;
}

.highlight-row:last-child {
  border-bottom: none;
}

.transcript-wrapper {
  margin-top: 8px;
}

.pager {
  margin-top: 12px;
  display: flex;
  gap: 8px;
}

mark {
  background: #ffe58a;
  padding: 0 2px;
}

.badge-row {
  margin-top: 6px;
  display: flex;
  gap: 4px;
  flex-wrap: wrap;
}

.badge {
  background: #0b6efd;
  color: #fff;
  border-radius: 999px;
  padding: 2px 8px;
  font-size: 12px;
}
226
transcript_collector.py
Normal file
@ -0,0 +1,226 @@
"""
|
||||||
|
Lightweight helpers for gathering video metadata and transcripts from YouTube.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m python_app.transcript_collector --channel UC123 --output data/raw
|
||||||
|
|
||||||
|
Relies on:
|
||||||
|
- YouTube Data API v3 (requires YOUTUBE_API_KEY).
|
||||||
|
- youtube-transcript-api for transcript retrieval.
|
||||||
|
Both libraries are optional at import time so the module can still be referenced
|
||||||
|
when only working with existing JSON dumps.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, Iterator, List, Optional
|
||||||
|
|
||||||
|
from .config import CONFIG
|
||||||
|
|
||||||
|
try:
|
||||||
|
from googleapiclient.discovery import build as build_youtube # type: ignore
|
||||||
|
except ImportError: # pragma: no cover - library optional
|
||||||
|
build_youtube = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
||||||
|
except ImportError: # pragma: no cover - library optional
|
||||||
|
YouTubeTranscriptApi = None
|
||||||
|
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
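
# Editor's note: a minimal programmatic sketch (assuming the optional
# dependencies are installed and YOUTUBE_API_KEY is set; collect_channel is
# defined below):
#
#     from pathlib import Path
#     from python_app.transcript_collector import collect_channel
#
#     records = collect_channel("UCxxxx", Path("data/raw"), max_pages=1)
#     print(f"collected {len(records)} videos")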
@dataclass
class TranscriptSegment:
    start: float
    duration: float
    text: str


@dataclass
class VideoRecord:
    video_id: str
    channel_id: str
    channel_title: str
    title: str
    description: str
    published_at: str
    url: str
    transcript: List[TranscriptSegment]


def _ensure_youtube_client(api_key: Optional[str]):
    if build_youtube is None:
        raise RuntimeError(
            "google-api-python-client not installed. "
            "Install google-api-python-client to collect metadata."
        )
    if not api_key:
        raise RuntimeError(
            "Set YOUTUBE_API_KEY to collect metadata from YouTube."
        )
    return build_youtube("youtube", "v3", developerKey=api_key)


def _ensure_transcript_api():
    if YouTubeTranscriptApi is None:
        raise RuntimeError(
            "youtube-transcript-api not installed. "
            "Install youtube-transcript-api to fetch transcripts."
        )
    return YouTubeTranscriptApi()
def iter_channel_videos(
    channel_id: str,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 10,
) -> Iterator[Dict]:
    """
    Yield raw playlist items for the uploads playlist of the given channel.

    Args:
        channel_id: Target YouTube channel ID.
        api_key: Explicit API key (defaults to config value).
        max_pages: Hard cap on paginated playlist fetches to keep things simple.
    """
    client = _ensure_youtube_client(api_key or CONFIG.youtube.api_key)
    channels = (
        client.channels().list(id=channel_id, part="contentDetails").execute()
    )
    items = channels.get("items", [])
    if not items:
        raise ValueError(f"Channel {channel_id} not found.")
    uploads_playlist = (
        items[0]
        .get("contentDetails", {})
        .get("relatedPlaylists", {})
        .get("uploads")
    )
    if not uploads_playlist:
        raise ValueError(f"Channel {channel_id} missing uploads playlist.")

    request = client.playlistItems().list(
        playlistId=uploads_playlist, part="snippet", maxResults=50
    )
    page = 0
    while request and page < max_pages:
        response = request.execute()
        for item in response.get("items", []):
            yield item
        page += 1
        request = client.playlistItems().list_next(request, response)
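
# Editor's note: each yielded playlist item is a raw API dict; the fields
# consumed downstream (see collect_channel) live under "snippet", roughly:
#
#     {"snippet": {"resourceId": {"videoId": "..."}, "channelId": "...",
#                  "channelTitle": "...", "title": "...", "description": "...",
#                  "publishedAt": "2021-01-01T00:00:00Z"}}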
def fetch_transcript(
    video_id: str, *, languages: Optional[Iterable[str]] = None
) -> List[TranscriptSegment]:
    """Return transcript segments for a video, if available."""
    api = _ensure_transcript_api()
    try:
        # get_transcript expects an iterable of language codes; fall back to
        # the library's English default instead of passing None.
        transcripts = api.get_transcript(
            video_id, languages=list(languages) if languages else ("en",)
        )
    except Exception as exc:  # broad catch keeps draft simple
        LOGGER.warning("Transcript unavailable for %s: %s", video_id, exc)
        return []
    return [
        TranscriptSegment(
            start=entry.get("start", 0.0),
            duration=entry.get("duration", 0.0),
            text=entry.get("text", ""),
        )
        for entry in transcripts
    ]
def collect_channel(
    channel_id: str,
    output_dir: Path,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 2,
    languages: Optional[List[str]] = None,
) -> List[VideoRecord]:
    """
    Collect metadata + transcripts for a channel and store as JSON files.

    Returns the in-memory list to make it easy to chain into ingestion.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    videos: List[VideoRecord] = []
    for item in iter_channel_videos(
        channel_id, api_key=api_key, max_pages=max_pages
    ):
        snippet = item.get("snippet", {})
        video_id = snippet.get("resourceId", {}).get("videoId")
        if not video_id:
            continue
        segments = fetch_transcript(video_id, languages=languages)
        record = VideoRecord(
            video_id=video_id,
            channel_id=snippet.get("channelId", channel_id),
            channel_title=snippet.get("channelTitle", ""),
            title=snippet.get("title", ""),
            description=snippet.get("description", ""),
            published_at=snippet.get("publishedAt", ""),
            url=f"https://www.youtube.com/watch?v={video_id}",
            transcript=segments,
        )
        videos.append(record)
        dest = output_dir / f"{video_id}.json"
        with dest.open("w", encoding="utf-8") as handle:
            json.dump(asdict(record), handle, ensure_ascii=False, indent=2)
        LOGGER.info("Saved %s", dest)
    return videos
def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Collect channel transcripts into JSON files."
    )
    parser.add_argument(
        "--channel",
        required=True,
        help="YouTube channel ID (e.g. UCXYZ).",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("data/raw"),
        help="Directory to write per-video JSON files.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=2,
        help="Number of paginated channel pages to pull (50 videos per page).",
    )
    parser.add_argument(
        "--language",
        dest="languages",
        action="append",
        help="Preferred transcript languages (can be repeated).",
    )
    return parser
def main() -> None:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = _build_parser().parse_args()
    collect_channel(
        args.channel,
        args.output,
        max_pages=args.max_pages,
        languages=args.languages,
    )


if __name__ == "__main__":  # pragma: no cover
    main()