Initial commit
Commit fcdc6ecb9b

.gitignore (vendored, new file, 60 lines)
@@ -0,0 +1,60 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.venv/
venv/
ENV/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Environment variables
.env
.env.local

# Elasticsearch data
data/

# OS
.DS_Store
Thumbs.db

# Logs
*.log

# Testing
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/

README.md (new file, 87 lines)
@@ -0,0 +1,87 @@
# Python Search Toolkit (Rough Draft)

This minimal Python implementation covers three core needs:

1. **Collect transcripts** from YouTube channels.
2. **Ingest transcripts/metadata** into Elasticsearch.
3. **Expose a simple Flask search UI** that queries Elasticsearch directly.

The code lives alongside the existing C# stack so you can experiment without
touching production infrastructure.

## Setup

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r python_app/requirements.txt
```

Configure your environment as needed:

```bash
export ELASTIC_URL=http://localhost:9200
export ELASTIC_INDEX=this_little_corner_py
export ELASTIC_USERNAME=elastic              # optional
export ELASTIC_PASSWORD=secret               # optional
export ELASTIC_API_KEY=XXXX                  # optional alternative auth
export ELASTIC_CA_CERT=/path/to/ca.pem       # optional, for self-signed TLS
export ELASTIC_VERIFY_CERTS=1                # set to 0 to skip verification (dev only)
export ELASTIC_DEBUG=0                       # set to 1 for verbose request/response logging
export LOCAL_DATA_DIR=./data/video_metadata  # defaults to this
export YOUTUBE_API_KEY=AIza...               # required for live collection
```

## 1. Collect Transcripts

```bash
python -m python_app.transcript_collector \
    --channel UCxxxx \
    --output data/raw \
    --max-pages 2
```

Each video becomes a JSON file containing metadata plus transcript segments
(`TranscriptSegment`). Downloads require both `google-api-python-client` and
`youtube-transcript-api`, as well as a valid `YOUTUBE_API_KEY`.

> Already have cached JSON? You can skip this step and move straight to ingesting.
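
For orientation, each collected file looks roughly like the sketch below. The field names mirror what `ingest.py` reads when building bulk actions; the values are invented:

```python
# Hypothetical shape of one collected per-video JSON file (values invented);
# ingest.py reads exactly these fields when building bulk actions.
sample_document = {
    "video_id": "abc123xyz",
    "channel_id": "UCxxxx",
    "channel_name": "Example Channel",
    "title": "Example Video",
    "description": "An example description.",
    "date": "2023-01-01T00:00:00Z",
    "url": "https://www.youtube.com/watch?v=abc123xyz",
    "duration": 1234.0,
    "transcript_parts": [
        {"start": 0.0, "duration": 4.2, "text": "hello and welcome"},
    ],
}
```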

## 2. Ingest Into Elasticsearch

```bash
python -m python_app.ingest \
    --source data/video_metadata \
    --index this_little_corner_py
```

The script walks the source directory, builds `bulk` requests, and creates the
index with a lightweight mapping when needed. Authentication is handled via
`ELASTIC_USERNAME` / `ELASTIC_PASSWORD` if set.
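
To sanity-check a run, a quick count against the target index works (a minimal sketch using the `elasticsearch` client already listed in `requirements.txt`):

```python
# Count documents in the target index after an ingest run.
from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")
print(client.count(index="this_little_corner_py")["count"])
```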

## 3. Serve the Search Frontend

```bash
python -m python_app.search_app
```

Visit <http://localhost:8080/> and you’ll see a barebones UI that:

- Lists channels via a terms aggregation.
- Queries titles/descriptions/transcripts with toggleable exact, fuzzy, and phrase clauses plus optional date sorting.
- Surfaces transcript highlights.
- Lets you pull the full transcript for any result on demand.
- Shows a stacked-by-channel timeline for each search query (with `/frequency` offering a standalone explorer) powered by D3.js.
- Supports a query-string mode toggle so you can write advanced Lucene queries (e.g. `meaning OR purpose`, `meaning~2` for fuzzy matches, `title:(meaning crisis)`), while the default toggles stay AND-backed.
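
The same endpoints back the UI, so you can script against them directly. A stdlib-only sketch (parameter names as defined in `search_app.py`):

```python
# Query the search API directly and print basic result info.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

params = urlencode({"q": "meaning crisis", "size": 5, "sort": "relevant"})
with urlopen(f"http://localhost:8080/api/search?{params}") as resp:
    results = json.load(resp)
print(results["totalResults"], "results")
for item in results["items"]:
    print(item["date"], item["title"])
```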

## Integration Notes

- All modules share configuration through `python_app.config.CONFIG`, so you can
  fine-tune paths or credentials centrally.
- The ingest flow reuses the existing JSON schema from `data/video_metadata`, so no
  re-download is necessary if you already have the dumps.
- Everything is intentionally simple (no Celery, task queues, or custom auth) to
  keep the draft approachable and easy to extend.

Feel free to expand on this scaffold—add proper logging, schedule transcript
updates, or flesh out the UI—once you’re happy with the baseline behaviour.

__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
"""
Minimal Python toolkit for collecting YouTube transcripts, ingesting them into
Elasticsearch, and serving a lightweight search API/front-end.

Modules:
    config: shared configuration helpers (Elastic endpoint, data paths, etc.).
    transcript_collector: fetches channel metadata and transcripts.
    ingest: pushes transcript JSON into Elasticsearch.
    search_app: Flask app exposing simple search and transcript endpoints.
"""

config.py (new file, 81 lines)
@@ -0,0 +1,81 @@
"""
Centralised configuration helpers for the Python search toolkit.

Environment Variables:
    ELASTIC_URL: Base URL to the Elasticsearch node (default: http://localhost:9200).
    ELASTIC_USERNAME / ELASTIC_PASSWORD: Optional basic auth credentials.
    ELASTIC_INDEX: Target index name (default: this_little_corner_py).
    LOCAL_DATA_DIR: Root folder containing JSON metadata (default: ../data/video_metadata).
    YOUTUBE_API_KEY: Optional API key for pulling metadata directly from YouTube.
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass(frozen=True)
class ElasticSettings:
    url: str
    username: Optional[str]
    password: Optional[str]
    index: str
    ca_cert: Optional[Path]
    verify_certs: bool
    api_key: Optional[str]
    debug: bool


@dataclass(frozen=True)
class DataSettings:
    root: Path


@dataclass(frozen=True)
class YoutubeSettings:
    api_key: Optional[str]


@dataclass(frozen=True)
class AppConfig:
    elastic: ElasticSettings
    data: DataSettings
    youtube: YoutubeSettings


def _env(name: str, default: Optional[str] = None) -> Optional[str]:
    """Return an environment variable value with optional default."""
    value = os.environ.get(name)
    if value is None:
        return default
    stripped = value.strip()
    return stripped or default


def load_config() -> AppConfig:
    """Collect configuration from environment variables."""
    elastic = ElasticSettings(
        url=_env("ELASTIC_URL", "http://localhost:9200"),
        username=_env("ELASTIC_USERNAME"),
        password=_env("ELASTIC_PASSWORD"),
        index=_env("ELASTIC_INDEX", "this_little_corner_py"),
        ca_cert=Path(_env("ELASTIC_CA_CERT")).expanduser() if _env("ELASTIC_CA_CERT") else None,
        verify_certs=_env("ELASTIC_VERIFY_CERTS", "1") not in {"0", "false", "False"},
        api_key=_env("ELASTIC_API_KEY"),
        debug=_env("ELASTIC_DEBUG", "0") in {"1", "true", "True"},
    )
    data_root = Path(
        _env(
            "LOCAL_DATA_DIR",
            Path(__file__).resolve().parents[1] / "data" / "video_metadata",
        )
    )
    data = DataSettings(root=data_root)
    youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY"))
    return AppConfig(elastic=elastic, data=data, youtube=youtube)


CONFIG = load_config()
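
A quick smoke test of the module above (a sketch; run from the repository root so the package is importable as `python_app`):

```python
# Smoke-test the shared configuration; values come from the environment.
from python_app.config import CONFIG

print(CONFIG.elastic.url, CONFIG.elastic.index)
print(CONFIG.data.root)
```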

ingest.py (new file, 193 lines)
@@ -0,0 +1,193 @@
"""
Utilities for indexing transcript JSON documents into Elasticsearch.

Usage:
    python -m python_app.ingest --source data/video_metadata --index corner
"""

from __future__ import annotations

import argparse
import json
import logging
from pathlib import Path
from typing import Dict, Iterable, Iterator, Optional

from .config import CONFIG, AppConfig

try:
    from elasticsearch import Elasticsearch, helpers  # type: ignore
except ImportError:  # pragma: no cover - dependency optional
    Elasticsearch = None
    helpers = None


LOGGER = logging.getLogger(__name__)


def _ensure_client(config: AppConfig) -> "Elasticsearch":
    if Elasticsearch is None:
        raise RuntimeError(
            "elasticsearch package not installed. "
            "Install elasticsearch>=7 to index documents."
        )
    kwargs = {}
    if config.elastic.api_key:
        kwargs["api_key"] = config.elastic.api_key
    elif config.elastic.username and config.elastic.password:
        kwargs["basic_auth"] = (
            config.elastic.username,
            config.elastic.password,
        )
    if config.elastic.ca_cert:
        kwargs["ca_certs"] = str(config.elastic.ca_cert)
    kwargs["verify_certs"] = config.elastic.verify_certs
    return Elasticsearch(config.elastic.url, **kwargs)


def iter_json_documents(data_root: Path) -> Iterator[Dict]:
    """Yield JSON objects from the provided directory tree."""
    if not data_root.exists():
        raise FileNotFoundError(f"{data_root} does not exist")
    for path in sorted(data_root.rglob("*.json")):
        try:
            with path.open("r", encoding="utf-8") as handle:
                doc = json.load(handle)
            doc.setdefault("video_id", path.stem)
            yield doc
        except Exception as exc:
            LOGGER.warning("Skipping %s: %s", path, exc)


def build_bulk_actions(
    docs: Iterable[Dict], *, index: Optional[str] = None
) -> Iterator[Dict]:
    """Translate raw JSON dictionaries into Elasticsearch bulk actions."""
    for doc in docs:
        video_id = doc.get("video_id")
        if not video_id:
            continue
        parts = doc.get("transcript_parts") or doc.get("transcript") or []
        transcript_full = doc.get("transcript_full")
        if not transcript_full and isinstance(parts, list):
            transcript_full = " ".join(
                segment.get("text", "") if isinstance(segment, dict) else str(segment)
                for segment in parts
            ).strip()
        yield {
            "_id": video_id,
            "_index": index or CONFIG.elastic.index,
            "_op_type": "index",
            "_source": {
                "video_id": video_id,
                "channel_id": doc.get("channel_id"),
                "channel_name": doc.get("channel_name"),
                "title": doc.get("title"),
                "description": doc.get("description"),
                "date": doc.get("date") or doc.get("published_at"),
                "url": doc.get("url"),
                "duration": doc.get("duration"),
                "transcript_full": transcript_full,
                "transcript_secondary_full": doc.get("transcript_secondary_full"),
                "transcript_parts": parts,
            },
        }


def ensure_index(client: "Elasticsearch", index: str) -> None:
    """Create the target index with a minimal mapping if it is missing."""
    if client.indices.exists(index=index):
        return
    LOGGER.info("Creating index %s", index)
    client.indices.create(
        index=index,
        mappings={
            "properties": {
                "video_id": {"type": "keyword"},
                "channel_id": {"type": "keyword"},
                "channel_name": {"type": "keyword"},
                "title": {"type": "text"},
                "description": {"type": "text"},
                "date": {"type": "date", "format": "strict_date_optional_time"},
                "url": {"type": "keyword"},
                "duration": {"type": "float"},
                "transcript_full": {"type": "text"},
                "transcript_secondary_full": {"type": "text"},
                "transcript_parts": {
                    "type": "nested",
                    "properties": {
                        "start": {"type": "float"},
                        "duration": {"type": "float"},
                        "text": {"type": "text"},
                    },
                },
            }
        },
    )


def ingest_directory(
    data_root: Path,
    *,
    config: AppConfig = CONFIG,
    index: Optional[str] = None,
    batch_size: int = 500,
    request_timeout: int = 120,
) -> None:
    """Bulk index every JSON file in the directory tree."""
    client = _ensure_client(config)
    target_index = index or config.elastic.index
    ensure_index(client, target_index)
    docs = iter_json_documents(data_root)
    actions = build_bulk_actions(docs, index=target_index)
    bulk_client = client.options(request_timeout=request_timeout)
    helpers.bulk(
        bulk_client,
        actions,
        chunk_size=batch_size,
    )
    LOGGER.info("Ingestion complete for %s", target_index)


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Ingest transcript JSON files into Elasticsearch."
    )
    parser.add_argument(
        "--source",
        type=Path,
        default=CONFIG.data.root,
        help="Directory containing per-video JSON files.",
    )
    parser.add_argument(
        "--index",
        help="Override the Elasticsearch index name.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=500,
        help="Bulk ingest batch size.",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Request timeout (seconds) for bulk operations.",
    )
    return parser


def main() -> None:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = _build_parser().parse_args()
    ingest_directory(
        args.source,
        index=args.index,
        batch_size=args.batch_size,
        request_timeout=args.timeout,
    )


if __name__ == "__main__":  # pragma: no cover
    main()
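
Besides the CLI entry point, `ingest_directory` can be driven programmatically (a sketch using the defaults from `CONFIG`; the path is an example):

```python
# Programmatically ingest a directory tree of per-video JSON files.
from pathlib import Path

from python_app.ingest import ingest_directory

ingest_directory(Path("data/video_metadata"), batch_size=250)
```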

requirements.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
Flask>=2.3
elasticsearch>=7.0.0,<9.0.0
youtube-transcript-api>=0.6
google-api-python-client>=2.0.0

search_app.py (new file, 910 lines)
@@ -0,0 +1,910 @@
"""
Flask application exposing a minimal search API backed by Elasticsearch.

Routes:
    GET /                -> Static HTML search page.
    GET /api/channels    -> List available channels (via terms aggregation).
    GET /api/search      -> Search index with pagination and simple highlighting.
    GET /api/transcript  -> Return full transcript for a given video_id.
"""

from __future__ import annotations

import copy
import json
import logging
import re
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set

from collections import Counter
from datetime import datetime

from flask import Flask, jsonify, request, send_from_directory

from .config import CONFIG, AppConfig

try:
    from elasticsearch import Elasticsearch  # type: ignore
    from elasticsearch import BadRequestError  # type: ignore
except ImportError:  # pragma: no cover - dependency optional
    Elasticsearch = None
    BadRequestError = Exception  # type: ignore

LOGGER = logging.getLogger(__name__)


def _ensure_client(config: AppConfig) -> "Elasticsearch":
    if Elasticsearch is None:
        raise RuntimeError(
            "elasticsearch package not installed. "
            "Install elasticsearch>=7 to run the Flask search app."
        )
    kwargs = {}
    if config.elastic.api_key:
        kwargs["api_key"] = config.elastic.api_key
    elif config.elastic.username and config.elastic.password:
        kwargs["basic_auth"] = (
            config.elastic.username,
            config.elastic.password,
        )
    if config.elastic.ca_cert:
        kwargs["ca_certs"] = str(config.elastic.ca_cert)
    kwargs["verify_certs"] = config.elastic.verify_certs
    return Elasticsearch(config.elastic.url, **kwargs)


def metrics_payload(data_root: Path) -> Dict[str, Any]:
    total_items = 0
    channel_counter: Counter = Counter()
    channel_name_map: Dict[str, str] = {}
    year_counter: Counter = Counter()
    month_counter: Counter = Counter()

    if not data_root.exists():
        LOGGER.warning("Data directory %s not found; metrics will be empty.", data_root)
        return {
            "totalItems": 0,
            "totalChannels": 0,
            "itemsPerChannel": [],
            "yearHistogram": [],
            "recentMonths": [],
        }

    for path in data_root.rglob("*.json"):
        try:
            with path.open("r", encoding="utf-8") as handle:
                doc = json.load(handle)
        except Exception:
            continue

        total_items += 1

        channel_id = doc.get("channel_id")
        channel_name = doc.get("channel_name") or channel_id
        if channel_id:
            channel_counter[channel_id] += 1
            if channel_name and channel_id not in channel_name_map:
                channel_name_map[channel_id] = channel_name

        date_value = doc.get("date") or doc.get("published_at")
        dt: Optional[datetime] = None
        if isinstance(date_value, str):
            # Try full-string parses first, then fall back to a date-only prefix
            # (format length does not match string length, so prefix slicing by
            # len(fmt) would never parse).
            for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
                try:
                    dt = datetime.strptime(date_value, fmt)
                    break
                except ValueError:
                    continue
            if dt is None:
                try:
                    dt = datetime.strptime(date_value[:10], "%Y-%m-%d")
                except ValueError:
                    dt = None
        elif isinstance(date_value, (int, float)):
            try:
                dt = datetime.fromtimestamp(date_value)
            except Exception:
                dt = None

        if dt:
            year_counter[str(dt.year)] += 1
            month_counter[dt.strftime("%Y-%m")] += 1

    items_per_channel = [
        {
            "label": channel_name_map.get(cid, cid),
            "count": count,
        }
        for cid, count in channel_counter.most_common()
    ]

    year_histogram = [
        {"bucket": year, "count": year_counter[year]}
        for year in sorted(year_counter.keys())
    ]

    recent_months = sorted(month_counter.keys())
    recent_months = recent_months[-12:]
    recent_months_payload = [
        {"bucket": month, "count": month_counter[month]} for month in recent_months
    ]

    return {
        "totalItems": total_items,
        "totalChannels": len(channel_counter),
        "itemsPerChannel": items_per_channel,
        "yearHistogram": year_histogram,
        "recentMonths": recent_months_payload,
    }


def elastic_metrics_payload(
    client: "Elasticsearch",
    index: str,
    *,
    channel_field_candidates: Optional[List[str]] = None,
    debug: bool = False,
) -> Dict[str, Any]:
    if channel_field_candidates is None:
        channel_field_candidates = ["channel_id.keyword", "channel_id"]

    base_body: Dict[str, Any] = {
        "size": 0,
        "track_total_hits": True,
        "aggs": {
            "channels": {
                "terms": {
                    "field": "channel_id.keyword",
                    "size": 500,
                    "order": {"_count": "desc"},
                },
                "aggs": {
                    "name": {
                        "top_hits": {
                            "size": 1,
                            "_source": {"includes": ["channel_name"]},
                        }
                    }
                },
            },
            "year_histogram": {
                "date_histogram": {
                    "field": "date",
                    "calendar_interval": "year",
                    "format": "yyyy",
                }
            },
            "month_histogram": {
                "date_histogram": {
                    "field": "date",
                    "calendar_interval": "month",
                    "format": "yyyy-MM",
                    "order": {"_key": "asc"},
                }
            },
        },
    }

    last_error: Optional[Exception] = None
    response: Optional[Dict[str, Any]] = None
    for candidate_field in channel_field_candidates:
        body = json.loads(json.dumps(base_body))
        body["aggs"]["channels"]["terms"]["field"] = candidate_field
        try:
            if debug:
                LOGGER.info(
                    "Elasticsearch metrics request: %s",
                    json.dumps({"index": index, "body": body}, indent=2),
                )
            response = client.search(index=index, body=body)
            break
        except BadRequestError as exc:
            last_error = exc
            if debug:
                LOGGER.warning(
                    "Metrics aggregation failed for field %s: %s",
                    candidate_field,
                    exc,
                )
    if response is None:
        raise last_error or RuntimeError("Unable to compute metrics from Elasticsearch.")

    hits = response.get("hits", {})
    total_items = hits.get("total", {}).get("value", 0)

    if debug:
        LOGGER.info(
            "Elasticsearch metrics response: %s",
            json.dumps(response, indent=2, default=str),
        )

    aggregations = response.get("aggregations", {})
    channel_buckets = aggregations.get("channels", {}).get("buckets", [])
    items_per_channel = []
    for bucket in channel_buckets:
        key = bucket.get("key")
        channel_name = key
        top_hits = (
            bucket.get("name", {})
            .get("hits", {})
            .get("hits", [])
        )
        if top_hits:
            channel_name = (
                top_hits[0]
                .get("_source", {})
                .get("channel_name", channel_name)
            )
        items_per_channel.append(
            {"label": channel_name or key, "count": bucket.get("doc_count", 0)}
        )

    year_buckets = aggregations.get("year_histogram", {}).get("buckets", [])
    year_histogram = [
        {
            "bucket": bucket.get("key_as_string") or str(bucket.get("key")),
            "count": bucket.get("doc_count", 0),
        }
        for bucket in year_buckets
    ]

    month_buckets = aggregations.get("month_histogram", {}).get("buckets", [])
    recent_months_entries = [
        {
            "bucket": bucket.get("key_as_string") or str(bucket.get("key")),
            "count": bucket.get("doc_count", 0),
            "_key": bucket.get("key"),
        }
        for bucket in month_buckets
    ]
    recent_months_entries.sort(key=lambda item: item.get("_key", 0))
    recent_months_payload = [
        {"bucket": entry["bucket"], "count": entry["count"]}
        for entry in recent_months_entries[-12:]
    ]

    return {
        "totalItems": total_items,
        "totalChannels": len(items_per_channel),
        "itemsPerChannel": items_per_channel,
        "yearHistogram": year_histogram,
        "recentMonths": recent_months_payload,
    }


def parse_channel_params(values: Iterable[Optional[str]]) -> List[str]:
    seen: Set[str] = set()
    channels: List[str] = []
    for value in values:
        if not value:
            continue
        for part in str(value).split(","):
            cleaned = part.strip()
            if not cleaned or cleaned.lower() == "all":
                continue
            if cleaned not in seen:
                seen.add(cleaned)
                channels.append(cleaned)
    return channels


def build_channel_filter(channels: Optional[Sequence[str]]) -> Optional[Dict]:
    if not channels:
        return None
    per_channel_clauses: List[Dict[str, Any]] = []
    for value in channels:
        if not value:
            continue
        per_channel_clauses.append(
            {
                "bool": {
                    "should": [
                        {"term": {"channel_id.keyword": value}},
                        {"term": {"channel_id": value}},
                    ],
                    "minimum_should_match": 1,
                }
            }
        )
    if not per_channel_clauses:
        return None
    if len(per_channel_clauses) == 1:
        return per_channel_clauses[0]
    return {
        "bool": {
            "should": per_channel_clauses,
            "minimum_should_match": 1,
        }
    }


def build_query_payload(
    query: str,
    *,
    channels: Optional[Sequence[str]] = None,
    sort: str = "relevant",
    use_exact: bool = True,
    use_fuzzy: bool = True,
    use_phrase: bool = True,
    use_query_string: bool = False,
) -> Dict:
    filters: List[Dict] = []
    should: List[Dict] = []

    channel_filter = build_channel_filter(channels)
    if channel_filter:
        filters.append(channel_filter)

    if use_query_string:
        base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
        qs_query = (query or "").strip() or "*"
        query_body: Dict[str, Any] = {
            "query_string": {
                "query": qs_query,
                "default_operator": "AND",
                "fields": base_fields,
            }
        }
        if filters:
            query_body = {"bool": {"must": query_body, "filter": filters}}
        body: Dict = {
            "query": query_body,
            "highlight": {
                "fields": {
                    "transcript_full": {
                        "fragment_size": 160,
                        "number_of_fragments": 5,
                        "fragmenter": "span",
                    },
                    "transcript_secondary_full": {
                        "fragment_size": 160,
                        "number_of_fragments": 5,
                        "fragmenter": "span",
                    },
                    "title": {"number_of_fragments": 0},
                    "description": {
                        "fragment_size": 160,
                        "number_of_fragments": 1,
                    },
                },
                "require_field_match": False,
                "pre_tags": ["<mark>"],
                "post_tags": ["</mark>"],
                "encoder": "html",
                "max_analyzed_offset": 900000,
            },
        }
        if sort == "newer":
            body["sort"] = [{"date": {"order": "desc"}}]
        elif sort == "older":
            body["sort"] = [{"date": {"order": "asc"}}]
        return body

    if query:
        base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
        if use_phrase:
            should.append(
                {
                    "match_phrase": {
                        "transcript_full": {
                            "query": query,
                            "slop": 2,
                            "boost": 10.0,
                        }
                    }
                }
            )
            should.append(
                {
                    "match_phrase": {
                        "transcript_secondary_full": {
                            "query": query,
                            "slop": 2,
                            "boost": 10.0,
                        }
                    }
                }
            )
        if use_fuzzy:
            should.append(
                {
                    "multi_match": {
                        "query": query,
                        "fields": base_fields,
                        "type": "best_fields",
                        "operator": "and",
                        "fuzziness": "AUTO",
                        "prefix_length": 1,
                        "max_expansions": 50,
                        "boost": 1.5,
                    }
                }
            )
        if use_exact:
            should.append(
                {
                    "multi_match": {
                        "query": query,
                        "fields": base_fields,
                        "type": "best_fields",
                        "operator": "and",
                        "boost": 3.0,
                    }
                }
            )

    if should:
        query_body: Dict = {
            "bool": {
                "should": should,
                "minimum_should_match": 1,
            }
        }
        if filters:
            query_body["bool"]["filter"] = filters
    elif filters:
        query_body = {"bool": {"filter": filters}}
    else:
        query_body = {"match_all": {}}

    body: Dict = {
        "query": query_body,
        "highlight": {
            "fields": {
                "transcript_full": {
                    "fragment_size": 160,
                    "number_of_fragments": 5,
                    "fragmenter": "span",
                },
                "transcript_secondary_full": {
                    "fragment_size": 160,
                    "number_of_fragments": 5,
                    "fragmenter": "span",
                },
                "title": {"number_of_fragments": 0},
                "description": {
                    "fragment_size": 160,
                    "number_of_fragments": 1,
                },
            },
            "require_field_match": False,
            "pre_tags": ["<mark>"],
            "post_tags": ["</mark>"],
            "encoder": "html",
            "max_analyzed_offset": 900000,
        },
    }
    if query_body.get("match_all") is None:
        body["highlight"]["highlight_query"] = copy.deepcopy(query_body)

    if sort == "newer":
        body["sort"] = [{"date": {"order": "desc"}}]
    elif sort == "older":
        body["sort"] = [{"date": {"order": "asc"}}]
    return body


def create_app(config: AppConfig = CONFIG) -> Flask:
    app = Flask(__name__, static_folder=str(Path(__file__).parent / "static"))
    client = _ensure_client(config)
    index = config.elastic.index

    @app.route("/")
    def index_page():
        return send_from_directory(app.static_folder, "index.html")

    @app.route("/static/<path:filename>")
    def static_files(filename: str):
        return send_from_directory(app.static_folder, filename)

    @app.route("/api/channels")
    def channels():
        base_channels_body = {
            "size": 0,
            "aggs": {
                "channels": {
                    "terms": {"field": "channel_id", "size": 200},
                    "aggs": {
                        "name": {
                            "top_hits": {
                                "size": 1,
                                "_source": {"includes": ["channel_name"]},
                            }
                        }
                    },
                }
            },
        }

        def run_channels_request(field_name: str):
            body = json.loads(json.dumps(base_channels_body))  # deep copy
            body["aggs"]["channels"]["terms"]["field"] = field_name
            if config.elastic.debug:
                LOGGER.info(
                    "Elasticsearch channels request: %s",
                    json.dumps({"index": index, "body": body}, indent=2),
                )
            return client.search(index=index, body=body)

        response = None
        last_error = None
        for candidate_field in ("channel_id.keyword", "channel_id"):
            try:
                response = run_channels_request(candidate_field)
                if config.elastic.debug:
                    LOGGER.info("Channels aggregation used field: %s", candidate_field)
                break
            except BadRequestError as exc:
                last_error = exc
                if config.elastic.debug:
                    LOGGER.warning(
                        "Channels aggregation failed for field %s: %s",
                        candidate_field,
                        exc,
                    )
        if response is None:
            raise last_error or RuntimeError("Unable to aggregate channels.")

        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch channels response: %s",
                json.dumps(response, indent=2, default=str),
            )
        buckets = (
            response.get("aggregations", {})
            .get("channels", {})
            .get("buckets", [])
        )
        data = [
            {
                "Id": bucket.get("key"),
                "Name": (
                    bucket.get("name", {})
                    .get("hits", {})
                    .get("hits", [{}])[0]
                    .get("_source", {})
                    .get("channel_name", bucket.get("key"))
                ),
                "Count": bucket.get("doc_count", 0),
            }
            for bucket in buckets
        ]
        data.sort(key=lambda item: item["Name"].lower())
        return jsonify(data)

    @app.route("/api/search")
    def search():
        query = request.args.get("q", "", type=str)
        raw_channels: List[Optional[str]] = request.args.getlist("channel_id")
        legacy_channel = request.args.get("channel", type=str)
        if legacy_channel:
            raw_channels.append(legacy_channel)
        channels = parse_channel_params(raw_channels)
        sort = request.args.get("sort", "relevant", type=str)
        page = max(request.args.get("page", 0, type=int), 0)
        size = max(request.args.get("size", 10, type=int), 1)

        def parse_flag(name: str, default: bool = True) -> bool:
            value = request.args.get(name)
            if value is None:
                return default
            return value.lower() not in {"0", "false", "no"}

        use_exact = parse_flag("exact", True)
        use_fuzzy = parse_flag("fuzzy", True)
        use_phrase = parse_flag("phrase", True)
        use_query_string = parse_flag("query_string", False)
        if use_query_string:
            use_exact = use_fuzzy = use_phrase = False

        payload = build_query_payload(
            query,
            channels=channels,
            sort=sort,
            use_exact=use_exact,
            use_fuzzy=use_fuzzy,
            use_phrase=use_phrase,
            use_query_string=use_query_string,
        )
        start = page * size
        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch search request: %s",
                json.dumps(
                    {
                        "index": index,
                        "from": start,
                        "size": size,
                        "body": payload,
                        "channels": channels,
                        "toggles": {
                            "exact": use_exact,
                            "fuzzy": use_fuzzy,
                            "phrase": use_phrase,
                        },
                    },
                    indent=2,
                ),
            )
        response = client.search(
            index=index,
            from_=start,
            size=size,
            body=payload,
        )
        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch search response: %s",
                json.dumps(response, indent=2, default=str),
            )

        hits = response.get("hits", {})
        total = hits.get("total", {}).get("value", 0)
        documents = []
        for hit in hits.get("hits", []):
            source = hit.get("_source", {})
            highlight_map = hit.get("highlight", {})
            transcript_highlight = (
                (highlight_map.get("transcript_full", []) or [])
                + (highlight_map.get("transcript_secondary_full", []) or [])
            )

            title_html = (
                highlight_map.get("title")
                or [source.get("title") or "Untitled"]
            )[0]
            description_html = (
                highlight_map.get("description")
                or [source.get("description") or ""]
            )[0]
            documents.append(
                {
                    "video_id": source.get("video_id"),
                    "channel_id": source.get("channel_id"),
                    "channel_name": source.get("channel_name"),
                    "title": source.get("title"),
                    "titleHtml": title_html,
                    "description": source.get("description"),
                    "descriptionHtml": description_html,
                    "date": source.get("date"),
                    "url": source.get("url"),
                    "toHighlight": transcript_highlight,
                    "highlightSource": {
                        "primary": bool(highlight_map.get("transcript_full")),
                        "secondary": bool(highlight_map.get("transcript_secondary_full")),
                    },
                }
            )

        return jsonify(
            {
                "items": documents,
                "totalResults": total,
                "totalPages": (total + size - 1) // size,
                "currentPage": page,
            }
        )

    @app.route("/api/metrics")
    def metrics():
        try:
            data = elastic_metrics_payload(
                client,
                index,
                channel_field_candidates=["channel_id.keyword", "channel_id"],
                debug=config.elastic.debug,
            )
        except Exception:
            LOGGER.exception(
                "Falling back to local metrics payload due to Elasticsearch error."
            )
            data = metrics_payload(config.data.root)
        return jsonify(data)

    @app.route("/api/frequency")
    def frequency():
        raw_term = request.args.get("term", type=str) or ""
        use_query_string = request.args.get("query_string", default="0", type=str)
        use_query_string = (use_query_string or "").lower() in {"1", "true", "yes"}
        term = raw_term.strip()
        if not term and not use_query_string:
            return ("term parameter is required", 400)
        if use_query_string and not term:
            term = "*"

        raw_channels: List[Optional[str]] = request.args.getlist("channel_id")
        legacy_channel = request.args.get("channel", type=str)
        if legacy_channel:
            raw_channels.append(legacy_channel)
        channels = parse_channel_params(raw_channels)
        interval = (request.args.get("interval", "month") or "month").lower()
        allowed_intervals = {"day", "week", "month", "quarter", "year"}
        if interval not in allowed_intervals:
            interval = "month"
        start = request.args.get("start", type=str)
        end = request.args.get("end", type=str)

        filters: List[Dict] = []
        channel_filter = build_channel_filter(channels)
        if channel_filter:
            filters.append(channel_filter)
        if start or end:
            range_filter: Dict[str, Dict[str, Dict[str, str]]] = {"range": {"date": {}}}
            if start:
                range_filter["range"]["date"]["gte"] = start
            if end:
                range_filter["range"]["date"]["lte"] = end
            filters.append(range_filter)

        base_fields = ["title^3", "description^2", "transcript_full", "transcript_secondary_full"]
        if use_query_string:
            qs_query = term or "*"
            must_clause: List[Dict[str, Any]] = [
                {
                    "query_string": {
                        "query": qs_query,
                        "default_operator": "AND",
                        "fields": base_fields,
                    }
                }
            ]
        else:
            must_clause = [
                {
                    "multi_match": {
                        "query": term,
                        "fields": base_fields,
                        "type": "best_fields",
                        "operator": "and",
                    }
                }
            ]

        query: Dict[str, Any] = {"bool": {"must": must_clause}}
        if filters:
            query["bool"]["filter"] = filters

        histogram: Dict[str, Any] = {
            "field": "date",
            "calendar_interval": interval,
            "min_doc_count": 0,
        }
        if start or end:
            bounds: Dict[str, str] = {}
            if start:
                bounds["min"] = start
            if end:
                bounds["max"] = end
            if bounds:
                histogram["extended_bounds"] = bounds

        channel_terms_size = max(6, len(channels)) if channels else 6

        body = {
            "size": 0,
            "query": query,
            "aggs": {
                "over_time": {
                    "date_histogram": histogram,
                    "aggs": {
                        "by_channel": {
                            "terms": {
                                "field": "channel_id.keyword",
                                "size": channel_terms_size,
                                "order": {"_count": "desc"},
                            }
                        }
                    },
                }
            },
        }

        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch frequency request: %s",
                json.dumps(
                    {
                        "index": index,
                        "body": body,
                        "term": term,
                        "interval": interval,
                        "channels": channels,
                        "start": start,
                        "end": end,
                        "query_string": use_query_string,
                    },
                    indent=2,
                ),
            )

        response = client.search(index=index, body=body)

        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch frequency response: %s",
                json.dumps(response, indent=2, default=str),
            )

        raw_buckets = (
            response.get("aggregations", {})
            .get("over_time", {})
            .get("buckets", [])
        )

        channel_totals: Dict[str, int] = {}
        buckets: List[Dict[str, Any]] = []
        for bucket in raw_buckets:
            date_str = bucket.get("key_as_string")
            total = bucket.get("doc_count", 0)
            channel_entries: List[Dict[str, Any]] = []
            for ch_bucket in bucket.get("by_channel", {}).get("buckets", []):
                cid = ch_bucket.get("key")
                count = ch_bucket.get("doc_count", 0)
                if cid:
                    channel_entries.append({"id": cid, "count": count})
                    channel_totals[cid] = channel_totals.get(cid, 0) + count
            buckets.append(
                {"date": date_str, "total": total, "channels": channel_entries}
            )

        ranked_channels = sorted(
            [{"id": cid, "total": total} for cid, total in channel_totals.items()],
            key=lambda item: item["total"],
            reverse=True,
        )

        payload = {
            "term": raw_term if not use_query_string else term,
            "interval": interval,
            "buckets": buckets,
            "channels": ranked_channels,
            "totalResults": response.get("hits", {})
            .get("total", {})
            .get("value", 0),
        }
        return jsonify(payload)

    @app.route("/frequency")
    def frequency_page():
        return send_from_directory(app.static_folder, "frequency.html")

    @app.route("/api/transcript")
    def transcript():
        video_id = request.args.get("video_id", type=str)
        if not video_id:
            return ("video_id not set", 400)
        response = client.get(index=index, id=video_id, ignore=[404])
        if config.elastic.debug:
            LOGGER.info(
                "Elasticsearch transcript request: index=%s id=%s", index, video_id
            )
            LOGGER.info(
                "Elasticsearch transcript response: %s",
                json.dumps(response, indent=2, default=str)
                if response
                else "None",
            )
        if not response or not response.get("found"):
            return ("not found", 404)
        source = response["_source"]
        return jsonify(
            {
                "video_id": source.get("video_id"),
                "title": source.get("title"),
                "transcript_parts": source.get("transcript_parts", []),
                "transcript_full": source.get("transcript_full"),
                "transcript_secondary_parts": source.get("transcript_secondary_parts", []),
                "transcript_secondary_full": source.get("transcript_secondary_full"),
            }
        )

    return app


def main() -> None:  # pragma: no cover
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    app = create_app()
    app.run(host="0.0.0.0", port=8080, debug=True)


if __name__ == "__main__":  # pragma: no cover
    main()
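
For quick checks without a browser, Flask's test client exercises the routes above in-process (a sketch; the request itself still needs a reachable Elasticsearch node):

```python
# Hit /api/search in-process via Flask's test client.
from python_app.search_app import create_app

app = create_app()
with app.test_client() as http:
    resp = http.get("/api/search?q=meaning&size=3")
    print(resp.status_code, resp.get_json()["totalResults"])
```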

static/app.js (new file, 733 lines)
@@ -0,0 +1,733 @@
|
|||||||
|
(() => {
|
||||||
|
let qs = new URLSearchParams(window.location.search);
|
||||||
|
const qInput = document.getElementById("q");
|
||||||
|
const channelDropdown = document.getElementById("channelDropdown");
|
||||||
|
const channelSummary = document.getElementById("channelSummary");
|
||||||
|
const channelOptions = document.getElementById("channelOptions");
|
||||||
|
const sortSel = document.getElementById("sort");
|
||||||
|
const sizeSel = document.getElementById("size");
|
||||||
|
const exactToggle = document.getElementById("exactToggle");
|
||||||
|
const fuzzyToggle = document.getElementById("fuzzyToggle");
|
||||||
|
const phraseToggle = document.getElementById("phraseToggle");
|
||||||
|
const queryToggle = document.getElementById("queryStringToggle");
|
||||||
|
const searchBtn = document.getElementById("searchBtn");
|
||||||
|
const resultsDiv = document.getElementById("results");
|
||||||
|
const metaDiv = document.getElementById("meta");
|
||||||
|
const metricsContainer = document.getElementById("metrics");
|
||||||
|
const metricsStatus = document.getElementById("metricsStatus");
|
||||||
|
const metricsContent = document.getElementById("metricsContent");
|
||||||
|
const freqSummary = document.getElementById("frequencySummary");
|
||||||
|
const freqChart = document.getElementById("frequencyChart");
|
||||||
|
const channelMap = new Map();
|
||||||
|
const selectedChannels = new Set();
|
||||||
|
let pendingChannelSelection = [];
|
||||||
|
let channelsReady = false;
|
||||||
|
let suppressChannelChange = false;
|
||||||
|
let allChannelsCheckbox = null;
|
||||||
|
let previousToggleState = { exact: true, fuzzy: true, phrase: true };
|
||||||
|
let currentPage =
|
||||||
|
parseInt(qs.get("page") || "0", 10) ||
|
||||||
|
0;
|
||||||
|
|
||||||
|
function parseBoolParam(name, defaultValue) {
|
||||||
|
const raw = qs.get(name);
|
||||||
|
if (raw === null) return defaultValue;
|
||||||
|
const lowered = raw.toLowerCase();
|
||||||
|
return !["0", "false", "no"].includes(lowered);
|
||||||
|
}
|
||||||
|
|
||||||
|
function parseChannelParams(params) {
|
||||||
|
const collected = [];
|
||||||
|
if (!params) return collected;
|
||||||
|
const seen = new Set();
|
||||||
|
const rawValues = params.getAll("channel_id");
|
||||||
|
const legacy = params.get("channel");
|
||||||
|
if (legacy) rawValues.push(legacy);
|
||||||
|
rawValues.forEach((value) => {
|
||||||
|
if (value == null) return;
|
||||||
|
String(value)
|
||||||
|
.split(",")
|
||||||
|
.map((part) => part.trim())
|
||||||
|
.filter((part) => part && part.toLowerCase() !== "all")
|
||||||
|
.forEach((part) => {
|
||||||
|
if (!seen.has(part)) {
|
||||||
|
seen.add(part);
|
||||||
|
collected.push(part);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
return collected;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getSelectedChannels() {
|
||||||
|
return Array.from(selectedChannels);
|
||||||
|
}
|
||||||
|
|
||||||
|
function ensureAllCheckboxState() {
|
||||||
|
if (allChannelsCheckbox) {
|
||||||
|
allChannelsCheckbox.checked = selectedChannels.size === 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateChannelSummary() {
|
||||||
|
if (!channelSummary) return;
|
||||||
|
if (!selectedChannels.size) {
|
||||||
|
channelSummary.textContent = "All Channels";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const names = Array.from(selectedChannels).map(
|
||||||
|
(id) => channelMap.get(id) || id
|
||||||
|
);
|
||||||
|
if (names.length > 1) {
|
||||||
|
names.sort((a, b) => a.localeCompare(b, undefined, { sensitivity: "base" }));
|
||||||
|
}
|
||||||
|
let label = names.slice(0, 3).join(", ");
|
||||||
|
if (names.length > 3) {
|
||||||
|
label += ` +${names.length - 3} more`;
|
||||||
|
}
|
||||||
|
channelSummary.textContent = label;
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyChannelSelection(ids, { silent = false } = {}) {
|
||||||
|
selectedChannels.clear();
|
||||||
|
ids.forEach((id) => selectedChannels.add(id));
|
||||||
|
pendingChannelSelection = getSelectedChannels();
|
||||||
|
ensureAllCheckboxState();
|
||||||
|
if (channelOptions) {
|
||||||
|
suppressChannelChange = true;
|
||||||
|
const checkboxes = channelOptions.querySelectorAll(
|
||||||
|
'input[type="checkbox"][data-channel="1"]'
|
||||||
|
);
|
||||||
|
checkboxes.forEach((checkbox) => {
|
||||||
|
checkbox.checked = selectedChannels.has(checkbox.value);
|
||||||
|
});
|
||||||
|
suppressChannelChange = false;
|
||||||
|
}
|
||||||
|
updateChannelSummary();
|
||||||
|
if (!silent && channelsReady) {
|
||||||
|
runSearch(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function setFromQuery() {
|
||||||
|
qInput.value = qs.get("q") || "";
|
||||||
|
sortSel.value = qs.get("sort") || "relevant";
|
||||||
|
sizeSel.value = qs.get("size") || "10";
|
||||||
|
pendingChannelSelection = parseChannelParams(qs);
|
||||||
|
applyChannelSelection(pendingChannelSelection, { silent: true });
|
||||||
|
exactToggle.checked = parseBoolParam("exact", true);
|
||||||
|
fuzzyToggle.checked = parseBoolParam("fuzzy", true);
|
||||||
|
phraseToggle.checked = parseBoolParam("phrase", true);
|
||||||
|
queryToggle.checked = parseBoolParam("query_string", false);
|
||||||
|
applyQueryMode();
|
||||||
|
rememberToggleState();
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyQueryMode() {
|
||||||
|
if (!queryToggle) return;
|
||||||
|
if (queryToggle.checked) {
|
||||||
|
if (!exactToggle.disabled) {
|
||||||
|
previousToggleState = {
|
||||||
|
exact: exactToggle.checked,
|
||||||
|
fuzzy: fuzzyToggle.checked,
|
||||||
|
phrase: phraseToggle.checked,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
exactToggle.checked = false;
|
||||||
|
fuzzyToggle.checked = false;
|
||||||
|
phraseToggle.checked = false;
|
||||||
|
exactToggle.disabled = true;
|
||||||
|
fuzzyToggle.disabled = true;
|
||||||
|
phraseToggle.disabled = true;
|
||||||
|
} else {
|
||||||
|
exactToggle.disabled = false;
|
||||||
|
fuzzyToggle.disabled = false;
|
||||||
|
phraseToggle.disabled = false;
|
||||||
|
exactToggle.checked = previousToggleState.exact;
|
||||||
|
fuzzyToggle.checked = previousToggleState.fuzzy;
|
||||||
|
phraseToggle.checked = previousToggleState.phrase;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function rememberToggleState() {
|
||||||
|
if (queryToggle && !queryToggle.checked) {
|
||||||
|
previousToggleState = {
|
||||||
|
exact: !!exactToggle.checked,
|
||||||
|
fuzzy: !!fuzzyToggle.checked,
|
||||||
|
phrase: !!phraseToggle.checked,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (channelOptions) {
|
||||||
|
channelOptions.addEventListener("change", (event) => {
|
||||||
|
const target = event.target;
|
||||||
|
if (!(target instanceof HTMLInputElement) || target.type !== "checkbox") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (suppressChannelChange) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (target.dataset.all === "1") {
|
||||||
|
if (!target.checked && !selectedChannels.size) {
|
||||||
|
suppressChannelChange = true;
|
||||||
|
target.checked = true;
|
||||||
|
suppressChannelChange = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (target.checked) {
|
||||||
|
selectedChannels.clear();
|
||||||
|
pendingChannelSelection = [];
|
||||||
|
suppressChannelChange = true;
|
||||||
|
const others = channelOptions.querySelectorAll(
|
||||||
|
'input[type="checkbox"][data-channel="1"]'
|
||||||
|
);
|
||||||
|
others.forEach((checkbox) => {
|
||||||
|
checkbox.checked = false;
|
||||||
|
});
|
||||||
|
suppressChannelChange = false;
|
||||||
|
ensureAllCheckboxState();
|
||||||
|
updateChannelSummary();
|
||||||
|
if (channelsReady) {
|
||||||
|
runSearch(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const id = target.value;
|
||||||
|
if (!id) return;
|
||||||
|
if (target.checked) {
|
||||||
|
selectedChannels.add(id);
|
||||||
|
} else {
|
||||||
|
selectedChannels.delete(id);
|
||||||
|
}
|
||||||
|
pendingChannelSelection = getSelectedChannels();
|
||||||
|
ensureAllCheckboxState();
|
||||||
|
updateChannelSummary();
|
||||||
|
if (channelsReady) {
|
||||||
|
runSearch(0);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
  async function loadChannels() {
    if (!channelOptions) {
      channelsReady = true;
      return;
    }
    try {
      const res = await fetch("/api/channels");
      const data = await res.json();
      channelMap.clear();
      channelOptions.innerHTML = "";

      const listFragment = document.createDocumentFragment();

      const allLabel = document.createElement("label");
      allLabel.className = "channel-option";
      allChannelsCheckbox = document.createElement("input");
      allChannelsCheckbox.type = "checkbox";
      allChannelsCheckbox.dataset.all = "1";
      allChannelsCheckbox.checked = selectedChannels.size === 0;
      const allText = document.createElement("span");
      allText.textContent = "All Channels";
      allLabel.appendChild(allChannelsCheckbox);
      allLabel.appendChild(allText);
      listFragment.appendChild(allLabel);

      data.forEach((item) => {
        const label = document.createElement("label");
        label.className = "channel-option";
        const checkbox = document.createElement("input");
        checkbox.type = "checkbox";
        checkbox.value = item.Id;
        checkbox.dataset.channel = "1";
        const text = document.createElement("span");
        text.textContent = `${item.Name} (${item.Count})`;
        label.appendChild(checkbox);
        label.appendChild(text);
        listFragment.appendChild(label);
        channelMap.set(item.Id, item.Name);
      });

      channelOptions.appendChild(listFragment);

      if (!data.length) {
        const empty = document.createElement("div");
        empty.textContent = "No channels available.";
        channelOptions.appendChild(empty);
      }

      const initialSelection = pendingChannelSelection.length
        ? pendingChannelSelection
        : Array.from(selectedChannels);
      applyChannelSelection(initialSelection, { silent: true });
      channelsReady = true;
      updateChannelSummary();
    } catch (err) {
      console.error("Failed to load channels", err);
      channelOptions.innerHTML = "<div>Failed to load channels.</div>";
      channelsReady = true;
      ensureAllCheckboxState();
      updateChannelSummary();
    }
  }
  function updateUrl(q, sort, channels, page, size, exact, fuzzy, phrase, queryMode) {
    const next = new URL(window.location.href);
    next.searchParams.set("q", q);
    next.searchParams.set("sort", sort);
    next.searchParams.delete("channel_id");
    next.searchParams.delete("channel");
    channels.forEach((id) => next.searchParams.append("channel_id", id));
    next.searchParams.set("page", page);
    next.searchParams.set("size", size);
    next.searchParams.set("exact", exact ? "1" : "0");
    next.searchParams.set("fuzzy", fuzzy ? "1" : "0");
    next.searchParams.set("phrase", phrase ? "1" : "0");
    next.searchParams.set("query_string", queryMode ? "1" : "0");
    history.pushState({}, "", next.toString());
  }

  function fmtDate(value) {
    try {
      return (value || "").split("T")[0];
    } catch {
      return value;
    }
  }

  function fmtNumber(n) {
    if (typeof n === "number") return n.toLocaleString();
    return n;
  }

  // Transcript viewer functionality removed.
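
  // Editor's note: updateUrl writes the same parameter names that runSearch
  // later sends to /api/search, so reload and back/forward navigation can
  // rebuild a query from the address bar alone. A sketch of the round-trip,
  // with illustrative values:
  //
  //   updateUrl("corner", "newer", ["UC123"], 0, 25, true, false, true, false);
  //   // -> ...?q=corner&sort=newer&channel_id=UC123&page=0&size=25
  //   //        &exact=1&fuzzy=0&phrase=1&query_string=0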
  function renderMetrics(data) {
    if (!metricsContent) return;
    metricsContent.innerHTML = "";
    if (!data) return;

    if (metricsStatus) {
      metricsStatus.textContent = "";
    }

    const summary = document.createElement("div");
    summary.innerHTML = `<strong>Entries:</strong> ${fmtNumber(data.totalItems)} • <strong>Channels:</strong> ${fmtNumber(data.totalChannels)}`;
    metricsContent.appendChild(summary);

    if (Array.isArray(data.itemsPerChannel) && data.itemsPerChannel.length) {
      const top = data.itemsPerChannel.slice(0, 5);
      const channelHeader = document.createElement("div");
      channelHeader.style.marginTop = "8px";
      channelHeader.innerHTML = "<strong>Top Channels</strong>";
      metricsContent.appendChild(channelHeader);

      const channelList = document.createElement("div");
      channelList.className = "muted";
      top.forEach((entry) => {
        const row = document.createElement("div");
        row.textContent = `${entry.label}: ${fmtNumber(entry.count)}`;
        channelList.appendChild(row);
      });
      metricsContent.appendChild(channelList);
    }
  }

  async function loadMetrics() {
    if (!metricsContainer) return;
    metricsContainer.dataset.loading = "1";
    if (!metricsContainer.dataset.loaded && metricsStatus) {
      metricsStatus.textContent = "Loading metrics…";
    }
    try {
      const res = await fetch("/api/metrics");
      const data = await res.json();
      renderMetrics(data);
      metricsContainer.dataset.loaded = "1";
    } catch (err) {
      console.error("Failed to load metrics", err);
      if (!metricsContainer.dataset.loaded && metricsStatus) {
        metricsStatus.textContent = "Metrics unavailable.";
      }
    } finally {
      delete metricsContainer.dataset.loading;
    }
  }
  function clearFrequency(message) {
    if (freqSummary) {
      freqSummary.textContent = message || "";
    }
    if (freqChart) {
      freqChart.innerHTML = "";
    }
  }

  function renderFrequencyChart(buckets, channelTotals) {
    if (!freqChart || typeof d3 === "undefined") {
      return;
    }
    freqChart.innerHTML = "";
    if (!buckets.length) {
      clearFrequency("No matches for this query.");
      return;
    }

    let channelsOrder =
      (channelTotals && channelTotals.length
        ? channelTotals.map((entry) => entry.id)
        : []) || [];
    if (!channelsOrder.length) {
      const unique = new Set();
      buckets.forEach((bucket) => {
        (bucket.channels || []).forEach((entry) => unique.add(entry.id));
      });
      channelsOrder = Array.from(unique);
    }
    channelsOrder = channelsOrder.slice(0, 6);
    if (!channelsOrder.length) {
      clearFrequency("No matches for this query.");
      return;
    }

    const dateKeyFormat = d3.timeFormat("%Y-%m-%d");
    const parsed = buckets
      .map((bucket) => {
        const parsedDate = d3.isoParse(bucket.date) || new Date(bucket.date);
        if (!(parsedDate instanceof Date) || Number.isNaN(parsedDate.valueOf())) {
          return null;
        }
        const counts = {};
        (bucket.channels || []).forEach((entry) => {
          if (channelsOrder.includes(entry.id)) {
            counts[entry.id] = entry.count || 0;
          }
        });
        return {
          date: parsedDate,
          dateKey: dateKeyFormat(parsedDate),
          counts,
        };
      })
      .filter(Boolean);

    if (!parsed.length) {
      clearFrequency("Timeline unavailable.");
      return;
    }

    const margin = { top: 12, right: 12, bottom: 52, left: 56 };
    const fullWidth = freqChart.clientWidth || 360;
    const fullHeight = 220;
    const width = fullWidth - margin.left - margin.right;
    const height = fullHeight - margin.top - margin.bottom;

    const svg = d3
      .select(freqChart)
      .append("svg")
      .attr("width", fullWidth)
      .attr("height", fullHeight);

    const g = svg
      .append("g")
      .attr("transform", `translate(${margin.left},${margin.top})`);

    const x = d3
      .scaleBand()
      .domain(parsed.map((entry) => entry.dateKey))
      .range([0, width])
      .padding(0.25);

    const yMax = d3.max(parsed, (entry) =>
      d3.sum(channelsOrder, (key) => entry.counts[key] || 0)
    );

    const y = d3
      .scaleLinear()
      .domain([0, yMax || 0])
      .nice()
      .range([height, 0]);

    const tickValues =
      parsed.length <= 6
        ? parsed.map((entry) => entry.dateKey)
        : parsed
            .filter((_, index, arr) => index % Math.ceil(arr.length / 6) === 0)
            .map((entry) => entry.dateKey);

    const xAxis = d3.axisBottom(x).tickValues(tickValues);
    const yAxis = d3.axisLeft(y).ticks(5);

    g.append("g")
      .attr("class", "axis")
      .attr("transform", `translate(0,${height})`)
      .call(xAxis)
      .selectAll("text")
      .attr("text-anchor", "end")
      .attr("transform", "rotate(-35)")
      .attr("dx", "-0.8em")
      .attr("dy", "0.15em");

    g.append("g").attr("class", "axis").call(yAxis);

    const stack = d3
      .stack()
      .keys(channelsOrder)
      .value((entry, key) => entry.counts[key] || 0);
    const stacked = stack(parsed);
    const color = d3.scaleOrdinal(channelsOrder, d3.schemeTableau10);

    const layers = g
      .selectAll(".freq-layer")
      .data(stacked)
      .enter()
      .append("g")
      .attr("class", "freq-layer")
      .attr("fill", (d) => color(d.key));

    layers
      .selectAll("rect")
      .data((d) => d)
      .enter()
      .append("rect")
      .attr("x", (d) => x(d.data.dateKey))
      .attr("width", x.bandwidth())
      .attr("y", (d) => y(d[1]))
      .attr("height", (d) => y(d[0]) - y(d[1]))
      .append("title")
      .text(function (d) {
        const group = this.parentNode ? this.parentNode.parentNode : null;
        const key = group ? d3.select(group).datum().key : undefined;
        const label = key ? channelMap.get(key) || key : key || "";
        return `${dateKeyFormat(d.data.date)}: ${d[1] - d[0]}${label ? " (" + label + ")" : ""}`;
      });

    const legend = document.createElement("div");
    legend.className = "freq-legend";
    channelsOrder.forEach((key) => {
      const item = document.createElement("div");
      item.className = "freq-legend-item";
      const swatch = document.createElement("span");
      swatch.className = "freq-legend-swatch";
      swatch.style.backgroundColor = color(key);
      const label = document.createElement("span");
      label.textContent = channelMap.get(key) || key;
      item.appendChild(swatch);
      item.appendChild(label);
      legend.appendChild(item);
    });
    freqChart.appendChild(legend);
  }
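
  // Editor's note: renderFrequencyChart expects buckets shaped roughly like
  //
  //   [{ date: "2021-06-01T00:00:00Z",
  //      channels: [{ id: "UC123", count: 4 }, ...] }, ...]
  //
  // and channelTotals as [{ id: "UC123", count: 120 }, ...], whose order
  // decides which six channels get stacked. This shape is inferred from the
  // property accesses above; the authoritative contract is the Flask
  // /api/frequency handler.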
  async function updateFrequencyChart(term, channels, queryMode) {
    if (!freqChart || typeof d3 === "undefined") {
      return;
    }
    let trimmed = term.trim();
    if (!trimmed) {
      if (queryMode) {
        trimmed = "*";
      } else {
        clearFrequency("Enter a query to see timeline.");
        return;
      }
    }

    const params = new URLSearchParams();
    params.set("term", trimmed);
    params.set("interval", "month");
    (channels || []).forEach((id) => params.append("channel_id", id));
    if (queryMode) {
      params.set("query_string", "1");
    }

    clearFrequency("Loading timeline…");
    try {
      const res = await fetch(`/api/frequency?${params.toString()}`);
      if (!res.ok) {
        throw new Error(`Request failed with status ${res.status}`);
      }
      const payload = await res.json();
      const total = payload.totalResults || 0;
      if (freqSummary) {
        if (total === 0) {
          freqSummary.textContent = "No matches for this query.";
        } else if (queryMode) {
          freqSummary.textContent = `Matches: ${total.toLocaleString()} • Interval: ${payload.interval || "month"} (query-string)`;
        } else {
          freqSummary.textContent = `Matches: ${total.toLocaleString()} • Interval: ${payload.interval || "month"}`;
        }
      }
      if (total === 0) {
        freqChart.innerHTML = "";
        return;
      }
      renderFrequencyChart(payload.buckets || [], payload.channels || []);
    } catch (err) {
      console.error(err);
      clearFrequency("Timeline unavailable.");
    }
  }
  function renderResults(payload, page) {
    resultsDiv.innerHTML = "";
    metaDiv.textContent = `Total: ${payload.totalResults} • Page ${
      page + 1
    } of ${payload.totalPages}`;

    (payload.items || []).forEach((item) => {
      const el = document.createElement("div");
      el.className = "item";
      const titleHtml =
        item.titleHtml || escapeHtml(item.title || "Untitled");
      const descriptionHtml =
        item.descriptionHtml || escapeHtml(item.description || "");

      const header = document.createElement("div");
      const badges = [];
      if (item.highlightSource && item.highlightSource.primary) badges.push("primary transcript");
      if (item.highlightSource && item.highlightSource.secondary) badges.push("secondary transcript");
      const badgeHtml = badges.length
        ? `<div class="badge-row">${badges
            .map((b) => `<span class="badge">${escapeHtml(b)}</span>`)
            .join("")}</div>`
        : "";
      header.innerHTML = `
        <strong>${titleHtml}</strong>
        <div class="muted">${escapeHtml(item.channel_name || "")} • ${fmtDate(
          item.date
        )}</div>
        <div class="muted"><a href="${item.url}" target="_blank" rel="noopener">Open on YouTube</a></div>
        ${badgeHtml}
      `;
      el.appendChild(header);

      if (descriptionHtml) {
        const desc = document.createElement("div");
        desc.className = "muted";
        desc.innerHTML = descriptionHtml;
        el.appendChild(desc);
      }

      if (Array.isArray(item.toHighlight) && item.toHighlight.length) {
        const highlights = document.createElement("div");
        highlights.className = "transcript highlight-list";
        item.toHighlight.forEach((entry) => {
          const html = typeof entry === "string" ? entry : entry?.html;
          if (!html) return;
          const row = document.createElement("div");
          row.className = "highlight-row";
          row.innerHTML = html;
          highlights.appendChild(row);
        });
        if (highlights.childElementCount) {
          el.appendChild(highlights);
        }
      }

      resultsDiv.appendChild(el);
    });

    const pager = document.createElement("div");
    pager.className = "pager";
    const prev = document.createElement("button");
    prev.textContent = "Prev";
    prev.disabled = page <= 0;
    const next = document.createElement("button");
    next.textContent = "Next";
    next.disabled = page + 1 >= payload.totalPages;
    prev.onclick = () => runSearch(page - 1);
    next.onclick = () => runSearch(page + 1);
    pager.appendChild(prev);
    pager.appendChild(next);
    resultsDiv.appendChild(pager);
  }
  async function runSearch(pageOverride, pushState = true) {
    const q = qInput.value.trim();
    const channels = getSelectedChannels();
    const sort = sortSel.value;
    const size = parseInt(sizeSel.value, 10) || 10;
    const queryMode = queryToggle && queryToggle.checked;
    let exact = !!exactToggle.checked;
    let fuzzy = !!fuzzyToggle.checked;
    let phrase = !!phraseToggle.checked;
    if (queryMode) {
      exact = false;
      fuzzy = false;
      phrase = false;
    } else {
      previousToggleState = {
        exact,
        fuzzy,
        phrase,
      };
    }
    const page = pageOverride != null ? pageOverride : currentPage;
    currentPage = page;

    if (pushState) {
      updateUrl(q, sort, channels, page, size, exact, fuzzy, phrase, queryMode);
    }

    const params = new URLSearchParams();
    params.set("q", q);
    params.set("sort", sort);
    params.set("size", String(size));
    params.set("page", String(page));
    params.set("exact", exact ? "1" : "0");
    params.set("fuzzy", fuzzy ? "1" : "0");
    params.set("phrase", phrase ? "1" : "0");
    params.set("query_string", queryMode ? "1" : "0");
    channels.forEach((id) => params.append("channel_id", id));

    const res = await fetch(`/api/search?${params.toString()}`);
    const payload = await res.json();
    renderResults(payload, page);
    updateFrequencyChart(q, channels, queryMode);
  }
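
  // Editor's note: runSearch issues GET /api/search with q, sort, size, page,
  // exact/fuzzy/phrase/query_string as "1"/"0" flags, and one channel_id per
  // selected channel; these are the same names updateUrl writes to the address
  // bar, which is what lets the popstate handler below restore state.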
  searchBtn.addEventListener("click", () => runSearch(0));
  qInput.addEventListener("keypress", (e) => {
    if (e.key === "Enter") runSearch(0);
  });
  sortSel.addEventListener("change", () => runSearch(0));
  sizeSel.addEventListener("change", () => runSearch(0));
  exactToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); });
  fuzzyToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); });
  phraseToggle.addEventListener("change", () => { rememberToggleState(); runSearch(0); });
  if (queryToggle) {
    queryToggle.addEventListener("change", () => { applyQueryMode(); runSearch(0); });
  }

  window.addEventListener("popstate", () => {
    qs = new URLSearchParams(window.location.search);
    setFromQuery();
    currentPage = parseInt(qs.get("page") || "0", 10) || 0;
    runSearch(currentPage, false);
  });

  setFromQuery();
  loadMetrics();
  loadChannels().then(() => runSearch(currentPage));
})();
function escapeHtml(str) {
  // Escape the characters that matter for innerHTML; the entity strings were
  // decoded by the diff viewer and are restored here.
  return (str || "").replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case "<":
        return "&lt;";
      case ">":
        return "&gt;";
      case '"':
        return "&quot;";
      case "'":
        return "&#39;";
      default:
        return ch;
    }
  });
}
68
static/frequency.html
Normal file
@ -0,0 +1,68 @@
<!doctype html>
<html>
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Term Frequency Explorer</title>
  <link rel="stylesheet" href="/static/style.css" />
  <style>
    #chart {
      margin-top: 24px;
    }
    svg {
      max-width: 100%;
    }
    .axis path,
    .axis line {
      stroke: #ccc;
    }
    .line {
      fill: none;
      stroke: #0b6efd;
      stroke-width: 2px;
    }
    .dot {
      fill: #0b6efd;
      stroke: white;
      stroke-width: 1px;
    }
    .controls label {
      display: flex;
      align-items: center;
      gap: 6px;
    }
  </style>
  <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
</head>
<body>
  <header>
    <h1>Term Frequency Explorer</h1>
    <p class="muted">
      Pick a term to see how often it appears over time. <a href="/">Back to search</a>
    </p>
  </header>

  <section class="controls">
    <input id="term" type="text" placeholder="Term (e.g. meaning)" size="28" />
    <select id="channel">
      <option value="all">All Channels</option>
    </select>
    <select id="interval">
      <option value="month">Per Month</option>
      <option value="week">Per Week</option>
      <option value="day">Per Day</option>
      <option value="quarter">Per Quarter</option>
      <option value="year">Per Year</option>
    </select>
    <input id="start" type="date" />
    <input id="end" type="date" />
    <button id="runBtn">Run</button>
  </section>

  <section id="summary" class="muted"></section>
  <section id="chart"></section>

  <script src="/static/frequency.js"></script>
</body>
</html>
222
static/frequency.js
Normal file
@ -0,0 +1,222 @@
(() => {
  let qs = new URLSearchParams(window.location.search);

  const termInput = document.getElementById("term");
  const channelSel = document.getElementById("channel");
  const intervalSel = document.getElementById("interval");
  const startInput = document.getElementById("start");
  const endInput = document.getElementById("end");
  const runBtn = document.getElementById("runBtn");
  const summaryDiv = document.getElementById("summary");
  const chartDiv = document.getElementById("chart");

  function parseParams() {
    return {
      term: qs.get("term") || "",
      channel: qs.get("channel_id") || "all",
      interval: qs.get("interval") || "month",
      start: qs.get("start") || "",
      end: qs.get("end") || "",
    };
  }

  function setFormFromParams() {
    const params = parseParams();
    termInput.value = params.term;
    intervalSel.value = params.interval;
    startInput.value = params.start;
    endInput.value = params.end;
    return params;
  }

  function updateUrl(params) {
    const url = new URL(window.location.href);
    url.searchParams.set("term", params.term);
    url.searchParams.set("channel_id", params.channel);
    url.searchParams.set("interval", params.interval);
    if (params.start) url.searchParams.set("start", params.start);
    else url.searchParams.delete("start");
    if (params.end) url.searchParams.set("end", params.end);
    else url.searchParams.delete("end");
    history.pushState({}, "", url.toString());
    qs = new URLSearchParams(url.search);
  }

  async function loadChannels(initialValue) {
    try {
      const res = await fetch("/api/channels");
      const data = await res.json();
      data.forEach((item) => {
        const opt = document.createElement("option");
        opt.value = item.Id;
        opt.textContent = `${item.Name} (${item.Count})`;
        channelSel.appendChild(opt);
      });
    } catch (err) {
      console.error("Failed to load channels", err);
    }
    channelSel.value = initialValue || "all";
  }
  function drawChart(data) {
    chartDiv.innerHTML = "";
    if (!data.length) {
      const msg = document.createElement("div");
      msg.className = "muted";
      msg.textContent = "No matching documents for this term.";
      chartDiv.appendChild(msg);
      return;
    }

    const parsed = data
      .map((d) => ({
        date: d3.isoParse(d.date) || new Date(d.date),
        value: d.count,
      }))
      .filter((d) => d.date instanceof Date && !Number.isNaN(d.date.valueOf()));

    if (!parsed.length) {
      const msg = document.createElement("div");
      msg.className = "muted";
      msg.textContent = "Unable to parse dates for this series.";
      chartDiv.appendChild(msg);
      return;
    }

    const margin = { top: 20, right: 30, bottom: 40, left: 56 };
    const fullWidth = chartDiv.clientWidth || 900;
    const fullHeight = 360;
    const width = fullWidth - margin.left - margin.right;
    const height = fullHeight - margin.top - margin.bottom;

    const svg = d3
      .select(chartDiv)
      .append("svg")
      .attr("width", fullWidth)
      .attr("height", fullHeight);

    const g = svg
      .append("g")
      .attr("transform", `translate(${margin.left},${margin.top})`);

    const x = d3
      .scaleTime()
      .domain(d3.extent(parsed, (d) => d.date))
      .range([0, width]);

    const y = d3
      .scaleLinear()
      .domain([0, d3.max(parsed, (d) => d.value) || 0])
      .nice()
      .range([height, 0]);

    const xAxis = d3.axisBottom(x).ticks(6).tickFormat(d3.timeFormat("%Y-%m-%d"));
    const yAxis = d3.axisLeft(y).ticks(6);

    g.append("g")
      .attr("class", "axis")
      .attr("transform", `translate(0,${height})`)
      .call(xAxis)
      .selectAll("text")
      .attr("text-anchor", "end")
      .attr("transform", "rotate(-35)")
      .attr("dx", "-0.8em")
      .attr("dy", "0.15em");

    g.append("g").attr("class", "axis").call(yAxis);

    const line = d3
      .line()
      .x((d) => x(d.date))
      .y((d) => y(d.value));

    g.append("path")
      .datum(parsed)
      .attr("class", "line")
      .attr("d", line);

    g.selectAll(".dot")
      .data(parsed)
      .enter()
      .append("circle")
      .attr("class", "dot")
      .attr("r", 3)
      .attr("cx", (d) => x(d.date))
      .attr("cy", (d) => y(d.value))
      .append("title")
      .text((d) => `${d3.timeFormat("%Y-%m-%d")(d.date)}: ${d.value}`);
  }
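
  // Editor's note: drawChart expects each bucket as
  //   { date: <ISO-8601 string>, count: <number> }
  // (inferred from the accessors above); buckets whose dates fail to parse
  // are dropped rather than plotted at the epoch.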
  async function runFrequency(pushState = true) {
    const term = termInput.value.trim();
    if (!term) {
      summaryDiv.textContent = "Enter a term to begin.";
      chartDiv.innerHTML = "";
      return;
    }

    const params = {
      term,
      channel: channelSel.value,
      interval: intervalSel.value,
      start: startInput.value,
      end: endInput.value,
    };

    if (pushState) updateUrl(params);

    const search = new URLSearchParams();
    search.set("term", term);
    if (params.channel && params.channel !== "all") {
      search.set("channel_id", params.channel);
    }
    search.set("interval", params.interval);
    if (params.start) search.set("start", params.start);
    if (params.end) search.set("end", params.end);

    summaryDiv.textContent = "Loading…";
    chartDiv.innerHTML = "";

    try {
      const res = await fetch(`/api/frequency?${search.toString()}`);
      if (!res.ok) {
        throw new Error(`Request failed: ${res.status}`);
      }
      const payload = await res.json();
      const total = payload.totalResults || 0;
      summaryDiv.textContent = `Matches: ${total.toLocaleString()} • Buckets: ${
        (payload.buckets || []).length
      } • Interval: ${payload.interval}`;
      drawChart(payload.buckets || []);
    } catch (err) {
      console.error(err);
      summaryDiv.textContent = "Failed to load data.";
    }
  }

  runBtn.addEventListener("click", () => runFrequency());
  termInput.addEventListener("keypress", (e) => {
    if (e.key === "Enter") runFrequency();
  });
  intervalSel.addEventListener("change", () => runFrequency());
  channelSel.addEventListener("change", () => runFrequency());
  startInput.addEventListener("change", () => runFrequency());
  endInput.addEventListener("change", () => runFrequency());

  window.addEventListener("popstate", () => {
    qs = new URLSearchParams(window.location.search);
    const params = setFormFromParams();
    channelSel.value = params.channel;
    runFrequency(false);
  });

  const initialParams = setFormFromParams();
  loadChannels(initialParams.channel).then(() => {
    if (initialParams.term) {
      runFrequency(false);
    } else {
      summaryDiv.textContent = "Enter a term to begin.";
    }
  });
})();
63
static/index.html
Normal file
@ -0,0 +1,63 @@
<!doctype html>
<html>
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>This Little Corner (Python)</title>
  <link rel="stylesheet" href="/static/style.css" />
  <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
</head>
<body>
  <header>
    <h1>This Little Corner — Elastic Search</h1>
    <p class="muted">
      Enter a phrase to query title, description, and transcript text.
    </p>
  </header>

  <section class="controls">
    <input id="q" type="text" placeholder="Search..." size="40" />
    <details id="channelDropdown" class="channel-dropdown">
      <summary id="channelSummary">All Channels</summary>
      <div id="channelOptions" class="channel-options muted">
        <div>Loading channels…</div>
      </div>
    </details>
    <select id="sort">
      <option value="relevant">Most relevant</option>
      <option value="newer">Newest first</option>
      <option value="older">Oldest first</option>
    </select>
    <select id="size">
      <option value="10">10</option>
      <option value="25">25</option>
      <option value="50">50</option>
    </select>
    <button id="searchBtn">Search</button>
  </section>
  <section class="controls muted">
    <label><input type="checkbox" id="exactToggle" checked /> Exact</label>
    <label><input type="checkbox" id="fuzzyToggle" checked /> Fuzzy</label>
    <label><input type="checkbox" id="phraseToggle" checked /> Phrase</label>
    <label><input type="checkbox" id="queryStringToggle" /> Query string mode</label>
  </section>

  <section class="summary-row">
    <div class="summary-left">
      <section id="meta" class="muted"></section>
      <section id="metrics">
        <div id="metricsStatus" class="muted"></div>
        <div id="metricsContent"></div>
      </section>
    </div>
    <div class="summary-right">
      <section id="frequencySummary" class="muted"></section>
      <div id="frequencyChart"></div>
    </div>
  </section>

  <section id="results"></section>

  <script src="/static/app.js"></script>
</body>
</html>
225
static/style.css
Normal file
@ -0,0 +1,225 @@
body {
  font-family: Arial, sans-serif;
  margin: 24px;
  color: #222;
}

header {
  margin-bottom: 16px;
}

.controls {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  align-items: center;
  margin-bottom: 12px;
}

.channel-dropdown {
  position: relative;
  min-width: 220px;
  flex: 0 1 260px;
}

.channel-dropdown summary {
  list-style: none;
  cursor: pointer;
  border: 1px solid #ccc;
  border-radius: 4px;
  padding: 6px 8px;
  background: #fff;
  color: #222;
  display: inline-flex;
  align-items: center;
  min-height: 32px;
  max-width: 100%;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.channel-dropdown summary::-webkit-details-marker {
  display: none;
}

.channel-dropdown[open] summary {
  border-bottom-left-radius: 0;
  border-bottom-right-radius: 0;
}

.channel-options {
  margin-top: 4px;
  padding: 8px;
  border: 1px solid #ccc;
  border-radius: 0 0 4px 4px;
  background: #fff;
  max-height: 240px;
  overflow-y: auto;
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.12);
  min-width: 220px;
  width: max(220px, 100%);
}

.channel-option {
  display: flex;
  align-items: center;
  gap: 6px;
  margin-bottom: 6px;
  font-size: 12px;
}

.channel-option:last-child {
  margin-bottom: 0;
}

input,
select,
button {
  padding: 6px 8px;
}
.muted {
  color: #666;
  font-size: 12px;
}

#results .item {
  border-bottom: 1px solid #ddd;
  padding: 12px 0;
}

.summary-row {
  display: flex;
  gap: 16px;
  flex-wrap: wrap;
  align-items: flex-start;
  margin-top: 12px;
}

.summary-left {
  flex: 0 1 280px;
  max-width: 360px;
}

.summary-right {
  flex: 1 1 0%;
  min-width: 0;
  background: #f5f5f5;
  padding: 12px;
  border-radius: 8px;
  box-shadow: 0 1px 2px rgba(0, 0, 0, 0.08);
}

#metrics {
  margin-top: 12px;
  display: flex;
  flex-direction: column;
  gap: 8px;
}

#metricsStatus {
  min-height: 16px;
}

#metricsContent {
  display: flex;
  flex-direction: column;
  gap: 6px;
}

#frequencyChart {
  margin-top: 8px;
}

#frequencyChart svg {
  max-width: 100%;
}

#frequencyChart .axis path,
#frequencyChart .axis line {
  stroke: #ccc;
}

#frequencyChart .freq-layer rect {
  stroke: #fff;
  stroke-width: 0.5px;
}

.freq-legend {
  margin-top: 8px;
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  font-size: 12px;
  color: #444;
}

.freq-legend-item {
  display: flex;
  align-items: center;
  gap: 6px;
}

.freq-legend-swatch {
  width: 12px;
  height: 12px;
  border-radius: 2px;
  display: inline-block;
}

.transcript {
  background: #fafafa;
  padding: 8px;
  margin-top: 6px;
  max-height: 200px;
  overflow-y: auto;
}

.highlight-list {
  display: flex;
  flex-direction: column;
  gap: 8px;
  max-height: none;
  overflow: visible;
}

.highlight-row {
  padding: 4px 0;
  border-bottom: 1px solid #ececec;
}

.highlight-row:last-child {
  border-bottom: none;
}

.transcript-wrapper {
  margin-top: 8px;
}

.pager {
  margin-top: 12px;
  display: flex;
  gap: 8px;
}

mark {
  background: #ffe58a;
  padding: 0 2px;
}

.badge-row {
  margin-top: 6px;
  display: flex;
  gap: 4px;
  flex-wrap: wrap;
}

.badge {
  background: #0b6efd;
  color: #fff;
  border-radius: 999px;
  padding: 2px 8px;
  font-size: 12px;
}
226
transcript_collector.py
Normal file
@ -0,0 +1,226 @@
"""
|
||||||
|
Lightweight helpers for gathering video metadata and transcripts from YouTube.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m python_app.transcript_collector --channel UC123 --output data/raw
|
||||||
|
|
||||||
|
Relies on:
|
||||||
|
- YouTube Data API v3 (requires YOUTUBE_API_KEY).
|
||||||
|
- youtube-transcript-api for transcript retrieval.
|
||||||
|
Both libraries are optional at import time so the module can still be referenced
|
||||||
|
when only working with existing JSON dumps.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, Iterator, List, Optional
|
||||||
|
|
||||||
|
from .config import CONFIG
|
||||||
|
|
||||||
|
try:
|
||||||
|
from googleapiclient.discovery import build as build_youtube # type: ignore
|
||||||
|
except ImportError: # pragma: no cover - library optional
|
||||||
|
build_youtube = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
||||||
|
except ImportError: # pragma: no cover - library optional
|
||||||
|
YouTubeTranscriptApi = None
|
||||||
|
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
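
# Editor's note: a minimal programmatic sketch (assuming the optional
# dependencies are installed and YOUTUBE_API_KEY is set; collect_channel is
# defined below):
#
#     from pathlib import Path
#     from python_app.transcript_collector import collect_channel
#
#     records = collect_channel("UCxxxx", Path("data/raw"), max_pages=1)
#     print(f"collected {len(records)} videos")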
@dataclass
class TranscriptSegment:
    start: float
    duration: float
    text: str


@dataclass
class VideoRecord:
    video_id: str
    channel_id: str
    channel_title: str
    title: str
    description: str
    published_at: str
    url: str
    transcript: List[TranscriptSegment]


def _ensure_youtube_client(api_key: Optional[str]):
    if build_youtube is None:
        raise RuntimeError(
            "google-api-python-client not installed. "
            "Install google-api-python-client to collect metadata."
        )
    if not api_key:
        raise RuntimeError(
            "Set YOUTUBE_API_KEY to collect metadata from YouTube."
        )
    return build_youtube("youtube", "v3", developerKey=api_key)


def _ensure_transcript_api():
    if YouTubeTranscriptApi is None:
        raise RuntimeError(
            "youtube-transcript-api not installed. "
            "Install youtube-transcript-api to fetch transcripts."
        )
    return YouTubeTranscriptApi()
def iter_channel_videos(
    channel_id: str,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 10,
) -> Iterator[Dict]:
    """
    Yield raw playlist items for the uploads playlist of the given channel.

    Args:
        channel_id: Target YouTube channel ID.
        api_key: Explicit API key (defaults to config value).
        max_pages: Hard cap on paginated playlist fetches to keep things simple.
    """
    client = _ensure_youtube_client(api_key or CONFIG.youtube.api_key)
    channels = (
        client.channels().list(id=channel_id, part="contentDetails").execute()
    )
    items = channels.get("items", [])
    if not items:
        raise ValueError(f"Channel {channel_id} not found.")
    uploads_playlist = (
        items[0]
        .get("contentDetails", {})
        .get("relatedPlaylists", {})
        .get("uploads")
    )
    if not uploads_playlist:
        raise ValueError(f"Channel {channel_id} missing uploads playlist.")

    request = client.playlistItems().list(
        playlistId=uploads_playlist, part="snippet", maxResults=50
    )
    page = 0
    while request and page < max_pages:
        response = request.execute()
        for item in response.get("items", []):
            yield item
        page += 1
        request = client.playlistItems().list_next(request, response)
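
# Editor's note: each yielded playlist item is a raw API dict; the fields
# consumed downstream (see collect_channel) live under "snippet", roughly:
#
#     {"snippet": {"resourceId": {"videoId": "..."}, "channelId": "...",
#                  "channelTitle": "...", "title": "...", "description": "...",
#                  "publishedAt": "2021-01-01T00:00:00Z"}}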
def fetch_transcript(
    video_id: str, *, languages: Optional[Iterable[str]] = None
) -> List[TranscriptSegment]:
    """Return transcript segments for a video, if available."""
    api = _ensure_transcript_api()
    try:
        # get_transcript expects an iterable of language codes; fall back to
        # the library's English default instead of passing None.
        transcripts = api.get_transcript(
            video_id, languages=list(languages) if languages else ("en",)
        )
    except Exception as exc:  # broad catch keeps draft simple
        LOGGER.warning("Transcript unavailable for %s: %s", video_id, exc)
        return []
    return [
        TranscriptSegment(
            start=entry.get("start", 0.0),
            duration=entry.get("duration", 0.0),
            text=entry.get("text", ""),
        )
        for entry in transcripts
    ]
def collect_channel(
    channel_id: str,
    output_dir: Path,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 2,
    languages: Optional[List[str]] = None,
) -> List[VideoRecord]:
    """
    Collect metadata + transcripts for a channel and store as JSON files.

    Returns the in-memory list to make it easy to chain into ingestion.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    videos: List[VideoRecord] = []
    for item in iter_channel_videos(
        channel_id, api_key=api_key, max_pages=max_pages
    ):
        snippet = item.get("snippet", {})
        video_id = snippet.get("resourceId", {}).get("videoId")
        if not video_id:
            continue
        segments = fetch_transcript(video_id, languages=languages)
        record = VideoRecord(
            video_id=video_id,
            channel_id=snippet.get("channelId", channel_id),
            channel_title=snippet.get("channelTitle", ""),
            title=snippet.get("title", ""),
            description=snippet.get("description", ""),
            published_at=snippet.get("publishedAt", ""),
            url=f"https://www.youtube.com/watch?v={video_id}",
            transcript=segments,
        )
        videos.append(record)
        dest = output_dir / f"{video_id}.json"
        with dest.open("w", encoding="utf-8") as handle:
            json.dump(asdict(record), handle, ensure_ascii=False, indent=2)
        LOGGER.info("Saved %s", dest)
    return videos
def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Collect channel transcripts into JSON files."
    )
    parser.add_argument(
        "--channel",
        required=True,
        help="YouTube channel ID (e.g. UCXYZ).",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("data/raw"),
        help="Directory to write per-video JSON files.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=2,
        help="Number of paginated channel pages to pull (50 videos per page).",
    )
    parser.add_argument(
        "--language",
        dest="languages",
        action="append",
        help="Preferred transcript languages (can be repeated).",
    )
    return parser
def main() -> None:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = _build_parser().parse_args()
    collect_channel(
        args.channel,
        args.output,
        max_pages=args.max_pages,
        languages=args.languages,
    )


if __name__ == "__main__":  # pragma: no cover
    main()