From 40d4f41f6e4c9cfdd7e90b714979294223dc40e0 Mon Sep 17 00:00:00 2001 From: knight Date: Sun, 9 Nov 2025 14:24:50 -0500 Subject: [PATCH] Add graph and vector search features --- AGENTS.md | 31 ++ config.py | 22 +- requirements.txt | 2 + search_app.py | 475 +++++++++++++++++++++++++-- static/app.js | 691 ++++++++++++++++++++++++++++++---------- static/graph.html | 85 +++++ static/graph.js | 670 ++++++++++++++++++++++++++++++++++++++ static/index.html | 123 ++++++- static/style.css | 500 +++++++++++++++++++++++++---- static/vector.html | 46 +++ static/vector.js | 423 ++++++++++++++++++++++++ sync_qdrant_channels.py | 188 +++++++++++ 12 files changed, 2983 insertions(+), 273 deletions(-) create mode 100644 AGENTS.md create mode 100644 static/graph.html create mode 100644 static/graph.js create mode 100644 static/vector.html create mode 100644 static/vector.js create mode 100644 sync_qdrant_channels.py diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..0542a05 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,31 @@ +# Repository Guidelines + +## Project Structure & Module Organization +- Core modules live under `python_app/`: `config.py` centralizes settings, `transcript_collector.py` gathers transcripts, `ingest.py` handles Elasticsearch bulk loads, and `search_app.py` exposes the Flask UI. +- Static assets belong in `static/` (`index.html`, `frequency.html`, companion JS/CSS). Keep HTML here and wire it up through Flask routes. +- Runtime artifacts land in `data/` (`raw/` for downloads, `video_metadata/` for cleaned payloads). Preserve the JSON schema emitted by the collector. +- When adding utilities, place them in `python_app/` and use package-relative imports so scripts continue to run via `python -m`. + +## Build, Test, and Development Commands +- `python -m venv .venv && source .venv/bin/activate`: bootstrap the virtualenv used by all scripts. 
+- `pip install -r requirements.txt`: install Flask, Elasticsearch tooling, Google API clients, and dotenv support. +- `python -m python_app.transcript_collector --channel UC... --output data/raw`: fetch transcript JSON for a channel; rerun to refresh cached data. +- `python -m python_app.ingest --source data/video_metadata --index this_little_corner_py`: index prepared metadata and auto-create mappings when needed. +- `python -m python_app.search_app`: launch the Flask server on port 8080 for UI smoke tests. + +## Coding Style & Naming Conventions +- Follow PEP 8 with 4-space indentation, `snake_case` for functions/modules, and `CamelCase` for classes; reserve UPPER_SNAKE_CASE for configuration constants. +- Keep Elasticsearch payload keys lower-case with underscores, and centralize shared values in `config.py` rather than scattering literals. + +## Testing Guidelines +- No automated suite is committed yet; when adding coverage, create `tests/` modules using `pytest` with files named `test_*.py`. +- Focus tests on collector pagination, ingest transformations, and Flask route helpers, and run `python -m pytest` locally before opening a PR. +- Manually verify by ingesting a small sample into a local Elasticsearch node and checking facets, highlights, and transcript retrieval via the UI. + +## Commit & Pull Request Guidelines +- Mirror the existing history: short, imperative commit subjects (e.g. “Fix results overflow”, “Add video reference tracking”). +- PRs should describe scope, list environment variables or indices touched, link issues, and attach before/after screenshots whenever UI output changes. Highlight Elasticsearch mapping or data migration impacts for both search and frontend reviewers. + +## Configuration & Security Tips +- Load credentials through environment variables (`ELASTIC_URL`, `ELASTIC_USERNAME`, `ELASTIC_PASSWORD`, `ELASTIC_API_KEY`, `YOUTUBE_API_KEY`) or a `.env` file, and keep secrets out of version control. 
+- Adjust `ELASTIC_VERIFY_CERTS`, `ELASTIC_CA_CERT`, and `ELASTIC_DEBUG` only while debugging, and prefer branch-specific indices (`this_little_corner_py_<branch>`) to avoid clobbering shared data. diff --git a/config.py b/config.py index 5017497..1faf15a 100644 --- a/config.py +++ b/config.py @@ -20,13 +20,13 @@ from typing import Optional try: from dotenv import load_dotenv import logging - _logger = logging.getLogger(__name__) + _logger = logging.getLogger(__name__) _env_path = Path(__file__).parent / ".env" if _env_path.exists(): - _logger.info(f"Loading .env from: {_env_path}") + _logger.info("Loading .env from: %s", _env_path) result = load_dotenv(_env_path, override=True) - _logger.info(f"load_dotenv result: {result}") + _logger.info("load_dotenv result: %s", result) except ImportError: pass # python-dotenv not installed @@ -58,6 +58,11 @@ class AppConfig: elastic: ElasticSettings data: DataSettings youtube: YoutubeSettings + qdrant_url: str + qdrant_collection: str + qdrant_vector_name: Optional[str] + qdrant_vector_size: int + qdrant_embed_model: str def _env(name: str, default: Optional[str] = None) -> Optional[str]: @@ -89,7 +94,16 @@ def load_config() -> AppConfig: ) data = DataSettings(root=data_root) youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY")) - return AppConfig(elastic=elastic, data=data, youtube=youtube) + return AppConfig( + elastic=elastic, + data=data, + youtube=youtube, + qdrant_url=_env("QDRANT_URL", "http://localhost:6333"), + qdrant_collection=_env("QDRANT_COLLECTION", "tlc_embeddings"), + qdrant_vector_name=_env("QDRANT_VECTOR_NAME"), + qdrant_vector_size=int(_env("QDRANT_VECTOR_SIZE", "1024")), + qdrant_embed_model=_env("QDRANT_EMBED_MODEL", "BAAI/bge-large-en-v1.5"), + ) CONFIG = load_config() diff --git a/requirements.txt b/requirements.txt index b04a199..9f41679 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ elasticsearch>=7.0.0,<9.0.0 youtube-transcript-api>=0.6 google-api-python-client>=2.0.0 
python-dotenv>=0.19.0 +requests>=2.31.0 +sentence-transformers>=2.7.0 diff --git a/search_app.py b/search_app.py index ca5f836..730847d 100644 --- a/search_app.py +++ b/search_app.py @@ -1,11 +1,15 @@ """ -Flask application exposing a minimal search API backed by Elasticsearch. +Flask application exposing search, graph, and transcript endpoints for TLC. Routes: - GET / -> Static HTML search page. - GET /api/channels -> List available channels (via terms aggregation). - GET /api/search -> Search index with pagination and simple highlighting. - GET /api/transcript -> Return full transcript for a given video_id. + GET / -> static HTML search page. + GET /graph -> static reference graph UI. + GET /vector-search -> experimental Qdrant vector search UI. + GET /api/channels -> channels aggregation. + GET /api/search -> Elasticsearch keyword search. + POST /api/vector-search -> Qdrant vector similarity query. + GET /api/graph -> reference graph API. + GET /api/transcript -> transcript JSON payload. 
""" from __future__ import annotations @@ -15,13 +19,20 @@ import json import logging import re from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Sequence, Set +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple -from collections import Counter +from collections import Counter, deque from datetime import datetime from flask import Flask, jsonify, request, send_from_directory +import requests + +try: + from sentence_transformers import SentenceTransformer # type: ignore +except ImportError: # pragma: no cover - optional dependency + SentenceTransformer = None + from .config import CONFIG, AppConfig try: @@ -32,6 +43,35 @@ except ImportError: # pragma: no cover - dependency optional BadRequestError = Exception # type: ignore LOGGER = logging.getLogger(__name__) +_EMBED_MODEL = None +_EMBED_MODEL_NAME: Optional[str] = None + + +def _ensure_embedder(model_name: str) -> "SentenceTransformer": + global _EMBED_MODEL, _EMBED_MODEL_NAME + if SentenceTransformer is None: # pragma: no cover - optional dependency + raise RuntimeError( + "sentence-transformers is required for vector search. Install via pip install sentence-transformers." 
+ ) + if _EMBED_MODEL is None or _EMBED_MODEL_NAME != model_name: + LOGGER.info("Loading embedding model: %s", model_name) + _EMBED_MODEL = SentenceTransformer(model_name) + _EMBED_MODEL_NAME = model_name + return _EMBED_MODEL + + +def embed_query(text: str, *, model_name: str, expected_dim: int) -> List[float]: + embedder = _ensure_embedder(model_name) + vector = embedder.encode( + [f"query: {text}"], + show_progress_bar=False, + normalize_embeddings=True, + )[0].tolist() + if len(vector) != expected_dim: + raise RuntimeError( + f"Embedding dimension mismatch (expected {expected_dim}, got {len(vector)})" + ) + return vector def _ensure_client(config: AppConfig) -> "Elasticsearch": @@ -428,6 +468,17 @@ def build_query_payload( } } ) + should.append( + { + "match_phrase": { + "title": { + "query": query, + "slop": 0, + "boost": 50.0, + } + } + } + ) if use_fuzzy: should.append( { @@ -513,15 +564,182 @@ def create_app(config: AppConfig = CONFIG) -> Flask: app = Flask(__name__, static_folder=str(Path(__file__).parent / "static")) client = _ensure_client(config) index = config.elastic.index + qdrant_url = config.qdrant_url + qdrant_collection = config.qdrant_collection + qdrant_vector_name = config.qdrant_vector_name + qdrant_vector_size = config.qdrant_vector_size + qdrant_embed_model = config.qdrant_embed_model @app.route("/") def index_page(): return send_from_directory(app.static_folder, "index.html") + @app.route("/graph") + def graph_page(): + return send_from_directory(app.static_folder, "graph.html") + + @app.route("/vector-search") + def vector_search_page(): + return send_from_directory(app.static_folder, "vector.html") + @app.route("/static/") def static_files(filename: str): return send_from_directory(app.static_folder, filename) + def normalize_reference_list(values: Any) -> List[str]: + if values is None: + return [] + if isinstance(values, (list, tuple, set)): + iterable = values + else: + iterable = [values] + normalized: List[str] = [] + for item in 
iterable: + candidate: Optional[str] + if isinstance(item, dict): + candidate = item.get("video_id") or item.get("id") # type: ignore[assignment] + else: + candidate = item # type: ignore[assignment] + if candidate is None: + continue + text = str(candidate).strip() + if not text: + continue + if text.lower() in {"none", "null"}: + continue + normalized.append(text) + return normalized + + def build_graph_payload( + root_id: str, depth: int, max_nodes: int + ) -> Dict[str, Any]: + root_id = root_id.strip() + if not root_id: + return {"nodes": [], "links": [], "root": root_id, "depth": depth, "meta": {}} + + doc_cache: Dict[str, Optional[Dict[str, Any]]] = {} + + def fetch_document(video_id: str) -> Optional[Dict[str, Any]]: + if video_id in doc_cache: + return doc_cache[video_id] + try: + result = client.get(index=index, id=video_id) + doc_cache[video_id] = result.get("_source") + except Exception as exc: # pragma: no cover - elasticsearch handles errors + LOGGER.debug("Graph: failed to load %s: %s", video_id, exc) + doc_cache[video_id] = None + return doc_cache[video_id] + + nodes: Dict[str, Dict[str, Any]] = {} + links: List[Dict[str, Any]] = [] + link_seen: Set[Tuple[str, str, str]] = set() + queue: deque[Tuple[str, int]] = deque([(root_id, 0)]) + queued: Set[str] = {root_id} + visited: Set[str] = set() + + while queue and len(nodes) < max_nodes: + current_id, level = queue.popleft() + queued.discard(current_id) + if current_id in visited: + continue + doc = fetch_document(current_id) + if doc is None: + if current_id == root_id: + break + visited.add(current_id) + continue + + visited.add(current_id) + nodes[current_id] = { + "id": current_id, + "title": doc.get("title") or current_id, + "channel_id": doc.get("channel_id"), + "channel_name": doc.get("channel_name") or doc.get("channel_id") or "Unknown", + "url": doc.get("url"), + "date": doc.get("date"), + "is_root": current_id == root_id, + } + + if level >= depth: + continue + + neighbor_ids: List[str] = [] + 
+ for ref_id in normalize_reference_list(doc.get("internal_references")): + if ref_id == current_id: + continue + key = (current_id, ref_id, "references") + if key not in link_seen: + links.append( + {"source": current_id, "target": ref_id, "relation": "references"} + ) + link_seen.add(key) + neighbor_ids.append(ref_id) + + for ref_id in normalize_reference_list(doc.get("referenced_by")): + if ref_id == current_id: + continue + key = (ref_id, current_id, "referenced_by") + if key not in link_seen: + links.append( + {"source": ref_id, "target": current_id, "relation": "referenced_by"} + ) + link_seen.add(key) + neighbor_ids.append(ref_id) + + for neighbor in neighbor_ids: + if neighbor in visited or neighbor in queued: + continue + if len(nodes) + len(queue) >= max_nodes: + break + queue.append((neighbor, level + 1)) + queued.add(neighbor) + + # Ensure nodes referenced by links exist in the payload. + for link in links: + for key in ("source", "target"): + node_id = link[key] + if node_id in nodes: + continue + doc = fetch_document(node_id) + if doc is None: + nodes[node_id] = { + "id": node_id, + "title": node_id, + "channel_id": None, + "channel_name": "Unknown", + "url": None, + "date": None, + "is_root": node_id == root_id, + } + else: + nodes[node_id] = { + "id": node_id, + "title": doc.get("title") or node_id, + "channel_id": doc.get("channel_id"), + "channel_name": doc.get("channel_name") or doc.get("channel_id") or "Unknown", + "url": doc.get("url"), + "date": doc.get("date"), + "is_root": node_id == root_id, + } + + links = [ + link + for link in links + if link.get("source") in nodes and link.get("target") in nodes + ] + + return { + "root": root_id, + "depth": depth, + "nodes": list(nodes.values()), + "links": links, + "meta": { + "node_count": len(nodes), + "link_count": len(links), + }, + } + @app.route("/api/channels") def channels(): base_channels_body = { @@ -580,23 +798,54 @@ def create_app(config: AppConfig = CONFIG) -> Flask: .get("channels", {}) 
.get("buckets", []) ) - data = [ - { - "Id": bucket.get("key"), - "Name": ( - bucket.get("name", {}) - .get("hits", {}) - .get("hits", [{}])[0] - .get("_source", {}) - .get("channel_name", bucket.get("key")) - ), - "Count": bucket.get("doc_count", 0), - } - for bucket in buckets - ] + data = [] + for bucket in buckets: + key = bucket.get("key") + name_hit = ( + bucket.get("name", {}) + .get("hits", {}) + .get("hits", [{}])[0] + .get("_source", {}) + .get("channel_name") + ) + display_name = name_hit or key or "Unknown" + data.append( + { + "Id": key, + "Name": display_name, + "Count": bucket.get("doc_count", 0), + } + ) data.sort(key=lambda item: item["Name"].lower()) return jsonify(data) + @app.route("/api/graph") + def graph_api(): + video_id = (request.args.get("video_id") or "").strip() + if not video_id: + return jsonify({"error": "video_id is required"}), 400 + + try: + depth = int(request.args.get("depth", "1")) + except ValueError: + depth = 1 + depth = max(0, min(depth, 3)) + + try: + max_nodes = int(request.args.get("max_nodes", "200")) + except ValueError: + max_nodes = 200 + max_nodes = max(10, min(max_nodes, 400)) + + payload = build_graph_payload(video_id, depth, max_nodes) + if not payload["nodes"]: + return ( + jsonify({"error": f"Video '{video_id}' was not found in the index."}), + 404, + ) + payload["meta"]["max_nodes"] = max_nodes + return jsonify(payload) + @app.route("/api/years") def years(): body = { @@ -718,10 +967,13 @@ def create_app(config: AppConfig = CONFIG) -> Flask: for hit in hits.get("hits", []): source = hit.get("_source", {}) highlight_map = hit.get("highlight", {}) - transcript_highlight = ( - (highlight_map.get("transcript_full", []) or []) - + (highlight_map.get("transcript_secondary_full", []) or []) - ) + transcript_highlight = [ + {"html": value, "source": "primary"} + for value in (highlight_map.get("transcript_full", []) or []) + ] + [ + {"html": value, "source": "secondary"} + for value in 
(highlight_map.get("transcript_secondary_full", []) or []) + ] title_html = ( highlight_map.get("title") @@ -741,6 +993,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask: "description": source.get("description"), "descriptionHtml": description_html, "date": source.get("date"), + "duration": source.get("duration"), "url": source.get("url"), "toHighlight": transcript_highlight, "highlightSource": { @@ -751,6 +1004,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask: "internal_references": source.get("internal_references", []), "referenced_by_count": source.get("referenced_by_count", 0), "referenced_by": source.get("referenced_by", []), + "video_status": source.get("video_status"), } ) @@ -877,7 +1131,15 @@ def create_app(config: AppConfig = CONFIG) -> Flask: "field": "channel_id.keyword", "size": channel_terms_size, "order": {"_count": "desc"}, - } + }, + "aggs": { + "channel_name_hit": { + "top_hits": { + "size": 1, + "_source": {"includes": ["channel_name"]}, + } + } + }, } }, } @@ -916,7 +1178,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask: .get("buckets", []) ) - channel_totals: Dict[str, int] = {} + channel_totals: Dict[str, Dict[str, Any]] = {} buckets: List[Dict[str, Any]] = [] for bucket in raw_buckets: date_str = bucket.get("key_as_string") @@ -926,14 +1188,28 @@ def create_app(config: AppConfig = CONFIG) -> Flask: cid = ch_bucket.get("key") count = ch_bucket.get("doc_count", 0) if cid: - channel_entries.append({"id": cid, "count": count}) - channel_totals[cid] = channel_totals.get(cid, 0) + count + hit_source = ( + ch_bucket.get("channel_name_hit", {}) + .get("hits", {}) + .get("hits", [{}])[0] + .get("_source", {}) + ) + channel_name = hit_source.get("channel_name") if isinstance(hit_source, dict) else None + channel_entries.append({"id": cid, "count": count, "name": channel_name}) + if cid not in channel_totals: + channel_totals[cid] = {"total": 0, "name": channel_name} + channel_totals[cid]["total"] += count + if channel_name and 
not channel_totals[cid].get("name"): + channel_totals[cid]["name"] = channel_name buckets.append( {"date": date_str, "total": total, "channels": channel_entries} ) ranked_channels = sorted( - [{"id": cid, "total": total} for cid, total in channel_totals.items()], + [ + {"id": cid, "total": info.get("total", 0), "name": info.get("name")} + for cid, info in channel_totals.items() + ], key=lambda item: item["total"], reverse=True, ) @@ -953,6 +1229,145 @@ def create_app(config: AppConfig = CONFIG) -> Flask: def frequency_page(): return send_from_directory(app.static_folder, "frequency.html") + @app.route("/api/vector-search", methods=["POST"]) + def api_vector_search(): + payload = request.get_json(silent=True) or {} + query_text = (payload.get("query") or "").strip() + filters = payload.get("filters") or {} + limit = max(int(payload.get("size", 10)), 1) + offset = max(int(payload.get("offset", 0)), 0) + + if not query_text: + return jsonify( + {"items": [], "totalResults": 0, "offset": offset, "error": "empty_query"} + ) + + try: + query_vector = embed_query( + query_text, model_name=qdrant_embed_model, expected_dim=qdrant_vector_size + ) + except Exception as exc: # pragma: no cover - runtime dependency + LOGGER.error("Embedding failed: %s", exc, exc_info=config.elastic.debug) + return jsonify({"error": "embedding_unavailable"}), 500 + + qdrant_vector_payload: Any + if qdrant_vector_name: + qdrant_vector_payload = {qdrant_vector_name: query_vector} + else: + qdrant_vector_payload = query_vector + + qdrant_body: Dict[str, Any] = { + "vector": qdrant_vector_payload, + "limit": limit, + "offset": offset, + "with_payload": True, + "with_vectors": False, + } + if filters: + qdrant_body["filter"] = filters + + try: + response = requests.post( + f"{qdrant_url}/collections/{qdrant_collection}/points/search", + json=qdrant_body, + timeout=20, + ) + response.raise_for_status() + data = response.json() + except Exception as exc: + LOGGER.error("Vector search failed: %s", exc, 
exc_info=config.elastic.debug) + return jsonify({"error": "vector_search_unavailable"}), 502 + + points = data.get("result", []) if isinstance(data, dict) else [] + items: List[Dict[str, Any]] = [] + missing_channel_ids: Set[str] = set() + for point in points: + payload = point.get("payload", {}) or {} + raw_highlights = payload.get("highlights") or [] + highlight_entries: List[Dict[str, str]] = [] + for entry in raw_highlights: + if isinstance(entry, dict): + html_value = entry.get("html") or entry.get("text") + else: + html_value = str(entry) + if not html_value: + continue + highlight_entries.append({"html": html_value, "source": "primary"}) + + channel_label = ( + payload.get("channel_name") + or payload.get("channel_title") + or payload.get("channel_id") + ) + items.append( + { + "video_id": payload.get("video_id"), + "channel_id": payload.get("channel_id"), + "channel_name": channel_label, + "title": payload.get("title"), + "titleHtml": payload.get("title"), + "description": payload.get("description"), + "descriptionHtml": payload.get("description"), + "date": payload.get("date"), + "url": payload.get("url"), + "chunkText": payload.get("text") + or payload.get("chunk_text") + or payload.get("chunk") + or payload.get("content"), + "chunkTimestamp": payload.get("timestamp") + or payload.get("start_seconds") + or payload.get("start"), + "toHighlight": highlight_entries, + "highlightSource": { + "primary": bool(highlight_entries), + "secondary": False, + }, + "distance": point.get("score"), + "internal_references_count": payload.get("internal_references_count", 0), + "internal_references": payload.get("internal_references", []), + "referenced_by_count": payload.get("referenced_by_count", 0), + "referenced_by": payload.get("referenced_by", []), + "video_status": payload.get("video_status"), + "duration": payload.get("duration"), + } + ) + if (not channel_label) and payload.get("channel_id"): + missing_channel_ids.add(str(payload.get("channel_id"))) + + if 
missing_channel_ids: + try: + es_lookup = client.search( + index=index, + body={ + "size": len(missing_channel_ids) * 2, + "_source": ["channel_id", "channel_name"], + "query": {"terms": {"channel_id.keyword": list(missing_channel_ids)}}, + }, + ) + hits = es_lookup.get("hits", {}).get("hits", []) + channel_lookup = {} + for hit in hits: + src = hit.get("_source", {}) or {} + cid = src.get("channel_id") + cname = src.get("channel_name") + if cid and cname and cid not in channel_lookup: + channel_lookup[cid] = cname + for item in items: + if not item.get("channel_name"): + cid = item.get("channel_id") + if cid and cid in channel_lookup: + item["channel_name"] = channel_lookup[cid] + except Exception as exc: + LOGGER.debug("Vector channel lookup failed: %s", exc) + + return jsonify( + { + "items": items, + "totalResults": len(items), + "offset": offset, + } + ) + @app.route("/api/transcript") def transcript(): video_id = request.args.get("video_id", type=str) diff --git a/static/app.js b/static/app.js index 5f0bfd2..4da22d3 100644 --- a/static/app.js +++ b/static/app.js @@ -32,9 +32,7 @@ let qs = new URLSearchParams(window.location.search); const qInput = document.getElementById("q"); - const channelDropdown = document.getElementById("channelDropdown"); - const channelSummary = document.getElementById("channelSummary"); - const channelOptions = document.getElementById("channelOptions"); + const channelSelect = document.getElementById("channel"); const yearSel = document.getElementById("year"); const sortSel = document.getElementById("sort"); const sizeSel = document.getElementById("size"); @@ -43,6 +41,9 @@ const phraseToggle = document.getElementById("phraseToggle"); const queryToggle = document.getElementById("queryStringToggle"); const searchBtn = document.getElementById("searchBtn"); + const aboutBtn = document.getElementById("aboutBtn"); + const aboutPanel = document.getElementById("aboutPanel"); + const aboutCloseBtn = document.getElementById("aboutCloseBtn"); 
const resultsDiv = document.getElementById("results"); const metaDiv = document.getElementById("meta"); const metricsContainer = document.getElementById("metrics"); @@ -50,17 +51,27 @@ const metricsContent = document.getElementById("metricsContent"); const freqSummary = document.getElementById("frequencySummary"); const freqChart = document.getElementById("frequencyChart"); + const graphOverlay = document.getElementById("graphModalOverlay"); + const graphModalClose = document.getElementById("graphModalClose"); const channelMap = new Map(); - const selectedChannels = new Set(); - let pendingChannelSelection = []; + const transcriptCache = new Map(); + let lastFocusBeforeModal = null; + let pendingChannelSelection = ""; let channelsReady = false; - let suppressChannelChange = false; - let allChannelsCheckbox = null; let previousToggleState = { exact: true, fuzzy: true, phrase: true }; let currentPage = parseInt(qs.get("page") || "0", 10) || 0; + function toggleAboutPanel(show) { + if (!aboutPanel) return; + if (show) { + aboutPanel.removeAttribute("hidden"); + } else { + aboutPanel.setAttribute("hidden", "hidden"); + } + } + function parseBoolParam(name, defaultValue) { const raw = qs.get(name); if (raw === null) return defaultValue; @@ -68,9 +79,8 @@ return !["0", "false", "no"].includes(lowered); } - function parseChannelParams(params) { - const collected = []; - if (!params) return collected; + function parseChannelParam(params) { + if (!params) return ""; const seen = new Set(); const rawValues = params.getAll("channel_id"); const legacy = params.get("channel"); @@ -84,61 +94,17 @@ .forEach((part) => { if (!seen.has(part)) { seen.add(part); - collected.push(part); } }); }); - return collected; + const first = Array.from(seen)[0]; + return first || ""; } function getSelectedChannels() { - return Array.from(selectedChannels); - } - - function ensureAllCheckboxState() { - if (allChannelsCheckbox) { - allChannelsCheckbox.checked = selectedChannels.size === 0; - } - } 
- - function updateChannelSummary() { - if (!channelSummary) return; - if (!selectedChannels.size) { - channelSummary.textContent = "All Channels"; - return; - } - const names = Array.from(selectedChannels).map( - (id) => channelMap.get(id) || id - ); - if (names.length > 1) { - names.sort((a, b) => a.localeCompare(b, undefined, { sensitivity: "base" })); - } - let label = names.slice(0, 3).join(", "); - if (names.length > 3) { - label += ` +${names.length - 3} more`; - } - channelSummary.textContent = label; - } - - function applyChannelSelection(ids, { silent = false } = {}) { - selectedChannels.clear(); - ids.forEach((id) => selectedChannels.add(id)); - pendingChannelSelection = getSelectedChannels(); - ensureAllCheckboxState(); - if (channelOptions) { - suppressChannelChange = true; - const checkboxes = channelOptions.querySelectorAll( - 'input[type="checkbox"][data-channel="1"]' - ); - checkboxes.forEach((checkbox) => { - checkbox.checked = selectedChannels.has(checkbox.value); - }); - suppressChannelChange = false; - } - updateChannelSummary(); - if (!silent && channelsReady) { - runSearch(0); - } + if (!channelSelect) return []; + const value = channelSelect.value; + return value ? 
[value] : []; } async function loadYears() { @@ -166,8 +132,10 @@ yearSel.value = qs.get("year") || ""; sortSel.value = qs.get("sort") || "relevant"; sizeSel.value = qs.get("size") || "10"; - pendingChannelSelection = parseChannelParams(qs); - applyChannelSelection(pendingChannelSelection, { silent: true }); + pendingChannelSelection = parseChannelParam(qs); + if (channelSelect) { + channelSelect.value = pendingChannelSelection || ""; + } exactToggle.checked = parseBoolParam("exact", true); fuzzyToggle.checked = parseBoolParam("fuzzy", true); phraseToggle.checked = parseBoolParam("phrase", true); @@ -212,6 +180,76 @@ } } + function graphUiAvailable() { + return !!(window.GraphUI && window.GraphUI.ready); + } + + function openGraphModal(videoId) { + if (!graphOverlay || !graphUiAvailable()) { + return; + } + lastFocusBeforeModal = + document.activeElement instanceof HTMLElement ? document.activeElement : null; + graphOverlay.classList.add("active"); + graphOverlay.setAttribute("aria-hidden", "false"); + document.body.classList.add("modal-open"); + + window.requestAnimationFrame(() => { + window.GraphUI.setDepth(1); + window.GraphUI.setMaxNodes(200); + window.GraphUI.setLabelSize("tiny"); + const graphVideoField = document.getElementById("graphVideoId"); + if (videoId && graphVideoField) { + graphVideoField.value = videoId; + } + if (videoId) { + window.GraphUI.load(videoId, undefined, undefined, { updateInputs: true }); + } + window.GraphUI.focusInput(); + }); + } + + function closeGraphModal() { + if (!graphOverlay) { + return; + } + graphOverlay.classList.remove("active"); + graphOverlay.setAttribute("aria-hidden", "true"); + document.body.classList.remove("modal-open"); + if (graphUiAvailable()) { + window.GraphUI.stop(); + } + if (lastFocusBeforeModal && typeof lastFocusBeforeModal.focus === "function") { + lastFocusBeforeModal.focus(); + } + lastFocusBeforeModal = null; + } + + if (graphModalClose) { + graphModalClose.addEventListener("click", closeGraphModal); 
+ } + if (graphOverlay) { + graphOverlay.addEventListener("click", (event) => { + if (event.target === graphOverlay) { + closeGraphModal(); + } + }); + } + document.addEventListener("keydown", (event) => { + if (event.key === "Escape" && graphOverlay && graphOverlay.classList.contains("active")) { + closeGraphModal(); + } + }); + window.addEventListener("graph-ui-ready", () => { + document + .querySelectorAll('.graph-launch-btn[data-await-graph-ready="1"]') + .forEach((btn) => { + btn.removeAttribute("disabled"); + btn.removeAttribute("data-await-graph-ready"); + btn.title = "Open reference graph"; + }); + }); + function ensureQueryStringMode() { if (!queryToggle) return; if (!queryToggle.checked) { @@ -242,60 +280,8 @@ return `${field}:(${escaped.join(" OR ")})`; } - if (channelOptions) { - channelOptions.addEventListener("change", (event) => { - const target = event.target; - if (!(target instanceof HTMLInputElement) || target.type !== "checkbox") { - return; - } - if (suppressChannelChange) { - return; - } - if (target.dataset.all === "1") { - if (!target.checked && !selectedChannels.size) { - suppressChannelChange = true; - target.checked = true; - suppressChannelChange = false; - return; - } - if (target.checked) { - selectedChannels.clear(); - pendingChannelSelection = []; - suppressChannelChange = true; - const others = channelOptions.querySelectorAll( - 'input[type="checkbox"][data-channel="1"]' - ); - others.forEach((checkbox) => { - checkbox.checked = false; - }); - suppressChannelChange = false; - ensureAllCheckboxState(); - updateChannelSummary(); - if (channelsReady) { - runSearch(0); - } - } - return; - } - - const id = target.value; - if (!id) return; - if (target.checked) { - selectedChannels.add(id); - } else { - selectedChannels.delete(id); - } - pendingChannelSelection = getSelectedChannels(); - ensureAllCheckboxState(); - updateChannelSummary(); - if (channelsReady) { - runSearch(0); - } - }); - } - async function loadChannels() { - if 
(!channelOptions) { + if (!channelSelect) { channelsReady = true; return; } @@ -303,57 +289,27 @@ const res = await fetch("/api/channels"); const data = await res.json(); channelMap.clear(); - channelOptions.innerHTML = ""; - - const listFragment = document.createDocumentFragment(); - - const allLabel = document.createElement("label"); - allLabel.className = "channel-option"; - allChannelsCheckbox = document.createElement("input"); - allChannelsCheckbox.type = "checkbox"; - allChannelsCheckbox.dataset.all = "1"; - allChannelsCheckbox.checked = selectedChannels.size === 0; - const allText = document.createElement("span"); - allText.textContent = "All Channels"; - allLabel.appendChild(allChannelsCheckbox); - allLabel.appendChild(allText); - listFragment.appendChild(allLabel); + channelSelect.innerHTML = ''; data.forEach((item) => { - const label = document.createElement("label"); - label.className = "channel-option"; - const checkbox = document.createElement("input"); - checkbox.type = "checkbox"; - checkbox.value = item.Id; - checkbox.dataset.channel = "1"; - const text = document.createElement("span"); - text.textContent = `${item.Name} (${item.Count})`; - label.appendChild(checkbox); - label.appendChild(text); - listFragment.appendChild(label); + const option = document.createElement("option"); + option.value = item.Id; + option.textContent = `${item.Name} (${item.Count})`; + channelSelect.appendChild(option); channelMap.set(item.Id, item.Name); }); - channelOptions.appendChild(listFragment); - - if (!data.length) { - const empty = document.createElement("div"); - empty.textContent = "No channels available."; - channelOptions.appendChild(empty); + if (pendingChannelSelection && channelMap.has(pendingChannelSelection)) { + channelSelect.value = pendingChannelSelection; + } else { + channelSelect.value = ""; } - const initialSelection = pendingChannelSelection.length - ? 
pendingChannelSelection - : Array.from(selectedChannels); - applyChannelSelection(initialSelection, { silent: true }); channelsReady = true; - updateChannelSummary(); } catch (err) { console.error("Failed to load channels", err); - channelOptions.innerHTML = "
Failed to load channels.
"; + channelSelect.innerHTML = ''; channelsReady = true; - ensureAllCheckboxState(); - updateChannelSummary(); } } @@ -391,6 +347,188 @@ return n; } + async function getTranscriptData(videoId) { + if (!videoId) return null; + if (transcriptCache.has(videoId)) { + return transcriptCache.get(videoId); + } + const res = await fetch(`/api/transcript?video_id=${encodeURIComponent(videoId)}`); + if (!res.ok) { + throw new Error(`Transcript fetch failed (${res.status})`); + } + const data = await res.json(); + transcriptCache.set(videoId, data); + return data; + } + + function formatMlaDate(value) { + if (!value) return ""; + const parsed = new Date(typeof value === "string" && /^\d{4}-\d{2}-\d{2}$/.test(value) ? `${value}T00:00:00` : value); /* date-only strings otherwise parse as UTC midnight and the local getters below print the previous day west of UTC */ + if (Number.isNaN(parsed.valueOf())) { + return value; + } + const months = [ + "Jan.", "Feb.", "Mar.", "Apr.", "May", "June", + "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.", + ]; + return `${parsed.getDate()} ${months[parsed.getMonth()]} ${parsed.getFullYear()}`; + } + + function buildMlaCitation(item) { + const channel = (item.channel_name || item.channel_id || "Unknown channel").trim(); + const title = (item.title || "Untitled").trim(); + const url = item.url || ""; + const publishDate = formatMlaDate(item.date) || "n.d."; + const today = formatMlaDate(new Date()); /* access date is the reader's local calendar day, not the UTC date from toISOString() */ + return `${channel}. "${title}." YouTube, uploaded by ${channel}, ${publishDate}, ${url}. 
Accessed ${today}.`; + } + + function formatSegmentTimestamp(segment) { + if (!segment) return ""; + if (segment.timestamp) return segment.timestamp; + const candidates = [ + segment.start_seconds, + segment.start, + segment.offset, + segment.time, + ]; + for (const value of candidates) { + if (value == null) continue; + const seconds = parseFloat(value); + if (!Number.isNaN(seconds)) { + return formatTimestamp(seconds); + } + } + return ""; + } + + function serializeTranscriptSection(label, parts, fullText) { + let content = ""; + if (typeof fullText === "string" && fullText.trim()) { + content = fullText.trim(); + } else if (Array.isArray(parts) && parts.length) { + content = parts + .map((segment) => { + const ts = formatSegmentTimestamp(segment); + const text = segment && segment.text ? segment.text : ""; + return ts ? `[${ts}] ${text}` : text; + }) + .join("\n") + .trim(); + } + if (!content) { + return ""; + } + return `${label}\n${content}\n`; + } + + function buildTranscriptDownloadText(item, transcriptData) { + const lines = []; + lines.push(`Title: ${item.title || "Untitled"}`); + if (item.channel_name) { + lines.push(`Channel: ${item.channel_name}`); + } + if (item.date) { + lines.push(`Published: ${item.date}`); + } + if (item.url) { + lines.push(`URL: ${item.url}`); + } + lines.push(""); + + const primaryText = serializeTranscriptSection( + "Primary Transcript", + transcriptData.transcript_parts, + transcriptData.transcript_full + ); + const secondaryText = serializeTranscriptSection( + "Secondary Transcript", + transcriptData.transcript_secondary_parts, + transcriptData.transcript_secondary_full + ); + + if (primaryText) { + lines.push(primaryText); + } + if (secondaryText) { + lines.push(secondaryText); + } + if (!primaryText && !secondaryText) { + lines.push("No transcript available."); + } + return lines.join("\n").trim() + "\n"; + } + + function flashButtonMessage(button, message, duration = 1800) { + if (!button) return; + const original = 
button.dataset.originalLabel || button.textContent; + button.dataset.originalLabel = original; + button.textContent = message; + setTimeout(() => { + button.textContent = button.dataset.originalLabel || original; + }, duration); + } + + async function handleTranscriptDownload(item, button) { + if (!item.video_id) return; + button.disabled = true; + try { + const data = await getTranscriptData(item.video_id); + if (!data) { + throw new Error("Transcript unavailable"); + } + const text = buildTranscriptDownloadText(item, data); + const blob = new Blob([text], { type: "text/plain" }); + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + link.download = `${item.video_id || "transcript"}.txt`; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(url); + flashButtonMessage(button, "Downloaded"); + } catch (err) { + console.error("Download failed", err); + alert("Unable to download transcript right now."); + } finally { + button.disabled = false; + } + } + + async function handleCopyCitation(item, button) { + const citation = buildMlaCitation(item); + try { + if (navigator.clipboard && window.isSecureContext) { + await navigator.clipboard.writeText(citation); + } else { + const textarea = document.createElement("textarea"); + textarea.value = citation; + textarea.style.position = "fixed"; + textarea.style.opacity = "0"; + document.body.appendChild(textarea); + textarea.focus(); + textarea.select(); + const copied = document.execCommand("copy"); /* reports refusal by returning false, not by throwing */ + document.body.removeChild(textarea); + if (!copied) throw new Error("Copy command was rejected"); /* fall through to the catch, which shows the citation for manual copying */ + } + flashButtonMessage(button, "Copied!"); + } catch (err) { + console.error("Citation copy failed", err); + alert(citation); + } + } + + function getVideoStatus(item) { + if (!item || !item.video_status) return ""; + return String(item.video_status).toLowerCase(); + } + + function isLikelyDeleted(item) { + return getVideoStatus(item) === "deleted"; + } + 
function formatTimestamp(seconds) { if (!seconds && seconds !== 0) return "00:00"; const hours = Math.floor(seconds / 3600); @@ -621,7 +759,65 @@ }, 3000); } - async function fetchAndDisplayTranscript(videoId, videoUrl, containerElement, button, highlightText = null) { + const COMMON_STOP_WORDS = new Set([ + "the","and","that","this","with","for","are","but","not","you","your","they","their", + "have","from","was","been","has","had","were","about","what","when","where","which", + "will","would","there","here","into","them","then","than","also","more","some","just", + "like","said","because","make","made","could","should","might" + ]); + + const tokenizeContent = (text) => { + if (!text) return []; + return text + .toLowerCase() + .split(/[^a-z0-9]+/g) + .filter((token) => token.length > 2 && !COMMON_STOP_WORDS.has(token)) + .slice(0, 20); + }; + + function collectHighlightTokens(entries) { + const collected = []; + if (!Array.isArray(entries)) return collected; + entries.forEach((entry) => { + const raw = typeof entry === "string" ? 
entry : entry?.html || entry?.text || ""; + if (!raw) return; + const marked = extractMarkedText(raw); + if (marked) { + collected.push(...tokenizeContent(marked)); + } else { + collected.push(...tokenizeContent(stripHtmlAndNormalize(raw))); + } + }); + return collected; + } + + function buildQueryTokens(query) { + return tokenizeContent(query || "").slice(0, 20); + } + + function highlightTranscriptMatches(transcriptDiv, entries, searchQuery) { + if (!transcriptDiv) return; + const tokens = new Set(); + collectHighlightTokens(entries).forEach((token) => tokens.add(token)); + buildQueryTokens(searchQuery).forEach((token) => tokens.add(token)); + if (!tokens.size) return; + const segments = transcriptDiv.querySelectorAll(".transcript-segment"); + segments.forEach((segment) => { + const text = segment.dataset.text || ""; + const matched = Array.from(tokens).some((token) => text.includes(token)); + segment.classList.toggle("transcript-segment--matched", matched); + }); + } + + async function fetchAndDisplayTranscript( + videoId, + videoUrl, + containerElement, + button, + highlightText = null, + allHighlights = null, + searchQuery = "" + ) { const existingTranscript = containerElement.querySelector('.full-transcript'); if (existingTranscript && !highlightText) { existingTranscript.remove(); @@ -631,6 +827,7 @@ // If transcript exists and we have highlight text, just scroll to it if (existingTranscript && highlightText) { + highlightTranscriptMatches(existingTranscript, allHighlights, searchQuery); const segment = findMatchingSegment(existingTranscript, highlightText); if (segment) { scrollToSegment(segment); @@ -728,6 +925,7 @@ } }, 100); } + highlightTranscriptMatches(transcriptDiv, allHighlights, searchQuery); } catch (err) { console.error('Error fetching transcript:', err); button.textContent = 'View Full Transcript'; @@ -797,7 +995,8 @@ function clearFrequency(message) { } } -function renderFrequencyChart(buckets, channelTotals) { + + function 
renderFrequencyChart(buckets, channelTotals) { if (!freqChart || typeof d3 === "undefined") { return; } @@ -807,6 +1006,26 @@ function renderFrequencyChart(buckets, channelTotals) { return; } + const channelNameFallback = new Map(); + (channelTotals || []).forEach((entry) => { + if (!entry || !entry.id) return; + if (entry.name) { + channelNameFallback.set(entry.id, entry.name); + } + }); + buckets.forEach((bucket) => { + (bucket.channels || []).forEach((entry) => { + if (entry && entry.id && entry.name && !channelNameFallback.has(entry.id)) { + channelNameFallback.set(entry.id, entry.name); + } + }); + }); + + const getChannelLabel = (id) => { + if (!id) return ""; + return channelMap.get(id) || channelNameFallback.get(id) || id; + }; + let channelsOrder = (channelTotals && channelTotals.length ? channelTotals.map((entry) => entry.id) @@ -929,7 +1148,7 @@ function renderFrequencyChart(buckets, channelTotals) { .text(function (d) { const group = this.parentNode ? this.parentNode.parentNode : null; const key = group ? d3.select(group).datum().key : undefined; - const label = key ? channelMap.get(key) || key : key || ''; + const label = key ? getChannelLabel(key) : key || ''; return `${dateKeyFormat(d.data.date)}: ${d[1] - d[0]}${label ? 
" (" + label + ")" : ''}`; }); @@ -942,7 +1161,7 @@ function renderFrequencyChart(buckets, channelTotals) { swatch.className = "freq-legend-swatch"; swatch.style.backgroundColor = color(key); const label = document.createElement("span"); - label.textContent = channelMap.get(key) || key; + label.textContent = getChannelLabel(key) || key; item.appendChild(swatch); item.appendChild(label); legend.appendChild(item); @@ -1027,12 +1246,15 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { item.descriptionHtml || escapeHtml(item.description || ""); const header = document.createElement("div"); + header.className = "result-header"; + const headerMain = document.createElement("div"); + headerMain.className = "result-header-main"; const badgeDefs = []; if (item.highlightSource && item.highlightSource.primary) { - badgeDefs.push({ label: "primary transcript" }); + badgeDefs.push({ label: "primary transcript", badgeType: "transcript-primary" }); } if (item.highlightSource && item.highlightSource.secondary) { - badgeDefs.push({ label: "secondary transcript" }); + badgeDefs.push({ label: "secondary transcript", badgeType: "transcript-secondary" }); } // Add reference count badges @@ -1068,13 +1290,47 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { }); } - header.innerHTML = ` - ${titleHtml} -
${escapeHtml(item.channel_name || "")} • ${fmtDate( - item.date - )}
- - `; + const titleEl = document.createElement("strong"); + titleEl.innerHTML = titleHtml; + headerMain.appendChild(titleEl); + + const metaLine = document.createElement("div"); + metaLine.className = "muted result-meta"; + const channelLabel = item.channel_name || ""; + const dateLabel = fmtDate(item.date); + let durationSeconds = null; + if (typeof item.duration === "number") { + durationSeconds = item.duration; + } else if (typeof item.duration === "string" && item.duration.trim()) { + const parsed = parseFloat(item.duration); + if (!Number.isNaN(parsed)) { + durationSeconds = parsed; + } + } + const durationLabel = durationSeconds != null ? ` • ${formatTimestamp(durationSeconds)}` : ""; + metaLine.textContent = channelLabel + ? `${channelLabel} • ${dateLabel}${durationLabel}` + : `${dateLabel}${durationLabel}`; + if (isLikelyDeleted(item)) { + metaLine.appendChild(document.createTextNode(" ")); + const statusEl = document.createElement("span"); + statusEl.className = "result-status result-status--deleted"; + statusEl.textContent = "Likely deleted"; + statusEl.title = "YouTube reported this video as unavailable when we last checked."; + metaLine.appendChild(statusEl); + } + headerMain.appendChild(metaLine); + + const linkLine = document.createElement("div"); + linkLine.className = "muted"; + const openLink = document.createElement("a"); + openLink.href = item.url; + openLink.target = "_blank"; + openLink.rel = "noopener"; + openLink.textContent = "Open on YouTube"; + linkLine.appendChild(openLink); + headerMain.appendChild(linkLine); + header.appendChild(headerMain); if (badgeDefs.length) { const badgeRow = document.createElement("div"); badgeRow.className = "badge-row"; @@ -1086,6 +1342,9 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { if (badge.title) { badgeEl.title = badge.title; } + if (badge.badgeType) { + badgeEl.classList.add(`badge--${badge.badgeType}`); + } if (badge.query) { 
badgeEl.classList.add("badge-clickable"); badgeEl.setAttribute("role", "button"); @@ -1110,7 +1369,45 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { badgeRow.appendChild(badgeEl); }); if (badgeRow.childElementCount) { - header.appendChild(badgeRow); + headerMain.appendChild(badgeRow); + } + } + if (item.video_id) { + const actions = document.createElement("div"); + actions.className = "result-actions"; + + const downloadBtn = document.createElement("button"); + downloadBtn.type = "button"; + downloadBtn.className = "result-action-btn"; + downloadBtn.textContent = "Download transcript"; + downloadBtn.addEventListener("click", () => handleTranscriptDownload(item, downloadBtn)); + actions.appendChild(downloadBtn); + + const citationBtn = document.createElement("button"); + citationBtn.type = "button"; + citationBtn.className = "result-action-btn"; + citationBtn.textContent = "Copy citation"; + citationBtn.addEventListener("click", () => handleCopyCitation(item, citationBtn)); + actions.appendChild(citationBtn); + + if (graphOverlay) { + const graphBtn = document.createElement("button"); + graphBtn.type = "button"; + graphBtn.className = "result-action-btn graph-launch-btn"; + graphBtn.textContent = "Graph"; + if (graphUiAvailable()) { + graphBtn.title = "Open reference graph"; + } else { + graphBtn.disabled = true; + graphBtn.title = "Reference graph is still loading…"; + graphBtn.dataset.awaitGraphReady = "1"; + } + graphBtn.addEventListener("click", () => openGraphModal(item.video_id)); + actions.appendChild(graphBtn); + } + + if (actions.childElementCount) { + header.appendChild(actions); } } el.appendChild(header); @@ -1128,9 +1425,25 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { item.toHighlight.forEach((entry) => { const html = typeof entry === "string" ? entry : entry?.html; if (!html) return; + const source = entry && typeof entry === "object" ? 
entry.source : null; const row = document.createElement("div"); row.className = "highlight-row"; - row.innerHTML = html; + if (source === "primary") { + row.classList.add("highlight-row--primary"); + } else if (source === "secondary") { + row.classList.add("highlight-row--secondary"); + } + const textBlock = document.createElement("div"); + textBlock.className = "highlight-text"; + textBlock.innerHTML = html; + row.appendChild(textBlock); + if (source) { + const indicator = document.createElement("span"); + indicator.className = `highlight-source-indicator highlight-source-indicator--${source}`; + indicator.title = + source === "primary" ? "Highlight from primary transcript" : "Highlight from secondary transcript"; + row.appendChild(indicator); + } row.title = "Click to jump to this location in the transcript"; // Make highlight clickable @@ -1138,7 +1451,15 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { const transcriptBtn = el.querySelector(".transcript-toggle"); if (transcriptBtn && item.video_id) { const highlightText = stripHtmlAndNormalize(html); - fetchAndDisplayTranscript(item.video_id, item.url, el, transcriptBtn, highlightText); + fetchAndDisplayTranscript( + item.video_id, + item.url, + el, + transcriptBtn, + highlightText, + item.toHighlight, + qInput.value + ); } }; @@ -1154,7 +1475,15 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { transcriptBtn.className = "transcript-toggle"; transcriptBtn.textContent = "View Full Transcript"; transcriptBtn.onclick = () => { - fetchAndDisplayTranscript(item.video_id, item.url, el, transcriptBtn); + fetchAndDisplayTranscript( + item.video_id, + item.url, + el, + transcriptBtn, + null, + item.toHighlight, + qInput.value + ); }; el.appendChild(transcriptBtn); } @@ -1223,10 +1552,28 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = { updateFrequencyChart(q, channels, year, queryMode, { exact, fuzzy, phrase }); } 
-searchBtn.addEventListener("click", () => runSearch(0)); + searchBtn.addEventListener("click", () => runSearch(0)); + if (aboutBtn && aboutPanel) { + aboutBtn.addEventListener("click", () => { + const isHidden = aboutPanel.hasAttribute("hidden"); + toggleAboutPanel(isHidden); + }); + } + if (aboutCloseBtn) { + aboutCloseBtn.addEventListener("click", () => toggleAboutPanel(false)); + } + qInput.addEventListener("keypress", (e) => { if (e.key === "Enter") runSearch(0); }); + if (channelSelect) { + channelSelect.addEventListener("change", () => { + pendingChannelSelection = channelSelect.value || ""; + if (channelsReady) { + runSearch(0); + } + }); + } yearSel.addEventListener("change", () => runSearch(0)); sortSel.addEventListener("change", () => runSearch(0)); sizeSel.addEventListener("change", () => runSearch(0)); diff --git a/static/graph.html b/static/graph.html new file mode 100644 index 0000000..597b7dd --- /dev/null +++ b/static/graph.html @@ -0,0 +1,85 @@ + + + + + + TLC Reference Graph + + + + + + +
+
+
Reference Graph
+ +
+
+

+ Explore how videos reference each other. Enter a video_id to see its immediate + neighbors (referenced and referencing videos). Choose a larger depth to expand the graph. +

+ +
+
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ + +
+ +
Enter a video ID to begin.
+
+
+ +
+

Click nodes to open the video on YouTube

+

Colors represent channels

+
+
+ + + + diff --git a/static/graph.js b/static/graph.js new file mode 100644 index 0000000..a4a32fd --- /dev/null +++ b/static/graph.js @@ -0,0 +1,670 @@ +(() => { + const global = window; + const GraphUI = (global.GraphUI = global.GraphUI || {}); + GraphUI.ready = false; + const form = document.getElementById("graphForm"); + const videoInput = document.getElementById("graphVideoId"); + const depthInput = document.getElementById("graphDepth"); + const maxNodesInput = document.getElementById("graphMaxNodes"); + const labelSizeInput = document.getElementById("graphLabelSize"); + const statusEl = document.getElementById("graphStatus"); + const container = document.getElementById("graphContainer"); + const isEmbedded = + container && container.dataset && container.dataset.embedded === "true"; + + if (!form || !videoInput || !depthInput || !maxNodesInput || !labelSizeInput || !container) { + console.error("Graph: required DOM elements missing."); + return; + } + + const color = d3.scaleOrdinal(d3.schemeTableau10); + const colorRange = typeof color.range === "function" ? 
color.range() : []; + const paletteSizeDefault = colorRange.length || 10; + const PATTERN_TYPES = [ + { key: "none", legendClass: "none" }, + { key: "diag-forward", legendClass: "diag-forward" }, + { key: "diag-back", legendClass: "diag-back" }, + { key: "cross", legendClass: "cross" }, + { key: "dots", legendClass: "dots" }, + ]; + const ADDITIONAL_PATTERNS = PATTERN_TYPES.filter((pattern) => pattern.key !== "none"); + + const sanitizeDepth = (value) => { + const parsed = parseInt(value, 10); + if (Number.isNaN(parsed)) return 1; + return Math.max(0, Math.min(parsed, 3)); + }; + + const sanitizeMaxNodes = (value) => { + const parsed = parseInt(value, 10); + if (Number.isNaN(parsed)) return 200; + return Math.max(10, Math.min(parsed, 400)); + }; + + const LABEL_SIZE_VALUES = ["off", "tiny", "small", "normal", "medium", "large", "xlarge"]; + const LABEL_FONT_SIZES = { + tiny: "7px", + small: "8px", + normal: "9px", + medium: "10px", + large: "11px", + xlarge: "13px", + }; + const DEFAULT_LABEL_SIZE = "tiny"; + const isValidLabelSize = (value) => LABEL_SIZE_VALUES.includes(value); + + const getLabelSize = () => { + if (!labelSizeInput) return DEFAULT_LABEL_SIZE; + const value = labelSizeInput.value; + return isValidLabelSize(value) ? value : DEFAULT_LABEL_SIZE; + }; + + function setLabelSizeInput(value) { + if (!labelSizeInput) return; + labelSizeInput.value = isValidLabelSize(value) ? 
value : DEFAULT_LABEL_SIZE; + } + + const getChannelLabel = (node) => + (node && (node.channel_name || node.channel_id)) || "Unknown"; + + function appendPatternContent(pattern, baseColor, patternKey) { + pattern.append("rect").attr("width", 8).attr("height", 8).attr("fill", baseColor); + + const strokeColor = "#1f1f1f"; + const strokeOpacity = 0.35; + + const addForward = () => { + pattern + .append("path") + .attr("d", "M-2,6 L2,2 M0,8 L8,0 M6,10 L10,4") + .attr("stroke", strokeColor) + .attr("stroke-width", 1) + .attr("stroke-opacity", strokeOpacity) + .attr("fill", "none"); + }; + + const addBackward = () => { + pattern + .append("path") + .attr("d", "M-2,2 L2,6 M0,0 L8,8 M6,-2 L10,2") + .attr("stroke", strokeColor) + .attr("stroke-width", 1) + .attr("stroke-opacity", strokeOpacity) + .attr("fill", "none"); + }; + + switch (patternKey) { + case "diag-forward": + addForward(); + break; + case "diag-back": + addBackward(); + break; + case "cross": + addForward(); + addBackward(); + break; + case "dots": + pattern + .append("circle") + .attr("cx", 4) + .attr("cy", 4) + .attr("r", 1.5) + .attr("fill", strokeColor) + .attr("fill-opacity", strokeOpacity); + break; + default: + break; + } + } + + function createChannelStyle(label, baseColor, patternKey) { + const patternInfo = + PATTERN_TYPES.find((pattern) => pattern.key === patternKey) || PATTERN_TYPES[0]; + return { + baseColor, + hatch: patternInfo ? patternInfo.key : "none", + legendClass: patternInfo ? 
patternInfo.legendClass : "none", + }; + } + + let currentGraphData = null; + let currentChannelStyles = new Map(); + let currentDepth = sanitizeDepth(depthInput.value); + let currentMaxNodes = sanitizeMaxNodes(maxNodesInput.value); + let currentSimulation = null; + + function setStatus(message, isError = false) { + if (!statusEl) return; + statusEl.textContent = message; + if (isError) { + statusEl.classList.add("error"); + } else { + statusEl.classList.remove("error"); + } + } + + function sanitizeId(value) { + return (value || "").trim(); + } + + async function fetchGraph(videoId, depth, maxNodes) { + const params = new URLSearchParams(); + params.set("video_id", videoId); + params.set("depth", String(depth)); + params.set("max_nodes", String(maxNodes)); + const response = await fetch(`/api/graph?${params.toString()}`); + if (!response.ok) { + const errorPayload = await response.json().catch(() => ({})); + const errorMessage = + errorPayload.error || + `Graph request failed (${response.status} ${response.statusText})`; + throw new Error(errorMessage); + } + return response.json(); + } + + function resizeContainer() { + if (!container) return; + const minHeight = 520; + const viewportHeight = window.innerHeight; + container.style.height = `${Math.max(minHeight, Math.round(viewportHeight * 0.6))}px`; + } + + function renderGraph(data, labelSize = "normal") { + if (!container) return; + + if (currentSimulation) { + currentSimulation.stop(); + currentSimulation = null; + } + container.innerHTML = ""; + + const width = container.clientWidth || 900; + const height = container.clientHeight || 600; + + const svg = d3 + .select(container) + .append("svg") + .attr("viewBox", [0, 0, width, height]) + .attr("width", "100%") + .attr("height", height); + + const defs = svg.append("defs"); + + defs + .append("marker") + .attr("id", "arrow-references") + .attr("viewBox", "0 -5 10 10") + .attr("refX", 18) + .attr("refY", 0) + .attr("markerWidth", 6) + .attr("markerHeight", 6) + 
.attr("orient", "auto") + .append("path") + .attr("d", "M0,-5L10,0L0,5") + .attr("fill", "#6c83c7"); + + defs + .append("marker") + .attr("id", "arrow-referenced-by") + .attr("viewBox", "0 -5 10 10") + .attr("refX", 18) + .attr("refY", 0) + .attr("markerWidth", 6) + .attr("markerHeight", 6) + .attr("orient", "auto") + .append("path") + .attr("d", "M0,-5L10,0L0,5") + .attr("fill", "#c76c6c"); + + const contentGroup = svg.append("g").attr("class", "graph-content"); + const linkGroup = contentGroup.append("g").attr("class", "graph-links"); + const nodeGroup = contentGroup.append("g").attr("class", "graph-nodes"); + const labelGroup = contentGroup.append("g").attr("class", "graph-labels"); + + const links = data.links || []; + const nodes = data.nodes || []; + + currentChannelStyles = new Map(); + const uniqueChannels = []; + nodes.forEach((node) => { + const label = getChannelLabel(node); + if (!currentChannelStyles.has(label)) { + uniqueChannels.push(label); + } + }); + + const additionalPatternCount = ADDITIONAL_PATTERNS.length; + uniqueChannels.forEach((label, idx) => { + const baseColor = color(label); + let patternKey = "none"; + if (idx >= paletteSizeDefault && additionalPatternCount > 0) { + const patternInfo = + ADDITIONAL_PATTERNS[(idx - paletteSizeDefault) % additionalPatternCount]; + patternKey = patternInfo.key; + } + const style = createChannelStyle(label, baseColor, patternKey); + currentChannelStyles.set(label, style); + }); + + const linkSelection = linkGroup + .selectAll("line") + .data(links) + .enter() + .append("line") + .attr("stroke-width", 1.2) + .attr("stroke", (d) => + d.relation === "references" ? "#6c83c7" : "#c76c6c" + ) + .attr("stroke-opacity", 0.7) + .attr("marker-end", (d) => + d.relation === "references" ? 
"url(#arrow-references)" : "url(#arrow-referenced-by)" + ); + + let nodePatternCounter = 0; + const nodePatternRefs = new Map(); + + const getNodeFill = (node) => { + const style = currentChannelStyles.get(getChannelLabel(node)); + if (!style) { + return color(getChannelLabel(node)); + } + if (!style.hatch || style.hatch === "none") { + return style.baseColor; + } + const patternId = `node-pattern-${nodePatternCounter++}`; + const pattern = defs + .append("pattern") + .attr("id", patternId) + .attr("patternUnits", "userSpaceOnUse") + .attr("width", 8) + .attr("height", 8); + appendPatternContent(pattern, style.baseColor, style.hatch); + pattern.attr("patternTransform", "translate(0,0)"); + nodePatternRefs.set(node.id, pattern); + return `url(#${patternId})`; + }; + + const nodeSelection = nodeGroup + .selectAll("circle") + .data(nodes, (d) => d.id) + .enter() + .append("circle") + .attr("r", (d) => (d.is_root ? 10 : 7)) + .attr("fill", (d) => getNodeFill(d)) + .attr("stroke", "#1f1f1f") + .attr("stroke-width", (d) => (d.is_root ? 
2 : 1)) + .call( + d3 + .drag() + .on("start", (event, d) => { + if (!event.active) simulation.alphaTarget(0.3).restart(); + d.fx = d.x; + d.fy = d.y; + }) + .on("drag", (event, d) => { + d.fx = event.x; + d.fy = event.y; + }) + .on("end", (event, d) => { + if (!event.active) simulation.alphaTarget(0); + d.fx = null; + d.fy = null; + }) + ) + .on("click", (event, d) => { + if (d.url) { + window.open(d.url, "_blank", "noopener"); + } + }) + .on("contextmenu", (event, d) => { + event.preventDefault(); + loadGraph(d.id, currentDepth, currentMaxNodes, { updateInputs: true }); + }); + + nodeSelection + .append("title") + .text((d) => { + const parts = []; + parts.push(d.title || d.id); + if (d.channel_name) { + parts.push(`Channel: ${d.channel_name}`); + } + if (d.date) { + parts.push(`Date: ${d.date}`); + } + return parts.join("\n"); + }); + + const labelSelection = labelGroup + .selectAll("text") + .data(nodes, (d) => d.id) + .enter() + .append("text") + .attr("class", "graph-node-label") + .attr("text-anchor", "middle") + .attr("fill", "#1f1f1f") + .attr("pointer-events", "none") + .text((d) => d.title || d.id); + + applyLabelAppearance(labelSelection, labelSize); + + const simulation = d3 + .forceSimulation(nodes) + .force( + "link", + d3 + .forceLink(links) + .id((d) => d.id) + .distance(120) + .strength(0.8) + ) + .force("charge", d3.forceManyBody().strength(-320)) + .force("center", d3.forceCenter(width / 2, height / 2)) + .force( + "collide", + d3.forceCollide().radius((d) => (d.is_root ? 20 : 14)).iterations(2) + ); + + simulation.on("tick", () => { + linkSelection + .attr("x1", (d) => d.source.x) + .attr("y1", (d) => d.source.y) + .attr("x2", (d) => d.target.x) + .attr("y2", (d) => d.target.y); + + nodeSelection.attr("cx", (d) => d.x).attr("cy", (d) => d.y); + + labelSelection.attr("x", (d) => d.x).attr("y", (d) => d.y - (d.is_root ? 
14 : 12)); + + nodeSelection.each(function (d) { + const pattern = nodePatternRefs.get(d.id); + if (pattern) { + const safeX = Number.isFinite(d.x) ? d.x : 0; + const safeY = Number.isFinite(d.y) ? d.y : 0; + pattern.attr("patternTransform", `translate(${safeX}, ${safeY})`); + } + }); + }); + + const zoomBehavior = d3 + .zoom() + .scaleExtent([0.3, 3]) + .on("zoom", (event) => { + contentGroup.attr("transform", event.transform); + }); + + svg.call(zoomBehavior); + currentSimulation = simulation; + } + + async function loadGraph(videoId, depth, maxNodes, { updateInputs = false } = {}) { + const sanitizedId = sanitizeId(videoId); + if (!sanitizedId) { + setStatus("Please enter a video ID.", true); + return; + } + const safeDepth = sanitizeDepth(depth); + const safeMaxNodes = sanitizeMaxNodes(maxNodes); + + if (updateInputs) { + videoInput.value = sanitizedId; + depthInput.value = String(safeDepth); + maxNodesInput.value = String(safeMaxNodes); + } + + setStatus("Loading graph…"); + try { + const data = await fetchGraph(sanitizedId, safeDepth, safeMaxNodes); + if (!data.nodes || data.nodes.length === 0) { + setStatus("No nodes returned for this video.", true); + container.innerHTML = ""; + currentGraphData = null; + currentChannelStyles = new Map(); + renderLegend([]); + return; + } + currentGraphData = data; + currentDepth = safeDepth; + currentMaxNodes = safeMaxNodes; + renderGraph(data, getLabelSize()); + renderLegend(data.nodes); + setStatus( + `Showing ${data.nodes.length} nodes and ${data.links.length} links (depth ${data.depth})` + ); + updateUrlState(sanitizedId, safeDepth, safeMaxNodes, getLabelSize()); + } catch (err) { + console.error(err); + setStatus(err.message || "Failed to build graph.", true); + container.innerHTML = ""; + currentGraphData = null; + currentChannelStyles = new Map(); + renderLegend([]); + } + } + + async function handleSubmit(event) { + event.preventDefault(); + await loadGraph(videoInput.value, depthInput.value, maxNodesInput.value, { 
+ updateInputs: true, + }); + } + /** Rebuild the legend element (created on first use and placed after the status element, or before the container): a fixed "Edges" section plus up to 20 channels taken from currentChannelStyles. NOTE(review): the nodes argument is not read inside this function. */ + function renderLegend(nodes) { + let legend = document.getElementById("graphLegend"); + if (!legend) { + legend = document.createElement("div"); + legend.id = "graphLegend"; + legend.className = "graph-legend"; + if (statusEl && statusEl.parentNode) { + statusEl.insertAdjacentElement("afterend", legend); + } else { + container.parentElement?.insertBefore(legend, container); + } + } + + legend.innerHTML = ""; + + const edgesSection = document.createElement("div"); + edgesSection.className = "graph-legend-section"; + + const edgesTitle = document.createElement("div"); + edgesTitle.className = "graph-legend-title"; + edgesTitle.textContent = "Edges"; + edgesSection.appendChild(edgesTitle); + /* One legend row: a coloured swatch followed by its description. */ + const createEdgeRow = (swatchClass, text) => { + const row = document.createElement("div"); + row.className = "graph-legend-row"; + const swatch = document.createElement("span"); + swatch.className = `graph-legend-swatch ${swatchClass}`; + const label = document.createElement("span"); + label.textContent = text; + row.appendChild(swatch); + row.appendChild(label); + return row; + }; + + edgesSection.appendChild( + createEdgeRow( + "graph-legend-swatch--references", + "Outgoing reference (video references other)" + ) + ); + edgesSection.appendChild( + createEdgeRow( + "graph-legend-swatch--referenced", + "Incoming reference (other video references this)" + ) + ); + legend.appendChild(edgesSection); + + const channelSection = document.createElement("div"); + channelSection.className = "graph-legend-section"; + const channelTitle = document.createElement("div"); + channelTitle.className = "graph-legend-title"; + channelTitle.textContent = "Channels in view"; + channelSection.appendChild(channelTitle); + + const channelList = document.createElement("div"); + channelList.className = "graph-legend-channel-list"; + /* Channels are listed alphabetically, ignoring case. */ + const channelEntries = Array.from(currentChannelStyles.entries()).sort((a, b) => + a[0].localeCompare(b[0], undefined, { sensitivity:
"base" }) + ); + const maxChannelItems = 20; + + channelEntries.slice(0, maxChannelItems).forEach(([label, style]) => { + const item = document.createElement("div"); + item.className = `graph-legend-channel graph-legend-channel--${ + style.legendClass || "none" + }`; + const swatch = document.createElement("span"); + swatch.className = "graph-legend-swatch graph-legend-channel-swatch"; + swatch.style.backgroundColor = style.baseColor; + const text = document.createElement("span"); + text.textContent = label; + item.appendChild(swatch); + item.appendChild(text); + channelList.appendChild(item); + }); + /* Anything beyond the first 20 channels is summarised in a "+N more channels" note. */ + const totalChannels = channelEntries.length; + if (channelList.childElementCount) { + channelSection.appendChild(channelList); + if (totalChannels > maxChannelItems) { + const note = document.createElement("div"); + note.className = "graph-legend-note"; + note.textContent = `+${totalChannels - maxChannelItems} more channels`; + channelSection.appendChild(note); + } + } else { + const empty = document.createElement("div"); + empty.className = "graph-legend-note"; + empty.textContent = "No channel data available."; + channelSection.appendChild(empty); + } + + legend.appendChild(channelSection); + } + /** Hide the labels when labelSize is "off"; otherwise show them at the font size mapped in LABEL_FONT_SIZES, falling back to its "normal" entry. */ + function applyLabelAppearance(selection, labelSize) { + if (labelSize === "off") { + selection.style("display", "none"); + } else { + selection + .style("display", null) + .attr("font-size", LABEL_FONT_SIZES[labelSize] || LABEL_FONT_SIZES.normal); + } + } + /** Mirror video id, depth, max nodes and (non-default) label size into the page URL via history.replaceState; skipped entirely when isEmbedded is set. */ + function updateUrlState(videoId, depth, maxNodes, labelSize) { + if (isEmbedded) { + return; + } + const next = new URL(window.location.href); + next.searchParams.set("video_id", videoId); + next.searchParams.set("depth", String(depth)); + next.searchParams.set("max_nodes", String(maxNodes)); + if (labelSize && labelSize !== "normal") { + next.searchParams.set("label_size", labelSize); + } else { + next.searchParams.delete("label_size"); + } + history.replaceState({}, "", next.toString()); + } + /** Seed the form from the URL query string and, when a video id is present and the page is not embedded, load that graph. */ + function
initFromQuery() { + const params = new URLSearchParams(window.location.search); + const videoId = sanitizeId(params.get("video_id")); + const depth = sanitizeDepth(params.get("depth") || ""); + const maxNodes = sanitizeMaxNodes(params.get("max_nodes") || ""); + const labelSizeParam = params.get("label_size"); + if (videoId) { + videoInput.value = videoId; + } + depthInput.value = String(depth); + maxNodesInput.value = String(maxNodes); + if (labelSizeParam && isValidLabelSize(labelSizeParam)) { + setLabelSizeInput(labelSizeParam); + } else { + setLabelSizeInput(getLabelSize()); + } + if (!videoId || isEmbedded) { + return; + } + loadGraph(videoId, depth, maxNodes, { updateInputs: false }); + } + /* Wire-up: size the container now and on window resize, handle form submits, and re-render the current graph (and refresh the URL) whenever the label-size control changes. */ + resizeContainer(); + window.addEventListener("resize", resizeContainer); + form.addEventListener("submit", handleSubmit); + labelSizeInput.addEventListener("change", () => { + const size = getLabelSize(); + if (currentGraphData) { + renderGraph(currentGraphData, size); + renderLegend(currentGraphData.nodes); + } + updateUrlState( + sanitizeId(videoInput.value), + currentDepth, + currentMaxNodes, + size + ); + }); + initFromQuery(); + /* Methods published on the GraphUI object. */ + Object.assign(GraphUI, { + load(videoId, depth, maxNodes, options = {}) { /* depth / maxNodes default to the current values; form inputs are updated unless options.updateInputs === false */ + const targetDepth = depth != null ? depth : currentDepth; + const targetMax = maxNodes != null ?
maxNodes : currentMaxNodes; + return loadGraph(videoId, targetDepth, targetMax, { + updateInputs: options.updateInputs !== false, + }); + }, + setLabelSize(size) { + if (!labelSizeInput || !size) return; + setLabelSizeInput(size); + labelSizeInput.dispatchEvent(new Event("change", { bubbles: true })); + }, + setDepth(value) { + if (!depthInput) return; + const safe = sanitizeDepth(value); + depthInput.value = String(safe); + currentDepth = safe; + }, + setMaxNodes(value) { + if (!maxNodesInput) return; + const safe = sanitizeMaxNodes(value); + maxNodesInput.value = String(safe); + currentMaxNodes = safe; + }, + focusInput() { + if (videoInput) { + videoInput.focus(); + videoInput.select(); + } + }, + stop() { + if (currentSimulation) { + currentSimulation.stop(); + currentSimulation = null; + } + }, + getState() { + return { + depth: currentDepth, + maxNodes: currentMaxNodes, + labelSize: getLabelSize(), + nodes: currentGraphData ? currentGraphData.nodes.slice() : [], + links: currentGraphData ? currentGraphData.links.slice() : [], + }; + }, + isEmbedded, + }); + GraphUI.ready = true; + setTimeout(() => { + window.dispatchEvent(new CustomEvent("graph-ui-ready")); + }, 0); +})(); diff --git a/static/index.html b/static/index.html index c3445e2..217b382 100644 --- a/static/index.html +++ b/static/index.html @@ -14,6 +14,7 @@
This Little Corner
+ @@ -21,6 +22,10 @@

Enter a phrase to query title, description, and transcript text.

+

+ Looking for semantic matches? Try the + vector search beta. +

Search @@ -31,13 +36,10 @@
- -
- All Channels -
-
Loading channels…
-
-
+ + + Match all terms exactly.
+ Allow small typos and variations.
+ Boost exact phrases inside transcripts.
+ Use raw Lucene syntax (overrides other toggles).
@@ -107,11 +113,110 @@ -
-

Ready

+
+

Ready

+
+
+ + + + + diff --git a/static/style.css b/static/style.css index 448f8aa..fc129d8 100644 --- a/static/style.css +++ b/static/style.css @@ -63,7 +63,7 @@ body.dimmed { } .field-row input[type="text"], - .field-row .channel-dropdown { + .field-row select#channel { flex: 1 1 100% !important; min-width: 0 !important; max-width: 100% !important; @@ -86,37 +86,18 @@ body.dimmed { max-width: 100%; min-width: 100%; } -} -/* Channel dropdown custom styling */ -.channel-dropdown { - position: relative; - display: inline-block; -} + .graph-controls { + flex-direction: column; + align-items: stretch; + } -.channel-dropdown summary { - list-style: none; - cursor: pointer; - padding: 3px 4px; - background: ButtonFace; - border: 1px solid; - border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight; - min-width: 180px; - text-align: left; -} - -.channel-dropdown summary::-webkit-details-marker { - display: none; -} - -.channel-dropdown summary::after { - content: ' ▼'; - font-size: 8px; - float: right; -} - -.channel-dropdown[open] summary::after { - content: ' ▲'; + .graph-controls .field-group, + .graph-controls input, + .graph-controls select { + width: 100%; + min-width: 0; + } } .toggle-row { @@ -174,32 +155,6 @@ body.dimmed { overflow-y: auto; } -.channel-options { - position: absolute; - margin-top: 2px; - padding: 4px; - background: ButtonFace; - border: 1px solid; - border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight; - max-height: 300px; - overflow-y: auto; - box-shadow: 2px 2px 0 rgba(0, 0, 0, 0.2); - z-index: 100; - min-width: 220px; -} - -.channel-option { - display: flex; - align-items: center; - gap: 6px; - margin-bottom: 4px; - font-size: 11px; -} - -.channel-option:last-child { - margin-bottom: 0; -} - /* Layout helpers */ .summary-row { display: flex; @@ -218,6 +173,344 @@ body.dimmed { min-width: 300px; } +.graph-window { + width: 95%; +} + +.graph-controls { + display: flex; + flex-wrap: wrap; + gap: 12px; + align-items: 
flex-end; + margin-bottom: 12px; +} + +.graph-controls .field-group { + display: flex; + flex-direction: column; + gap: 4px; +} + +.graph-controls label { + font-size: 11px; + font-weight: bold; +} + +.graph-controls input, +.graph-controls select { + min-width: 160px; +} + +.graph-status { + font-size: 11px; + margin-bottom: 8px; + color: #1f1f1f; +} + +.graph-status.error { + color: #b00020; +} + +.graph-container { + background: Window; + border: 1px solid #919b9c; + box-shadow: inset -1px -1px #0a0a0a, inset 1px 1px #fff; + position: relative; + width: 100%; + min-height: 520px; + height: auto; + overflow: visible; +} + +.graph-modal-overlay { + position: fixed; + inset: 0; + display: none; + align-items: center; + justify-content: center; + padding: 24px; + background: rgba(0, 0, 0, 0.35); + z-index: 2000; +} + +.graph-modal-overlay.active { + display: flex; +} + +.graph-modal-window { + width: min(960px, 100%); + max-height: calc(100vh - 48px); +} + +.graph-modal-window .window-body { + max-height: calc(100vh - 180px); + overflow-y: auto; +} + +.graph-modal-window .graph-container { + height: 560px; +} + +body.modal-open { + overflow: hidden; +} + +.result-header { + display: flex; + justify-content: flex-start; + gap: 6px; + flex-wrap: wrap; + align-items: flex-start; +} + +.result-header-main { + flex: 1 1 auto; + min-width: 220px; +} + +.result-actions { + display: flex; + align-items: flex-start; + gap: 6px; + margin-left: auto; +} + +.result-action-btn { + white-space: nowrap; + font-family: "Tahoma", "MS Sans Serif", sans-serif; + font-size: 11px; + padding: 4px 10px; +} + +.result-meta { + display: flex; + align-items: center; + flex-wrap: wrap; + gap: 4px; +} + +.result-status { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 1px 6px; + border-radius: 3px; + font-size: 10px; + line-height: 1.3; + border: 1px solid #c4a3a3; + background: #fff6f6; + color: #6b1f1f; +} + +.result-status::before { + content: "⚠"; + font-size: 10px; + 
line-height: 1; +} + +.result-status--deleted { + border-color: #d1a6a6; + background: #fff8f8; + color: #6b1f1f; +} + +.graph-launch-btn { + white-space: nowrap; +} + +.graph-node-label { + text-shadow: -1px -1px 0 #fff, 1px -1px 0 #fff, -1px 1px 0 #fff, 1px 1px 0 #fff; +} + +.graph-nodes circle { + cursor: pointer; +} + +.graph-legend { + margin: 12px 0; + font-size: 11px; + background: Window; + border: 1px solid #919b9c; + padding: 8px 10px; + display: inline-flex; + flex-direction: column; + gap: 4px; + box-shadow: inset -1px -1px #0a0a0a, inset 1px 1px #fff; +} + +.graph-legend-section { + display: flex; + flex-direction: column; + gap: 4px; +} + +.graph-legend-title { + font-weight: bold; + color: #1f1f1f; +} + +.graph-legend-row { + display: flex; + align-items: center; + gap: 8px; +} + +.graph-legend-swatch { + display: inline-block; + width: 18px; + height: 12px; + border: 1px solid #1f1f1f; +} + +.graph-legend-swatch--references { + background: #6c83c7; +} + +.graph-legend-swatch--referenced { + background: #c76c6c; +} + +.graph-legend-channel-list { + display: flex; + flex-wrap: wrap; + gap: 8px; +} + +.graph-legend-channel { + display: flex; + align-items: center; + gap: 6px; +} + +.graph-legend-channel-swatch { + width: 14px; + height: 14px; + background-repeat: repeat; + background-position: 0 0; + background-size: 6px 6px; +} + +.graph-legend-channel--none .graph-legend-channel-swatch { + background-image: none; +} + +.graph-legend-channel--diag-forward .graph-legend-channel-swatch { + background-image: repeating-linear-gradient( + 45deg, + rgba(0, 0, 0, 0.35) 0, + rgba(0, 0, 0, 0.35) 2px, + transparent 2px, + transparent 4px + ); + background-blend-mode: multiply; +} + +.graph-legend-channel--diag-back .graph-legend-channel-swatch { + background-image: repeating-linear-gradient( + -45deg, + rgba(0, 0, 0, 0.35) 0, + rgba(0, 0, 0, 0.35) 2px, + transparent 2px, + transparent 4px + ); + background-blend-mode: multiply; +} + 
+.graph-legend-channel--cross .graph-legend-channel-swatch { + background-image: + repeating-linear-gradient( + 45deg, + rgba(0, 0, 0, 0.25) 0, + rgba(0, 0, 0, 0.25) 2px, + transparent 2px, + transparent 4px + ), + repeating-linear-gradient( + -45deg, + rgba(0, 0, 0, 0.25) 0, + rgba(0, 0, 0, 0.25) 2px, + transparent 2px, + transparent 4px + ); + background-blend-mode: multiply; +} + +.graph-legend-channel--dots .graph-legend-channel-swatch { + background-image: radial-gradient(rgba(0, 0, 0, 0.35) 30%, transparent 31%); + background-size: 6px 6px; + background-blend-mode: multiply; +} + +.graph-legend-note { + font-size: 10px; + color: #555; + font-style: italic; +} + +.title-bar-link { + display: inline-block; + color: inherit; + text-decoration: none; + font-size: 11px; + padding: 2px 6px; + border: 1px solid; + border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight; + background: ButtonFace; +} + +.title-bar-controls #aboutBtn { + font-weight: bold; + font-size: 12px; + padding: 0 6px; + margin-right: 6px; +} + +.toggle-item { + display: flex; + align-items: center; + gap: 6px; +} + +.toggle-help { + font-size: 10px; + color: #555; +} + +.about-panel { + position: fixed; + top: 20px; + right: 20px; + width: 280px; + background: Window; + border: 2px solid #919b9c; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.25); + z-index: 2100; + font-size: 11px; +} + +.about-panel__header { + display: flex; + justify-content: space-between; + align-items: center; + padding: 6px 8px; + background: #0055aa; + color: #fff; +} + +.about-panel__body { + padding: 8px; + background: Window; + color: #000; +} + +.about-panel__header button { + border: none; + background: transparent; + color: inherit; + font-weight: bold; + cursor: pointer; +} + /* Results styling */ #results .item { background: Window; @@ -227,6 +520,7 @@ body.dimmed { max-width: 100%; overflow: hidden; word-wrap: break-word; + box-sizing: border-box; box-shadow: 2px 2px 0 rgba(0, 0, 0, 0.15); } @@ 
-243,7 +537,9 @@ body.dimmed { .window-body { max-width: 100%; overflow-x: hidden; - margin: 1rem; + margin: 0; + padding: 1rem; + box-sizing: border-box; } /* Badges */ @@ -267,6 +563,14 @@ body.dimmed { word-break: keep-all; } +.badge--transcript-primary { + background: #0b6efd; +} + +.badge--transcript-secondary { + background: #8f4bff; +} + .badge-clickable { cursor: pointer; } @@ -297,9 +601,14 @@ body.dimmed { } .highlight-row { - padding: 4px; + padding: 4px 6px; cursor: pointer; border: 1px solid transparent; + display: flex; + align-items: flex-start; + gap: 8px; + max-width: 100%; + box-sizing: border-box; } .highlight-row:hover { @@ -308,6 +617,77 @@ body.dimmed { border: 1px dotted WindowText; } +.highlight-text { + flex: 1 1 auto; + word-break: break-word; + overflow-wrap: anywhere; +} + +.highlight-source-indicator { + width: 10px; + height: 10px; + border-radius: 2px; + border: 1px solid transparent; + margin-left: auto; + flex: 0 0 auto; +} + +.highlight-source-indicator--primary { + background: #0b6efd; + border-color: #084bb5; +} + +.highlight-source-indicator--secondary { + background: #8f4bff; + border-color: #5d2db3; +} + +.vector-chunk { + margin-top: 8px; + padding: 8px; + background: #f3f7ff; + border: 1px solid #c7d0e2; + font-size: 11px; + line-height: 1.5; + word-break: break-word; +} + +@media screen and (max-width: 640px) { + .result-header { + flex-direction: column; + gap: 6px; + } + + .result-header-main { + flex: 1 1 auto; + min-width: 0; + width: 100%; + } + + .result-actions { + width: auto; + align-self: flex-start; + justify-content: flex-start; + flex-wrap: wrap; + gap: 4px; + margin-left: 0; + } + + .result-action-btn { + width: 100%; + text-align: left; + } + + .highlight-row { + flex-direction: column; + gap: 4px; + } + + .highlight-source-indicator { + align-self: flex-end; + } +} + mark { background: yellow; color: black; @@ -334,6 +714,10 @@ mark { border-bottom: 1px solid ButtonShadow; } +.transcript-segment--matched { + 
background: #fff6cc; +} + .transcript-segment:last-child { border-bottom: none; margin-bottom: 0; diff --git a/static/vector.html b/static/vector.html new file mode 100644 index 0000000..2f9fc5f --- /dev/null +++ b/static/vector.html @@ -0,0 +1,46 @@ + + + + + + TLC Vector Search + + + + + +
+
+
Vector Search (Experimental)
+ +
+
+

Enter a natural language prompt; results come from the Qdrant vector index.

+ +
+ Vector Query +
+ + + +
+
+ +
+ +
+ Results +
+
+
+ +
+

Experimental mode • Qdrant

+
+
+ + + + diff --git a/static/vector.js b/static/vector.js new file mode 100644 index 0000000..ce91865 --- /dev/null +++ b/static/vector.js @@ -0,0 +1,423 @@ +(() => { + const queryInput = document.getElementById("vectorQuery"); + const searchBtn = document.getElementById("vectorSearchBtn"); + const resultsDiv = document.getElementById("vectorResults"); + const metaDiv = document.getElementById("vectorMeta"); + const transcriptCache = new Map(); + + if (!queryInput || !searchBtn || !resultsDiv || !metaDiv) { + console.error("Vector search elements missing"); + return; + } + + /** Utility helpers **/ + const escapeHtml = (str) => + (str || "").replace(/[&<>"']/g, (ch) => { + switch (ch) { + case "&": + return "&"; + case "<": + return "<"; + case ">": + return ">"; + case '"': + return """; + case "'": + return "'"; + default: + return ch; + } + }); + + const fmtDate = (value) => { + try { + return (value || "").split("T")[0]; + } catch { + return value; + } + }; + + const fmtSimilarity = (score) => { + if (typeof score !== "number" || Number.isNaN(score)) return ""; + return score.toFixed(3); + }; + + const getVideoStatus = (item) => + (item && item.video_status ? 
String(item.video_status).toLowerCase() : ""); + const isLikelyDeleted = (item) => getVideoStatus(item) === "deleted"; + /** Format seconds as m:ss, or h:mm:ss once an hour is reached; null, undefined and NaN give "00:00". */ + const formatTimestamp = (seconds) => { + if (!seconds && seconds !== 0) return "00:00"; + const hours = Math.floor(seconds / 3600); + const mins = Math.floor((seconds % 3600) / 60); + const secs = Math.floor(seconds % 60); + if (hours > 0) { + return `${hours}:${mins.toString().padStart(2, "0")}:${secs + .toString() + .padStart(2, "0")}`; + } + return `${mins}:${secs.toString().padStart(2, "0")}`; + }; + /** Timestamp label for a transcript segment: its own timestamp field when set, otherwise the first of start_seconds / start / offset / time that parses as a number, formatted; "" when none does. */ + const formatSegmentTimestamp = (segment) => { + if (!segment) return ""; + if (segment.timestamp) return segment.timestamp; + const fields = [ + segment.start_seconds, + segment.start, + segment.offset, + segment.time, + ]; + for (const value of fields) { + if (value == null) continue; + const num = parseFloat(value); + if (!Number.isNaN(num)) { + return formatTimestamp(num); + } + } + return ""; + }; + /** Build one labelled transcript section: the trimmed full text when given, otherwise the parts joined one per line with a [timestamp] prefix where available; "" when there is no content. */ + const serializeTranscriptSection = (label, parts, fullText) => { + let content = ""; + if (typeof fullText === "string" && fullText.trim()) { + content = fullText.trim(); + } else if (Array.isArray(parts) && parts.length) { + content = parts + .map((segment) => { + const ts = formatSegmentTimestamp(segment); + const text = segment && segment.text ? segment.text : ""; + return ts ?
`[${ts}] ${text}` : text; + }) + .join("\n") + .trim(); + } + if (!content) return ""; + return `${label}\n${content}\n`; + }; + /** Fetch /api/transcript for a video, caching successful responses per video id; resolves null when no id is given and throws on a non-OK response. */ + const fetchTranscriptData = async (videoId) => { + if (!videoId) return null; + if (transcriptCache.has(videoId)) { + return transcriptCache.get(videoId); + } + const res = await fetch(`/api/transcript?video_id=${encodeURIComponent(videoId)}`); + if (!res.ok) { + throw new Error(`Transcript fetch failed (${res.status})`); + } + const data = await res.json(); + transcriptCache.set(videoId, data); + return data; + }; + /** Plain-text export: a metadata header (title, plus channel, date and URL when present) followed by the primary and secondary transcript sections, or a "No transcript available." line when both are empty. */ + const buildTranscriptDownloadText = (item, transcriptData) => { + const lines = []; + lines.push(`Title: ${item.title || "Untitled"}`); + if (item.channel_name) lines.push(`Channel: ${item.channel_name}`); + if (item.date) lines.push(`Published: ${item.date}`); + if (item.url) lines.push(`URL: ${item.url}`); + lines.push(""); + + const primaryText = serializeTranscriptSection( + "Primary Transcript", + transcriptData.transcript_parts, + transcriptData.transcript_full + ); + const secondaryText = serializeTranscriptSection( + "Secondary Transcript", + transcriptData.transcript_secondary_parts, + transcriptData.transcript_secondary_full + ); + + if (primaryText) lines.push(primaryText); + if (secondaryText) lines.push(secondaryText); + if (!primaryText && !secondaryText) { + lines.push("No transcript available."); + } + return lines.join("\n").trim() + "\n"; + }; + /** Temporarily replace a button's label with message, restoring the original (remembered in dataset.originalLabel) after duration ms. */ + const flashButtonMessage = (button, message, duration = 1800) => { + if (!button) return; + const original = button.dataset.originalLabel || button.textContent; + button.dataset.originalLabel = original; + button.textContent = message; + setTimeout(() => { + button.textContent = button.dataset.originalLabel || original; + }, duration); + }; + /** Download the item's transcript as <video_id>.txt through a temporary object URL; the button is disabled while the request runs and an alert is shown on failure. */ + const handleTranscriptDownload = async (item, button) => { + if (!item.video_id) return; + button.disabled = true; + try { + const transcriptData = await fetchTranscriptData(item.video_id); + if (!transcriptData) throw
new Error("Transcript unavailable"); + const text = buildTranscriptDownloadText(item, transcriptData); + const blob = new Blob([text], { type: "text/plain" }); + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + link.download = `${item.video_id}.txt`; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(url); + flashButtonMessage(button, "Downloaded"); + } catch (err) { + console.error("Download failed", err); + alert("Unable to download transcript right now."); + } finally { + button.disabled = false; + } + }; + + const formatMlaDate = (value) => { + if (!value) return "n.d."; + const parsed = new Date(value); + if (Number.isNaN(parsed.valueOf())) return value; + const months = [ + "Jan.", "Feb.", "Mar.", "Apr.", "May", "June", + "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.", + ]; + return `${parsed.getDate()} ${months[parsed.getMonth()]} ${parsed.getFullYear()}`; + }; + + const buildMlaCitation = (item) => { + const channel = (item.channel_name || item.channel_id || "Unknown").trim(); + const title = (item.title || "Untitled").trim(); + const url = item.url || ""; + const publishDate = formatMlaDate(item.date); + const today = formatMlaDate(new Date().toISOString().split("T")[0]); + return `${channel}. "${title}." YouTube, uploaded by ${channel}, ${publishDate}, ${url}. 
Accessed ${today}.`; + }; + /** Copy the MLA citation to the clipboard (Clipboard API in secure contexts, hidden textarea + execCommand otherwise) and flash "Copied!"; if copying throws, the citation is shown in an alert instead. */ + const handleCopyCitation = async (item, button) => { + const citation = buildMlaCitation(item); + try { + if (navigator.clipboard && window.isSecureContext) { + await navigator.clipboard.writeText(citation); + } else { + const textarea = document.createElement("textarea"); + textarea.value = citation; + textarea.style.position = "fixed"; + textarea.style.opacity = "0"; + document.body.appendChild(textarea); + textarea.select(); + document.execCommand("copy"); + document.body.removeChild(textarea); + } + flashButtonMessage(button, "Copied!"); + } catch (err) { + console.error("Citation copy failed", err); + alert(citation); + } + }; + + /** Rendering helpers **/ + const createHighlightRows = (entries) => { /* returns a container with one row per entry, or null when entries is empty or not an array. NOTE(review): entry.html / entry.text / entry is assigned to innerHTML without escaping - confirm the API only returns sanitized markup. */ + if (!Array.isArray(entries) || !entries.length) return null; + const container = document.createElement("div"); + container.className = "transcript highlight-list"; + entries.forEach((entry) => { + if (!entry) return; + const row = document.createElement("div"); + row.className = "highlight-row"; + const textBlock = document.createElement("div"); + textBlock.className = "highlight-text"; + const html = entry.html || entry.text || entry; + textBlock.innerHTML = html || ""; + row.appendChild(textBlock); + const indicator = document.createElement("span"); + indicator.className = "highlight-source-indicator highlight-source-indicator--primary"; + indicator.title = "Vector highlight"; + row.appendChild(indicator); + container.appendChild(row); + }); + return container; + }; + /** Build the per-result action buttons: transcript download, citation copy, and a Graph button (disabled without a video_id) that opens /graph for the video in a new tab. */ + const createActions = (item) => { + const actions = document.createElement("div"); + actions.className = "result-actions"; + const downloadBtn = document.createElement("button"); + downloadBtn.type = "button"; + downloadBtn.className = "result-action-btn"; + downloadBtn.textContent = "Download transcript"; + downloadBtn.addEventListener("click", () => handleTranscriptDownload(item, downloadBtn)); + actions.appendChild(downloadBtn); + + const citationBtn =
/** Parse a duration given as a number or numeric string; null when unusable. */
const parseDurationSeconds = (raw) => {
  if (typeof raw === "number") return raw;
  if (typeof raw === "string" && raw.trim()) {
    const parsed = parseFloat(raw);
    if (!Number.isNaN(parsed)) return parsed;
  }
  return null;
};

/** Build the "channel • date • duration" line, plus the deleted badge. */
const createResultMetaLine = (item) => {
  const metaLine = document.createElement("div");
  metaLine.className = "muted result-meta";
  // The "Unknown" fallback keeps the label non-empty, so there is no
  // label-less variant of this line to handle.
  const channelLabel = item.channel_name || item.channel_id || "Unknown";
  const durationSeconds = parseDurationSeconds(item.duration);
  const durationLabel =
    durationSeconds != null ? ` • ${formatTimestamp(durationSeconds)}` : "";
  metaLine.textContent = `${channelLabel} • ${fmtDate(item.date)}${durationLabel}`;

  if (isLikelyDeleted(item)) {
    metaLine.appendChild(document.createTextNode(" "));
    const statusEl = document.createElement("span");
    statusEl.className = "result-status result-status--deleted";
    statusEl.textContent = "Likely deleted";
    metaLine.appendChild(statusEl);
  }
  return metaLine;
};

/** Build the matched transcript chunk, with a timestamp deep link if possible. */
const createChunkBlock = (item) => {
  const chunkBlock = document.createElement("div");
  chunkBlock.className = "vector-chunk";

  const rawTs = item.chunkTimestamp;
  // Compare against null/"" rather than truthiness so that a chunk starting
  // at 0 seconds still gets its link.
  const hasTimestamp = rawTs != null && rawTs !== "";
  if (hasTimestamp && item.url) {
    const tsObj = typeof rawTs === "object" ? rawTs : { timestamp: rawTs };
    const label = formatSegmentTimestamp(tsObj);
    // NOTE(review): for object-valued timestamps this reads `.timestamp`,
    // mirroring the wrapper shape used for scalars — confirm the field name.
    const seconds =
      typeof tsObj.timestamp === "number" ? Math.floor(tsObj.timestamp) : tsObj.timestamp;

    let href = item.url;
    if (seconds != null && seconds !== "") {
      const separator = item.url.includes("?") ? "&" : "?";
      href += `${separator}t=${encodeURIComponent(seconds)}`;
    }

    const tsLink = document.createElement("a");
    tsLink.href = href;
    tsLink.target = "_blank";
    tsLink.rel = "noopener";
    tsLink.textContent = label ? `[${label}]` : "[timestamp]";
    chunkBlock.appendChild(tsLink);
    chunkBlock.appendChild(document.createTextNode(" "));
  }

  const chunkTextSpan = document.createElement("span");
  chunkTextSpan.textContent = item.chunkText;
  chunkBlock.appendChild(chunkTextSpan);
  return chunkBlock;
};

/** Build the full DOM node for one vector search result. */
const createVectorResultItem = (item) => {
  const el = document.createElement("div");
  el.className = "item";

  const header = document.createElement("div");
  header.className = "result-header";
  const headerMain = document.createElement("div");
  headerMain.className = "result-header-main";

  const titleEl = document.createElement("strong");
  titleEl.innerHTML = item.titleHtml || escapeHtml(item.title || "Untitled");
  headerMain.appendChild(titleEl);
  headerMain.appendChild(createResultMetaLine(item));

  if (item.url) {
    const linkLine = document.createElement("div");
    linkLine.className = "muted";
    const anchor = document.createElement("a");
    anchor.href = item.url;
    anchor.target = "_blank";
    anchor.rel = "noopener";
    anchor.textContent = "Open on YouTube";
    linkLine.appendChild(anchor);
    headerMain.appendChild(linkLine);
  }

  if (typeof item.distance === "number") {
    const scoreLine = document.createElement("div");
    scoreLine.className = "muted";
    scoreLine.textContent = `Similarity score: ${fmtSimilarity(item.distance)}`;
    headerMain.appendChild(scoreLine);
  }

  header.appendChild(headerMain);
  header.appendChild(createActions(item));
  el.appendChild(header);

  if (item.descriptionHtml || item.description) {
    const desc = document.createElement("div");
    desc.className = "muted description-block";
    desc.innerHTML = item.descriptionHtml || escapeHtml(item.description);
    el.appendChild(desc);
  }

  if (item.chunkText) {
    el.appendChild(createChunkBlock(item));
  }

  const highlights = createHighlightRows(item.toHighlight);
  if (highlights) {
    el.appendChild(highlights);
  }
  return el;
};

/**
 * Replace the results pane with the items of a vector search response.
 *
 * @param {{items?: object[]}} payload - Parsed `/api/vector-search` response.
 */
const renderVectorResults = (payload) => {
  resultsDiv.innerHTML = "";
  // Tolerate a null payload or a non-array `items` instead of throwing.
  const items = payload && Array.isArray(payload.items) ? payload.items : [];
  if (!items.length) {
    metaDiv.textContent = "No vector matches for this prompt.";
    return;
  }
  metaDiv.textContent = `Matches: ${items.length} (vector mode)`;
  items.forEach((item) => {
    resultsDiv.appendChild(createVectorResultItem(item));
  });
};

/** Search handler **/

// True while a request is outstanding; the Enter key bypasses the disabled
// button, so the handler needs its own guard against overlapping searches.
let vectorSearchPending = false;

/** Read the query box, call the vector search API and render the outcome. */
const runVectorSearch = async () => {
  if (vectorSearchPending) return;
  const query = queryInput.value.trim();
  if (!query) {
    alert("Please enter a query.");
    return;
  }
  vectorSearchPending = true;
  metaDiv.textContent = "Searching vector index…";
  resultsDiv.innerHTML = "";
  searchBtn.disabled = true;
  try {
    const res = await fetch("/api/vector-search", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ query }),
    });
    if (!res.ok) {
      throw new Error(`Vector search failed (${res.status})`);
    }
    const data = await res.json();
    if (data.error) {
      // Keep the user-facing text generic but leave a trace for debugging.
      console.error("Vector search error", data.error);
      metaDiv.textContent = "Vector search unavailable.";
      return;
    }
    renderVectorResults(data);
  } catch (err) {
    console.error(err);
    metaDiv.textContent = "Vector search unavailable.";
  } finally {
    searchBtn.disabled = false;
    vectorSearchPending = false;
  }
};

searchBtn.addEventListener("click", runVectorSearch);
// "keydown" replaces the deprecated "keypress" event.
queryInput.addEventListener("keydown", (event) => {
  if (event.key === "Enter") {
    runVectorSearch();
  }
});
def chunked(iterable: Iterable, size: int) -> Iterable[List]:
    """Yield successive lists of up to ``size`` items from ``iterable``.

    The last list may be shorter when the input length is not a multiple of
    ``size``; an empty input yields nothing.
    """
    chunk: List = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) >= size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def resolve_channels(channel_ids: Iterable[str]) -> Dict[str, str]:
    """Look up channel names in Elasticsearch for the given channel ids.

    Args:
        channel_ids: Channel ids to resolve; duplicates and empty values are
            ignored.

    Returns:
        Mapping of channel id to channel name for every id that has at least
        one indexed document carrying a ``channel_name``. Ids that cannot be
        resolved are absent from the mapping.
    """
    # dict.fromkeys de-duplicates while keeping the caller's order.
    ids = list(dict.fromkeys(cid for cid in channel_ids if cid))
    if not ids:
        # Nothing to look up, so do not create a client at all.
        return {}
    client = _ensure_client(CONFIG)
    body = {
        # One collapsed hit per channel. Without collapsing, a channel with
        # many videos can fill the whole result window and leave the other
        # requested channels unresolved.
        "size": len(ids),
        "_source": ["channel_id", "channel_name"],
        "query": {
            "bool": {
                "filter": [
                    {"terms": {"channel_id.keyword": ids}},
                    # Only consider documents that can actually supply a name.
                    {"exists": {"field": "channel_name"}},
                ]
            }
        },
        "collapse": {"field": "channel_id.keyword"},
    }
    response = client.search(index=CONFIG.elastic.index, body=body)
    resolved: Dict[str, str] = {}
    for hit in response.get("hits", {}).get("hits", []):
        source = hit.get("_source") or {}
        cid = source.get("channel_id")
        cname = source.get("channel_name")
        if cid and cname and cid not in resolved:
            resolved[cid] = cname
    return resolved


def upsert_channel_payload(
    qdrant_url: str,
    collection: str,
    channel_id: str,
    channel_name: str,
    *,
    dry_run: bool = False,
) -> bool:
    """Set channel_name/channel_title for all vectors with this channel_id.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        collection: Name of the collection to update.
        channel_id: Value of the ``channel_id`` payload key to match.
        channel_name: Name written to both ``channel_name`` and
            ``channel_title``.
        dry_run: When true, only log the intended update.

    Returns:
        True when the update was accepted (or skipped by ``dry_run``), False
        when the request failed for any reason.
    """
    payload = {"channel_name": channel_name, "channel_title": channel_name}
    body = {
        "payload": payload,
        "filter": {"must": [{"key": "channel_id", "match": {"value": channel_id}}]},
    }
    LOGGER.info("Updating channel_id=%s -> %s", channel_id, channel_name)
    if dry_run:
        return True
    try:
        resp = requests.post(
            f"{qdrant_url}/collections/{collection}/points/payload",
            json=body,
            timeout=120,
        )
    except requests.RequestException as exc:
        # Report transport failures like HTTP failures so that one bad
        # request does not abort the whole backfill run.
        LOGGER.error("Failed to update %s: %s", channel_id, exc)
        return False
    if resp.status_code >= 400:
        LOGGER.error("Failed to update %s: %s", channel_id, resp.text)
        return False
    return True
def scroll_missing_payloads(
    qdrant_url: str,
    collection: str,
    batch_size: int,
    *,
    max_points: Optional[int] = None,
    max_retries: int = 5,
) -> Iterable[List[Tuple[str, Dict[str, object]]]]:
    """Yield batches of (point_id, payload) missing channel names.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        collection: Name of the collection to scroll.
        batch_size: Initial page size; halved (down to 5) after an HTTP error.
        max_points: Stop once at least this many points have been yielded.
        max_retries: Consecutive transport errors tolerated per page before
            the error is re-raised.

    Yields:
        One list of ``(point_id, payload)`` tuples per page returned by the
        scroll endpoint.

    Raises:
        requests.HTTPError: A page still fails at the minimum batch size.
        requests.RequestException: Transport errors exceed ``max_retries``.
    """
    scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
    fetched = 0
    next_page = None
    while True:
        current_limit = batch_size
        transport_failures = 0
        while True:
            body: Dict[str, object] = {
                "limit": current_limit,
                "with_payload": True,
                "filter": {"must": [{"is_empty": {"key": "channel_name"}}]},
            }
            # Compare against None: a falsy offset such as point id 0 is
            # still a valid offset and must be sent.
            if next_page is not None:
                body["offset"] = next_page
            try:
                resp = requests.post(scroll_url, json=body, timeout=120)
                resp.raise_for_status()
                break
            except requests.HTTPError as exc:
                LOGGER.warning(
                    "Scroll request failed at limit=%s: %s", current_limit, exc
                )
                if current_limit <= 5:
                    raise
                current_limit = max(5, current_limit // 2)
                LOGGER.info("Reducing scroll batch size to %s", current_limit)
                time.sleep(2)
            except requests.RequestException as exc:
                # Bound the retries so a permanently unreachable server
                # surfaces as an error instead of looping forever.
                transport_failures += 1
                LOGGER.warning("Transient scroll error: %s", exc)
                if transport_failures > max_retries:
                    raise
                time.sleep(2)
        payload = resp.json().get("result", {})
        points = payload.get("points", [])
        if not points:
            break
        yield [(point.get("id"), point.get("payload") or {}) for point in points]
        fetched += len(points)
        if max_points and fetched >= max_points:
            break
        next_page = payload.get("next_page_offset")
        if next_page is None:
            break


def main() -> None:
    """Backfill missing channel names in Qdrant from Elasticsearch.

    Scrolls points without ``channel_name``, resolves their channel ids
    through ``resolve_channels`` and writes the names back with
    ``upsert_channel_payload``.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    parser = argparse.ArgumentParser(
        description="Backfill missing channel_name/channel_title in Qdrant payloads"
    )
    parser.add_argument("--batch-size", type=int, default=512)
    parser.add_argument(
        "--max-points",
        type=int,
        default=None,
        help="Limit processing to the first N points for testing",
    )
    # The module usage example passes --max-batches; accept it so that the
    # documented invocation is not rejected by argparse.
    parser.add_argument(
        "--max-batches",
        type=int,
        default=None,
        help="Stop after processing N scroll batches",
    )
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    q_url = CONFIG.qdrant_url
    collection = CONFIG.qdrant_collection
    total_updates = 0
    # One update covers every point of a channel, so remember finished and
    # unresolvable channels instead of repeating the lookup on each batch.
    updated: Set[str] = set()
    unresolved: Set[str] = set()

    batches = scroll_missing_payloads(
        q_url, collection, args.batch_size, max_points=args.max_points
    )
    for batch_number, batch in enumerate(batches, start=1):
        channel_ids: Set[str] = {
            str(payload["channel_id"])
            for _, payload in batch
            if payload.get("channel_id")
        }
        pending = channel_ids - updated - unresolved
        if pending:
            resolved = resolve_channels(pending)
            missing = pending.difference(resolved)
            if missing:
                LOGGER.warning("No channel names resolved for ids: %s", missing)
                unresolved.update(missing)
            for cid, name in resolved.items():
                # A failed update stays out of `updated`, so it is retried
                # the next time the channel shows up in a batch.
                if upsert_channel_payload(
                    q_url, collection, cid, name, dry_run=args.dry_run
                ):
                    updated.add(cid)
                    total_updates += 1
            LOGGER.info("Updated %s channel payloads so far", total_updates)
        if args.max_batches is not None and batch_number >= args.max_batches:
            LOGGER.info("Reached --max-batches=%s; stopping", args.max_batches)
            break

    LOGGER.info("Finished. Total channel updates attempted: %s", total_updates)


if __name__ == "__main__":
    main()