Add graph and vector search features

This commit is contained in:
knight 2025-11-09 14:24:50 -05:00
parent 14d37f23e4
commit 40d4f41f6e
12 changed files with 2983 additions and 273 deletions

31
AGENTS.md Normal file
View File

@ -0,0 +1,31 @@
# Repository Guidelines
## Project Structure & Module Organization
- Core modules live under `python_app/`: `config.py` centralizes settings, `transcript_collector.py` gathers transcripts, `ingest.py` handles Elasticsearch bulk loads, and `search_app.py` exposes the Flask UI.
- Static assets belong in `static/` (`index.html`, `frequency.html`, companion JS/CSS). Keep HTML here and wire it up through Flask routes.
- Runtime artifacts land in `data/` (`raw/` for downloads, `video_metadata/` for cleaned payloads). Preserve the JSON schema emitted by the collector.
- When adding utilities, place them in `python_app/` and use package-relative imports so scripts continue to run via `python -m`.
## Build, Test, and Development Commands
- `python -m venv .venv && source .venv/bin/activate`: bootstrap the virtualenv used by all scripts.
- `pip install -r requirements.txt`: install Flask, Elasticsearch tooling, Google API clients, dotenv support, `requests`, and `sentence-transformers` (required for vector search).
- `python -m python_app.transcript_collector --channel UC... --output data/raw`: fetch transcript JSON for a channel; rerun to refresh cached data.
- `python -m python_app.ingest --source data/video_metadata --index this_little_corner_py`: index prepared metadata and auto-create mappings when needed.
- `python -m python_app.search_app`: launch the Flask server on port 8080 for UI smoke tests.
## Coding Style & Naming Conventions
- Follow PEP 8 with 4-space indentation, `snake_case` for functions/modules, and `CamelCase` for classes; reserve UPPER_SNAKE_CASE for configuration constants.
- Keep Elasticsearch payload keys lower-case with underscores, and centralize shared values in `config.py` rather than scattering literals.
## Testing Guidelines
- No automated suite is committed yet; when adding coverage, create `tests/` modules using `pytest` with files named `test_*.py`.
- Focus tests on collector pagination, ingest transformations, and Flask route helpers, and run `python -m pytest` locally before opening a PR.
- Manually verify by ingesting a small sample into a local Elasticsearch node and checking facets, highlights, and transcript retrieval via the UI.
## Commit & Pull Request Guidelines
- Mirror the existing history: short, imperative commit subjects (e.g. “Fix results overflow”, “Add video reference tracking”).
- PRs should describe scope, list environment variables or indices touched, link issues, and attach before/after screenshots whenever UI output changes. Highlight Elasticsearch mapping or data migration impacts for both search and frontend reviewers.
## Configuration & Security Tips
- Load credentials through environment variables (`ELASTIC_URL`, `ELASTIC_USERNAME`, `ELASTIC_PASSWORD`, `ELASTIC_API_KEY`, `YOUTUBE_API_KEY`) or a `.env` file, and keep secrets out of version control.
- Adjust `ELASTIC_VERIFY_CERTS`, `ELASTIC_CA_CERT`, and `ELASTIC_DEBUG` only while debugging, and prefer branch-specific indices (`this_little_corner_py_<initials>`) to avoid clobbering shared data.

View File

@ -20,13 +20,13 @@ from typing import Optional
try: try:
from dotenv import load_dotenv from dotenv import load_dotenv
import logging import logging
_logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
_env_path = Path(__file__).parent / ".env" _env_path = Path(__file__).parent / ".env"
if _env_path.exists(): if _env_path.exists():
_logger.info(f"Loading .env from: {_env_path}") _logger.info("Loading .env from: %s", _env_path)
result = load_dotenv(_env_path, override=True) result = load_dotenv(_env_path, override=True)
_logger.info(f"load_dotenv result: {result}") _logger.info("load_dotenv result: %s", result)
except ImportError: except ImportError:
pass # python-dotenv not installed pass # python-dotenv not installed
@ -58,6 +58,11 @@ class AppConfig:
elastic: ElasticSettings elastic: ElasticSettings
data: DataSettings data: DataSettings
youtube: YoutubeSettings youtube: YoutubeSettings
qdrant_url: str
qdrant_collection: str
qdrant_vector_name: Optional[str]
qdrant_vector_size: int
qdrant_embed_model: str
def _env(name: str, default: Optional[str] = None) -> Optional[str]: def _env(name: str, default: Optional[str] = None) -> Optional[str]:
@ -89,7 +94,16 @@ def load_config() -> AppConfig:
) )
data = DataSettings(root=data_root) data = DataSettings(root=data_root)
youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY")) youtube = YoutubeSettings(api_key=_env("YOUTUBE_API_KEY"))
return AppConfig(elastic=elastic, data=data, youtube=youtube) return AppConfig(
elastic=elastic,
data=data,
youtube=youtube,
qdrant_url=_env("QDRANT_URL", "http://localhost:6333"),
qdrant_collection=_env("QDRANT_COLLECTION", "tlc_embeddings"),
qdrant_vector_name=_env("QDRANT_VECTOR_NAME"),
qdrant_vector_size=int(_env("QDRANT_VECTOR_SIZE", "1024")),
qdrant_embed_model=_env("QDRANT_EMBED_MODEL", "BAAI/bge-large-en-v1.5"),
)
CONFIG = load_config() CONFIG = load_config()

View File

@ -3,3 +3,5 @@ elasticsearch>=7.0.0,<9.0.0
youtube-transcript-api>=0.6 youtube-transcript-api>=0.6
google-api-python-client>=2.0.0 google-api-python-client>=2.0.0
python-dotenv>=0.19.0 python-dotenv>=0.19.0
requests>=2.31.0
sentence-transformers>=2.7.0

View File

@ -1,11 +1,15 @@
""" """
Flask application exposing a minimal search API backed by Elasticsearch. Flask application exposing search, graph, and transcript endpoints for TLC.
Routes: Routes:
GET / -> Static HTML search page. GET / -> static HTML search page.
GET /api/channels -> List available channels (via terms aggregation). GET /graph -> static reference graph UI.
GET /api/search -> Search index with pagination and simple highlighting. GET /vector-search -> experimental Qdrant vector search UI.
GET /api/transcript -> Return full transcript for a given video_id. GET /api/channels -> channels aggregation.
GET /api/search -> Elasticsearch keyword search.
POST /api/vector-search -> Qdrant vector similarity query.
GET /api/graph -> reference graph API.
GET /api/transcript -> transcript JSON payload.
""" """
from __future__ import annotations from __future__ import annotations
@ -15,13 +19,20 @@ import json
import logging import logging
import re import re
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from collections import Counter from collections import Counter, deque
from datetime import datetime from datetime import datetime
from flask import Flask, jsonify, request, send_from_directory from flask import Flask, jsonify, request, send_from_directory
import requests
try:
from sentence_transformers import SentenceTransformer # type: ignore
except ImportError: # pragma: no cover - optional dependency
SentenceTransformer = None
from .config import CONFIG, AppConfig from .config import CONFIG, AppConfig
try: try:
@ -32,6 +43,35 @@ except ImportError: # pragma: no cover - dependency optional
BadRequestError = Exception # type: ignore BadRequestError = Exception # type: ignore
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
_EMBED_MODEL = None
_EMBED_MODEL_NAME: Optional[str] = None
def _ensure_embedder(model_name: str) -> "SentenceTransformer":
    """Return the module-level embedding model, loading it when necessary.

    The model is kept in the ``_EMBED_MODEL`` / ``_EMBED_MODEL_NAME`` globals
    and is (re)loaded whenever nothing is cached yet or a different
    ``model_name`` is requested.

    Raises:
        RuntimeError: if ``sentence_transformers`` could not be imported.
    """
    global _EMBED_MODEL, _EMBED_MODEL_NAME
    if SentenceTransformer is None:  # pragma: no cover - optional dependency
        raise RuntimeError(
            "sentence-transformers is required for vector search. Install via pip install sentence-transformers."
        )

    already_loaded = _EMBED_MODEL is not None and _EMBED_MODEL_NAME == model_name
    if already_loaded:
        return _EMBED_MODEL

    LOGGER.info("Loading embedding model: %s", model_name)
    _EMBED_MODEL = SentenceTransformer(model_name)
    _EMBED_MODEL_NAME = model_name
    return _EMBED_MODEL
def embed_query(text: str, *, model_name: str, expected_dim: int) -> List[float]:
    """Encode ``text`` as a query embedding and return it as a list of floats.

    The text is sent to the embedder prefixed with ``"query: "`` and encoded
    with ``normalize_embeddings=True``.

    Raises:
        RuntimeError: if the embedder is unavailable (raised by
            ``_ensure_embedder``) or the vector length differs from
            ``expected_dim``.
    """
    model = _ensure_embedder(model_name)
    prompts = [f"query: {text}"]
    encoded = model.encode(
        prompts,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    vector = encoded[0].tolist()

    # Fail loudly on a size mismatch rather than returning a vector of the wrong length.
    if len(vector) != expected_dim:
        raise RuntimeError(
            f"Embedding dimension mismatch (expected {expected_dim}, got {len(vector)})"
        )
    return vector
def _ensure_client(config: AppConfig) -> "Elasticsearch": def _ensure_client(config: AppConfig) -> "Elasticsearch":
@ -428,6 +468,17 @@ def build_query_payload(
} }
} }
) )
should.append(
{
"match_phrase": {
"title": {
"query": query,
"slop": 0,
"boost": 50.0,
}
}
}
)
if use_fuzzy: if use_fuzzy:
should.append( should.append(
{ {
@ -513,15 +564,182 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
app = Flask(__name__, static_folder=str(Path(__file__).parent / "static")) app = Flask(__name__, static_folder=str(Path(__file__).parent / "static"))
client = _ensure_client(config) client = _ensure_client(config)
index = config.elastic.index index = config.elastic.index
qdrant_url = config.qdrant_url
qdrant_collection = config.qdrant_collection
qdrant_vector_name = config.qdrant_vector_name
qdrant_vector_size = config.qdrant_vector_size
qdrant_embed_model = config.qdrant_embed_model
@app.route("/") @app.route("/")
def index_page(): def index_page():
return send_from_directory(app.static_folder, "index.html") return send_from_directory(app.static_folder, "index.html")
@app.route("/graph")
def graph_page():
    """Serve ``graph.html`` from the app's static folder."""
    static_dir = app.static_folder
    return send_from_directory(static_dir, "graph.html")
@app.route("/vector-search")
def vector_search_page():
    """Serve ``vector.html`` from the app's static folder."""
    static_dir = app.static_folder
    return send_from_directory(static_dir, "vector.html")
@app.route("/static/<path:filename>") @app.route("/static/<path:filename>")
def static_files(filename: str): def static_files(filename: str):
return send_from_directory(app.static_folder, filename) return send_from_directory(app.static_folder, filename)
def normalize_reference_list(values: Any) -> List[str]:
    """Flatten a reference field into a list of non-empty id strings.

    ``values`` may be ``None``, a single value, or a list/tuple/set. Dict
    entries contribute their ``"video_id"`` (falling back to ``"id"``);
    everything else is stringified. Entries that are ``None``, blank after
    stripping, or the literal text ``none``/``null`` (any case) are dropped.
    Order is preserved and duplicates are kept.
    """
    if values is None:
        return []

    entries = values if isinstance(values, (list, tuple, set)) else [values]

    def _as_text(entry: Any) -> str:
        # Dicts carry the id under "video_id" or "id"; other values are the id.
        raw = (entry.get("video_id") or entry.get("id")) if isinstance(entry, dict) else entry
        return "" if raw is None else str(raw).strip()

    cleaned = (_as_text(entry) for entry in entries)
    return [text for text in cleaned if text and text.lower() not in {"none", "null"}]
def build_graph_payload(
    root_id: str, depth: int, max_nodes: int
) -> Dict[str, Any]:
    """Build the node/link payload for the reference graph around ``root_id``.

    Walks the index breadth-first starting at ``root_id``. Each visited
    document contributes ``"references"`` links (from its
    ``internal_references``) and ``"referenced_by"`` links (from its
    ``referenced_by``); neighbours are only expanded for nodes whose level is
    below ``depth``.

    ``max_nodes`` bounds the traversal (visited nodes plus queued ids). Link
    endpoints that were never visited are added afterwards so every link
    resolves to a node, which means the returned node count can exceed
    ``max_nodes``.

    Returns:
        A dict with ``root``, ``depth``, ``nodes``, ``links`` and ``meta``
        (``node_count`` / ``link_count``). ``nodes`` is empty when ``root_id``
        is blank or the root document cannot be loaded.
    """
    root_id = root_id.strip()
    if not root_id:
        return {"nodes": [], "links": [], "root": root_id, "depth": depth, "meta": {}}

    # Per-call cache; failed lookups are stored as None so they are not retried.
    doc_cache: Dict[str, Optional[Dict[str, Any]]] = {}

    def fetch_document(video_id: str) -> Optional[Dict[str, Any]]:
        """Return the ``_source`` for ``video_id`` or None if it cannot be loaded."""
        if video_id in doc_cache:
            return doc_cache[video_id]
        try:
            result = client.get(index=index, id=video_id)
            doc_cache[video_id] = result.get("_source")
        except Exception as exc:  # pragma: no cover - elasticsearch handles errors
            LOGGER.debug("Graph: failed to load %s: %s", video_id, exc)
            doc_cache[video_id] = None
        return doc_cache[video_id]

    def make_node(node_id: str, doc: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Build a node dict; a missing document yields a placeholder node."""
        source = doc or {}
        return {
            "id": node_id,
            "title": source.get("title") or node_id,
            "channel_id": source.get("channel_id"),
            "channel_name": source.get("channel_name") or source.get("channel_id") or "Unknown",
            "url": source.get("url"),
            "date": source.get("date"),
            "is_root": node_id == root_id,
        }

    nodes: Dict[str, Dict[str, Any]] = {}
    links: List[Dict[str, Any]] = []
    link_seen: Set[Tuple[str, str, str]] = set()

    def add_link(source_id: str, target_id: str, relation: str) -> None:
        """Record a directed link once per (source, target, relation)."""
        key = (source_id, target_id, relation)
        if key in link_seen:
            return
        link_seen.add(key)
        links.append({"source": source_id, "target": target_id, "relation": relation})

    queue: deque[Tuple[str, int]] = deque([(root_id, 0)])
    queued: Set[str] = {root_id}
    visited: Set[str] = set()

    while queue and len(nodes) < max_nodes:
        current_id, level = queue.popleft()
        queued.discard(current_id)
        if current_id in visited:
            continue

        doc = fetch_document(current_id)
        if doc is None:
            if current_id == root_id:
                # Without a root document there is no graph; the caller sees no nodes.
                break
            visited.add(current_id)
            continue

        visited.add(current_id)
        nodes[current_id] = make_node(current_id, doc)

        if level >= depth:
            continue

        neighbor_ids: List[str] = []
        for ref_id in normalize_reference_list(doc.get("internal_references")):
            if ref_id == current_id:
                continue
            add_link(current_id, ref_id, "references")
            neighbor_ids.append(ref_id)

        for ref_id in normalize_reference_list(doc.get("referenced_by")):
            if ref_id == current_id:
                continue
            add_link(ref_id, current_id, "referenced_by")
            neighbor_ids.append(ref_id)

        for neighbor in neighbor_ids:
            if neighbor in visited or neighbor in queued:
                continue
            if len(nodes) + len(queue) >= max_nodes:
                break
            queue.append((neighbor, level + 1))
            queued.add(neighbor)

    # Ensure nodes referenced by links exist in the payload. After this pass
    # every link endpoint is present, so no link needs to be filtered out.
    for link in links:
        for endpoint in ("source", "target"):
            node_id = link[endpoint]
            if node_id not in nodes:
                nodes[node_id] = make_node(node_id, fetch_document(node_id))

    return {
        "root": root_id,
        "depth": depth,
        "nodes": list(nodes.values()),
        "links": links,
        "meta": {
            "node_count": len(nodes),
            "link_count": len(links),
        },
    }
@app.route("/api/channels") @app.route("/api/channels")
def channels(): def channels():
base_channels_body = { base_channels_body = {
@ -580,23 +798,54 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
.get("channels", {}) .get("channels", {})
.get("buckets", []) .get("buckets", [])
) )
data = [ data = []
{ for bucket in buckets:
"Id": bucket.get("key"), key = bucket.get("key")
"Name": ( name_hit = (
bucket.get("name", {}) bucket.get("name", {})
.get("hits", {}) .get("hits", {})
.get("hits", [{}])[0] .get("hits", [{}])[0]
.get("_source", {}) .get("_source", {})
.get("channel_name", bucket.get("key")) .get("channel_name")
), )
"Count": bucket.get("doc_count", 0), display_name = name_hit or key or "Unknown"
} data.append(
for bucket in buckets {
] "Id": key,
"Name": display_name,
"Count": bucket.get("doc_count", 0),
}
)
data.sort(key=lambda item: item["Name"].lower()) data.sort(key=lambda item: item["Name"].lower())
return jsonify(data) return jsonify(data)
@app.route("/api/graph")
def graph_api():
    """Return the reference graph for ``video_id`` as JSON.

    Query parameters: ``video_id`` (required), ``depth`` (default 1, clamped
    to 0..3) and ``max_nodes`` (default 200, clamped to 10..400). Values that
    fail integer parsing fall back to their defaults.

    Responds 400 when ``video_id`` is missing and 404 when the payload has no
    nodes.
    """

    def _clamped_arg(name: str, fallback: int, low: int, high: int) -> int:
        # Parse an integer query argument, defaulting on bad input, then clamp it.
        try:
            parsed = int(request.args.get(name, str(fallback)))
        except ValueError:
            parsed = fallback
        return max(low, min(parsed, high))

    video_id = (request.args.get("video_id") or "").strip()
    if not video_id:
        return jsonify({"error": "video_id is required"}), 400

    depth = _clamped_arg("depth", 1, 0, 3)
    max_nodes = _clamped_arg("max_nodes", 200, 10, 400)

    payload = build_graph_payload(video_id, depth, max_nodes)
    if payload["nodes"]:
        payload["meta"]["max_nodes"] = max_nodes
        return jsonify(payload)

    not_found = {"error": f"Video '{video_id}' was not found in the index."}
    return jsonify(not_found), 404
@app.route("/api/years") @app.route("/api/years")
def years(): def years():
body = { body = {
@ -718,10 +967,13 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
for hit in hits.get("hits", []): for hit in hits.get("hits", []):
source = hit.get("_source", {}) source = hit.get("_source", {})
highlight_map = hit.get("highlight", {}) highlight_map = hit.get("highlight", {})
transcript_highlight = ( transcript_highlight = [
(highlight_map.get("transcript_full", []) or []) {"html": value, "source": "primary"}
+ (highlight_map.get("transcript_secondary_full", []) or []) for value in (highlight_map.get("transcript_full", []) or [])
) ] + [
{"html": value, "source": "secondary"}
for value in (highlight_map.get("transcript_secondary_full", []) or [])
]
title_html = ( title_html = (
highlight_map.get("title") highlight_map.get("title")
@ -741,6 +993,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
"description": source.get("description"), "description": source.get("description"),
"descriptionHtml": description_html, "descriptionHtml": description_html,
"date": source.get("date"), "date": source.get("date"),
"duration": source.get("duration"),
"url": source.get("url"), "url": source.get("url"),
"toHighlight": transcript_highlight, "toHighlight": transcript_highlight,
"highlightSource": { "highlightSource": {
@ -751,6 +1004,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
"internal_references": source.get("internal_references", []), "internal_references": source.get("internal_references", []),
"referenced_by_count": source.get("referenced_by_count", 0), "referenced_by_count": source.get("referenced_by_count", 0),
"referenced_by": source.get("referenced_by", []), "referenced_by": source.get("referenced_by", []),
"video_status": source.get("video_status"),
} }
) )
@ -877,7 +1131,15 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
"field": "channel_id.keyword", "field": "channel_id.keyword",
"size": channel_terms_size, "size": channel_terms_size,
"order": {"_count": "desc"}, "order": {"_count": "desc"},
} },
"aggs": {
"channel_name_hit": {
"top_hits": {
"size": 1,
"_source": {"includes": ["channel_name"]},
}
}
},
} }
}, },
} }
@ -916,7 +1178,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
.get("buckets", []) .get("buckets", [])
) )
channel_totals: Dict[str, int] = {} channel_totals: Dict[str, Dict[str, Any]] = {}
buckets: List[Dict[str, Any]] = [] buckets: List[Dict[str, Any]] = []
for bucket in raw_buckets: for bucket in raw_buckets:
date_str = bucket.get("key_as_string") date_str = bucket.get("key_as_string")
@ -926,14 +1188,28 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
cid = ch_bucket.get("key") cid = ch_bucket.get("key")
count = ch_bucket.get("doc_count", 0) count = ch_bucket.get("doc_count", 0)
if cid: if cid:
channel_entries.append({"id": cid, "count": count}) hit_source = (
channel_totals[cid] = channel_totals.get(cid, 0) + count ch_bucket.get("channel_name_hit", {})
.get("hits", {})
.get("hits", [{}])[0]
.get("_source", {})
)
channel_name = hit_source.get("channel_name") if isinstance(hit_source, dict) else None
channel_entries.append({"id": cid, "count": count, "name": channel_name})
if cid not in channel_totals:
channel_totals[cid] = {"total": 0, "name": channel_name}
channel_totals[cid]["total"] += count
if channel_name and not channel_totals[cid].get("name"):
channel_totals[cid]["name"] = channel_name
buckets.append( buckets.append(
{"date": date_str, "total": total, "channels": channel_entries} {"date": date_str, "total": total, "channels": channel_entries}
) )
ranked_channels = sorted( ranked_channels = sorted(
[{"id": cid, "total": total} for cid, total in channel_totals.items()], [
{"id": cid, "total": info.get("total", 0), "name": info.get("name")}
for cid, info in channel_totals.items()
],
key=lambda item: item["total"], key=lambda item: item["total"],
reverse=True, reverse=True,
) )
@ -953,6 +1229,145 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
def frequency_page(): def frequency_page():
return send_from_directory(app.static_folder, "frequency.html") return send_from_directory(app.static_folder, "frequency.html")
@app.route("/api/vector-search", methods=["POST"])
def api_vector_search():
    """Run a Qdrant vector similarity search for the posted query.

    Expects a JSON object with ``query`` (text), optional ``filters``
    (forwarded to Qdrant as ``filter`` when it is a non-empty object),
    ``size`` (default 10, minimum 1) and ``offset`` (default 0, minimum 0).
    Malformed ``size``/``offset`` values fall back to their defaults, matching
    how ``/api/graph`` treats bad integers, and a non-object body or
    non-string ``query`` is treated as an empty query.

    Responses:
        200 with ``items``, ``totalResults`` and ``offset`` (plus
        ``error: "empty_query"`` when no query text was given);
        500 ``embedding_unavailable`` when the query cannot be embedded;
        502 ``vector_search_unavailable`` when the Qdrant request fails.
    """

    def _int_field(body: Dict[str, Any], key: str, default: int, minimum: int) -> int:
        # Client-supplied numbers may be strings, null or nonsense; never 500 on them.
        try:
            value = int(body.get(key, default))
        except (TypeError, ValueError, OverflowError):
            value = default
        return max(value, minimum)

    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        body = {}

    raw_query = body.get("query")
    query_text = raw_query.strip() if isinstance(raw_query, str) else ""
    filters = body.get("filters")
    limit = _int_field(body, "size", 10, 1)
    offset = _int_field(body, "offset", 0, 0)

    if not query_text:
        return jsonify(
            {"items": [], "totalResults": 0, "offset": offset, "error": "empty_query"}
        )

    try:
        query_vector = embed_query(
            query_text, model_name=qdrant_embed_model, expected_dim=qdrant_vector_size
        )
    except Exception as exc:  # pragma: no cover - runtime dependency
        LOGGER.error("Embedding failed: %s", exc, exc_info=config.elastic.debug)
        return jsonify({"error": "embedding_unavailable"}), 500

    # With a configured vector name the vector is sent as a {name: vector}
    # mapping; otherwise the bare vector is sent.
    qdrant_vector_payload: Any
    if qdrant_vector_name:
        qdrant_vector_payload = {qdrant_vector_name: query_vector}
    else:
        qdrant_vector_payload = query_vector

    qdrant_body: Dict[str, Any] = {
        "vector": qdrant_vector_payload,
        "limit": limit,
        "offset": offset,
        "with_payload": True,
        "with_vectors": False,
    }
    if isinstance(filters, dict) and filters:
        qdrant_body["filter"] = filters

    try:
        response = requests.post(
            f"{qdrant_url}/collections/{qdrant_collection}/points/search",
            json=qdrant_body,
            timeout=20,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        LOGGER.error("Vector search failed: %s", exc, exc_info=config.elastic.debug)
        return jsonify({"error": "vector_search_unavailable"}), 502

    points = (data.get("result") or []) if isinstance(data, dict) else []
    items: List[Dict[str, Any]] = []
    missing_channel_ids: Set[str] = set()

    for point in points:
        if not isinstance(point, dict):
            continue
        point_payload = point.get("payload", {}) or {}

        highlight_entries: List[Dict[str, str]] = []
        for entry in point_payload.get("highlights") or []:
            if isinstance(entry, dict):
                html_value = entry.get("html") or entry.get("text")
            else:
                html_value = str(entry)
            if not html_value:
                continue
            highlight_entries.append({"html": html_value, "source": "primary"})

        channel_label = (
            point_payload.get("channel_name")
            or point_payload.get("channel_title")
            or point_payload.get("channel_id")
        )

        items.append(
            {
                "video_id": point_payload.get("video_id"),
                "channel_id": point_payload.get("channel_id"),
                "channel_name": channel_label,
                "title": point_payload.get("title"),
                "titleHtml": point_payload.get("title"),
                "description": point_payload.get("description"),
                "descriptionHtml": point_payload.get("description"),
                "date": point_payload.get("date"),
                "url": point_payload.get("url"),
                "chunkText": point_payload.get("text")
                or point_payload.get("chunk_text")
                or point_payload.get("chunk")
                or point_payload.get("content"),
                "chunkTimestamp": point_payload.get("timestamp")
                or point_payload.get("start_seconds")
                or point_payload.get("start"),
                "toHighlight": highlight_entries,
                "highlightSource": {
                    "primary": bool(highlight_entries),
                    "secondary": False,
                },
                "distance": point.get("score"),
                "internal_references_count": point_payload.get("internal_references_count", 0),
                "internal_references": point_payload.get("internal_references", []),
                "referenced_by_count": point_payload.get("referenced_by_count", 0),
                "referenced_by": point_payload.get("referenced_by", []),
                "video_status": point_payload.get("video_status"),
                "duration": point_payload.get("duration"),
            }
        )

        if (not channel_label) and point_payload.get("channel_id"):
            missing_channel_ids.add(str(point_payload.get("channel_id")))

    if missing_channel_ids:
        # Best-effort enrichment: a failed lookup only leaves names blank.
        # NOTE(review): the hit budget is 2x the number of ids, so a channel
        # can still be missed if other channels fill the hits — confirm this
        # is acceptable or switch to an aggregation.
        try:
            es_lookup = client.search(
                index=index,
                body={
                    "size": len(missing_channel_ids) * 2,
                    "_source": ["channel_id", "channel_name"],
                    "query": {"terms": {"channel_id.keyword": list(missing_channel_ids)}},
                },
            )
            hits = es_lookup.get("hits", {}).get("hits", [])
            channel_lookup: Dict[Any, Any] = {}
            for hit in hits:
                src = hit.get("_source", {}) or {}
                cid = src.get("channel_id")
                cname = src.get("channel_name")
                if cid and cname and cid not in channel_lookup:
                    channel_lookup[cid] = cname
            for item in items:
                if not item.get("channel_name"):
                    cid = item.get("channel_id")
                    if cid and cid in channel_lookup:
                        item["channel_name"] = channel_lookup[cid]
        except Exception as exc:
            LOGGER.debug("Vector channel lookup failed: %s", exc)

    return jsonify(
        {
            "items": items,
            "totalResults": len(items),
            "offset": offset,
        }
    )
@app.route("/api/transcript") @app.route("/api/transcript")
def transcript(): def transcript():
video_id = request.args.get("video_id", type=str) video_id = request.args.get("video_id", type=str)

View File

@ -32,9 +32,7 @@
let qs = new URLSearchParams(window.location.search); let qs = new URLSearchParams(window.location.search);
const qInput = document.getElementById("q"); const qInput = document.getElementById("q");
const channelDropdown = document.getElementById("channelDropdown"); const channelSelect = document.getElementById("channel");
const channelSummary = document.getElementById("channelSummary");
const channelOptions = document.getElementById("channelOptions");
const yearSel = document.getElementById("year"); const yearSel = document.getElementById("year");
const sortSel = document.getElementById("sort"); const sortSel = document.getElementById("sort");
const sizeSel = document.getElementById("size"); const sizeSel = document.getElementById("size");
@ -43,6 +41,9 @@
const phraseToggle = document.getElementById("phraseToggle"); const phraseToggle = document.getElementById("phraseToggle");
const queryToggle = document.getElementById("queryStringToggle"); const queryToggle = document.getElementById("queryStringToggle");
const searchBtn = document.getElementById("searchBtn"); const searchBtn = document.getElementById("searchBtn");
const aboutBtn = document.getElementById("aboutBtn");
const aboutPanel = document.getElementById("aboutPanel");
const aboutCloseBtn = document.getElementById("aboutCloseBtn");
const resultsDiv = document.getElementById("results"); const resultsDiv = document.getElementById("results");
const metaDiv = document.getElementById("meta"); const metaDiv = document.getElementById("meta");
const metricsContainer = document.getElementById("metrics"); const metricsContainer = document.getElementById("metrics");
@ -50,17 +51,27 @@
const metricsContent = document.getElementById("metricsContent"); const metricsContent = document.getElementById("metricsContent");
const freqSummary = document.getElementById("frequencySummary"); const freqSummary = document.getElementById("frequencySummary");
const freqChart = document.getElementById("frequencyChart"); const freqChart = document.getElementById("frequencyChart");
const graphOverlay = document.getElementById("graphModalOverlay");
const graphModalClose = document.getElementById("graphModalClose");
const channelMap = new Map(); const channelMap = new Map();
const selectedChannels = new Set(); const transcriptCache = new Map();
let pendingChannelSelection = []; let lastFocusBeforeModal = null;
let pendingChannelSelection = "";
let channelsReady = false; let channelsReady = false;
let suppressChannelChange = false;
let allChannelsCheckbox = null;
let previousToggleState = { exact: true, fuzzy: true, phrase: true }; let previousToggleState = { exact: true, fuzzy: true, phrase: true };
let currentPage = let currentPage =
parseInt(qs.get("page") || "0", 10) || parseInt(qs.get("page") || "0", 10) ||
0; 0;
/**
 * Show or hide the about panel by toggling its `hidden` attribute.
 * Does nothing when the panel element is not on the page.
 *
 * @param {boolean} show - truthy to reveal the panel, falsy to hide it.
 */
function toggleAboutPanel(show) {
  if (aboutPanel) {
    if (!show) {
      aboutPanel.setAttribute("hidden", "hidden");
      return;
    }
    aboutPanel.removeAttribute("hidden");
  }
}
function parseBoolParam(name, defaultValue) { function parseBoolParam(name, defaultValue) {
const raw = qs.get(name); const raw = qs.get(name);
if (raw === null) return defaultValue; if (raw === null) return defaultValue;
@ -68,9 +79,8 @@
return !["0", "false", "no"].includes(lowered); return !["0", "false", "no"].includes(lowered);
} }
function parseChannelParams(params) { function parseChannelParam(params) {
const collected = []; if (!params) return "";
if (!params) return collected;
const seen = new Set(); const seen = new Set();
const rawValues = params.getAll("channel_id"); const rawValues = params.getAll("channel_id");
const legacy = params.get("channel"); const legacy = params.get("channel");
@ -84,61 +94,17 @@
.forEach((part) => { .forEach((part) => {
if (!seen.has(part)) { if (!seen.has(part)) {
seen.add(part); seen.add(part);
collected.push(part);
} }
}); });
}); });
return collected; const first = Array.from(seen)[0];
return first || "";
} }
function getSelectedChannels() { function getSelectedChannels() {
return Array.from(selectedChannels); if (!channelSelect) return [];
} const value = channelSelect.value;
return value ? [value] : [];
function ensureAllCheckboxState() {
if (allChannelsCheckbox) {
allChannelsCheckbox.checked = selectedChannels.size === 0;
}
}
function updateChannelSummary() {
if (!channelSummary) return;
if (!selectedChannels.size) {
channelSummary.textContent = "All Channels";
return;
}
const names = Array.from(selectedChannels).map(
(id) => channelMap.get(id) || id
);
if (names.length > 1) {
names.sort((a, b) => a.localeCompare(b, undefined, { sensitivity: "base" }));
}
let label = names.slice(0, 3).join(", ");
if (names.length > 3) {
label += ` +${names.length - 3} more`;
}
channelSummary.textContent = label;
}
function applyChannelSelection(ids, { silent = false } = {}) {
selectedChannels.clear();
ids.forEach((id) => selectedChannels.add(id));
pendingChannelSelection = getSelectedChannels();
ensureAllCheckboxState();
if (channelOptions) {
suppressChannelChange = true;
const checkboxes = channelOptions.querySelectorAll(
'input[type="checkbox"][data-channel="1"]'
);
checkboxes.forEach((checkbox) => {
checkbox.checked = selectedChannels.has(checkbox.value);
});
suppressChannelChange = false;
}
updateChannelSummary();
if (!silent && channelsReady) {
runSearch(0);
}
} }
async function loadYears() { async function loadYears() {
@ -166,8 +132,10 @@
yearSel.value = qs.get("year") || ""; yearSel.value = qs.get("year") || "";
sortSel.value = qs.get("sort") || "relevant"; sortSel.value = qs.get("sort") || "relevant";
sizeSel.value = qs.get("size") || "10"; sizeSel.value = qs.get("size") || "10";
pendingChannelSelection = parseChannelParams(qs); pendingChannelSelection = parseChannelParam(qs);
applyChannelSelection(pendingChannelSelection, { silent: true }); if (channelSelect) {
channelSelect.value = pendingChannelSelection || "";
}
exactToggle.checked = parseBoolParam("exact", true); exactToggle.checked = parseBoolParam("exact", true);
fuzzyToggle.checked = parseBoolParam("fuzzy", true); fuzzyToggle.checked = parseBoolParam("fuzzy", true);
phraseToggle.checked = parseBoolParam("phrase", true); phraseToggle.checked = parseBoolParam("phrase", true);
@ -212,6 +180,76 @@
} }
} }
/**
 * Report whether `window.GraphUI` exists and has a truthy `ready` flag.
 *
 * @returns {boolean}
 */
function graphUiAvailable() {
  const ui = window.GraphUI;
  return Boolean(ui && ui.ready);
}
/**
 * Open the graph modal overlay and, when a video id is given, load its graph.
 * Returns without doing anything if the overlay element is missing or
 * GraphUI is not available.
 *
 * @param {string} [videoId] - id copied into the "graphVideoId" input and passed to GraphUI.load.
 */
function openGraphModal(videoId) {
if (!graphOverlay || !graphUiAvailable()) {
return;
}
// Remember the focused element so closeGraphModal can restore focus to it.
lastFocusBeforeModal =
document.activeElement instanceof HTMLElement ? document.activeElement : null;
graphOverlay.classList.add("active");
graphOverlay.setAttribute("aria-hidden", "false");
document.body.classList.add("modal-open");
// GraphUI is configured in the next animation frame, after the overlay classes are applied.
window.requestAnimationFrame(() => {
// Reset the graph controls to fixed values each time the modal opens.
window.GraphUI.setDepth(1);
window.GraphUI.setMaxNodes(200);
window.GraphUI.setLabelSize("tiny");
const graphVideoField = document.getElementById("graphVideoId");
if (videoId && graphVideoField) {
graphVideoField.value = videoId;
}
if (videoId) {
window.GraphUI.load(videoId, undefined, undefined, { updateInputs: true });
}
window.GraphUI.focusInput();
});
}
/**
 * Close the graph modal overlay: clear its active state, stop GraphUI when
 * it is available, and return focus to the element remembered by
 * openGraphModal. Does nothing when the overlay element is missing.
 */
function closeGraphModal() {
  if (!graphOverlay) return;

  const overlay = graphOverlay;
  overlay.classList.remove("active");
  overlay.setAttribute("aria-hidden", "true");
  document.body.classList.remove("modal-open");

  if (graphUiAvailable()) window.GraphUI.stop();

  // Restore focus before clearing the remembered element.
  const restoreTarget = lastFocusBeforeModal;
  if (restoreTarget && typeof restoreTarget.focus === "function") {
    restoreTarget.focus();
  }
  lastFocusBeforeModal = null;
}
// Wire up the ways the graph modal can be dismissed; each element is optional.
if (graphModalClose) {
graphModalClose.addEventListener("click", closeGraphModal);
}
if (graphOverlay) {
graphOverlay.addEventListener("click", (event) => {
// Only clicks on the overlay element itself close the modal, not clicks on its descendants.
if (event.target === graphOverlay) {
closeGraphModal();
}
});
}
// Escape closes the modal, but only while the overlay is marked active.
document.addEventListener("keydown", (event) => {
if (event.key === "Escape" && graphOverlay && graphOverlay.classList.contains("active")) {
closeGraphModal();
}
});
// On "graph-ui-ready", enable every launch button that was flagged as waiting for it.
window.addEventListener("graph-ui-ready", () => {
document
.querySelectorAll('.graph-launch-btn[data-await-graph-ready="1"]')
.forEach((btn) => {
btn.removeAttribute("disabled");
btn.removeAttribute("data-await-graph-ready");
btn.title = "Open reference graph";
});
});
function ensureQueryStringMode() { function ensureQueryStringMode() {
if (!queryToggle) return; if (!queryToggle) return;
if (!queryToggle.checked) { if (!queryToggle.checked) {
@ -242,60 +280,8 @@
return `${field}:(${escaped.join(" OR ")})`; return `${field}:(${escaped.join(" OR ")})`;
} }
// Delegated "change" handler for the channel checkbox list.
// Two kinds of checkbox are handled: the "All Channels" box (data-all="1")
// and the per-channel boxes (data-channel="1", value = channel id).
if (channelOptions) {
channelOptions.addEventListener("change", (event) => {
const target = event.target;
// Ignore change events from anything that is not a checkbox input.
if (!(target instanceof HTMLInputElement) || target.type !== "checkbox") {
return;
}
// The flag is raised around the programmatic checkbox writes below;
// while it is set this handler does nothing.
if (suppressChannelChange) {
return;
}
if (target.dataset.all === "1") {
// Unchecking "All" while no channel is selected would leave nothing
// selected, so the box is immediately re-checked.
if (!target.checked && !selectedChannels.size) {
suppressChannelChange = true;
target.checked = true;
suppressChannelChange = false;
return;
}
// Checking "All" clears every individual selection and re-runs the search.
if (target.checked) {
selectedChannels.clear();
pendingChannelSelection = [];
suppressChannelChange = true;
const others = channelOptions.querySelectorAll(
'input[type="checkbox"][data-channel="1"]'
);
others.forEach((checkbox) => {
checkbox.checked = false;
});
suppressChannelChange = false;
ensureAllCheckboxState();
updateChannelSummary();
// Only search once the channel list has finished loading.
if (channelsReady) {
runSearch(0);
}
}
return;
}
// Per-channel checkbox: mirror its checked state into selectedChannels.
const id = target.value;
if (!id) return;
if (target.checked) {
selectedChannels.add(id);
} else {
selectedChannels.delete(id);
}
pendingChannelSelection = getSelectedChannels();
ensureAllCheckboxState();
updateChannelSummary();
if (channelsReady) {
runSearch(0);
}
});
}
async function loadChannels() { async function loadChannels() {
if (!channelOptions) { if (!channelSelect) {
channelsReady = true; channelsReady = true;
return; return;
} }
@ -303,57 +289,27 @@
const res = await fetch("/api/channels"); const res = await fetch("/api/channels");
const data = await res.json(); const data = await res.json();
channelMap.clear(); channelMap.clear();
channelOptions.innerHTML = ""; channelSelect.innerHTML = '<option value="">All Channels</option>';
const listFragment = document.createDocumentFragment();
const allLabel = document.createElement("label");
allLabel.className = "channel-option";
allChannelsCheckbox = document.createElement("input");
allChannelsCheckbox.type = "checkbox";
allChannelsCheckbox.dataset.all = "1";
allChannelsCheckbox.checked = selectedChannels.size === 0;
const allText = document.createElement("span");
allText.textContent = "All Channels";
allLabel.appendChild(allChannelsCheckbox);
allLabel.appendChild(allText);
listFragment.appendChild(allLabel);
data.forEach((item) => { data.forEach((item) => {
const label = document.createElement("label"); const option = document.createElement("option");
label.className = "channel-option"; option.value = item.Id;
const checkbox = document.createElement("input"); option.textContent = `${item.Name} (${item.Count})`;
checkbox.type = "checkbox"; channelSelect.appendChild(option);
checkbox.value = item.Id;
checkbox.dataset.channel = "1";
const text = document.createElement("span");
text.textContent = `${item.Name} (${item.Count})`;
label.appendChild(checkbox);
label.appendChild(text);
listFragment.appendChild(label);
channelMap.set(item.Id, item.Name); channelMap.set(item.Id, item.Name);
}); });
channelOptions.appendChild(listFragment); if (pendingChannelSelection && channelMap.has(pendingChannelSelection)) {
channelSelect.value = pendingChannelSelection;
if (!data.length) { } else {
const empty = document.createElement("div"); channelSelect.value = "";
empty.textContent = "No channels available.";
channelOptions.appendChild(empty);
} }
const initialSelection = pendingChannelSelection.length
? pendingChannelSelection
: Array.from(selectedChannels);
applyChannelSelection(initialSelection, { silent: true });
channelsReady = true; channelsReady = true;
updateChannelSummary();
} catch (err) { } catch (err) {
console.error("Failed to load channels", err); console.error("Failed to load channels", err);
channelOptions.innerHTML = "<div>Failed to load channels.</div>"; channelSelect.innerHTML = '<option value="">All Channels</option>';
channelsReady = true; channelsReady = true;
ensureAllCheckboxState();
updateChannelSummary();
} }
} }
@ -391,6 +347,188 @@
return n; return n;
} }
/**
 * Fetch the transcript payload for a video, memoised in transcriptCache.
 *
 * @param {string} videoId - Video whose transcript is wanted.
 * @returns {Promise<*>} The parsed JSON payload, or null when videoId is falsy.
 * @throws {Error} When the server answers with a non-2xx status.
 */
async function getTranscriptData(videoId) {
  if (!videoId) return null;

  // Serve repeat requests from the in-memory cache.
  const alreadyCached = transcriptCache.has(videoId);
  if (alreadyCached) {
    return transcriptCache.get(videoId);
  }

  const endpoint = `/api/transcript?video_id=${encodeURIComponent(videoId)}`;
  const response = await fetch(endpoint);
  if (!response.ok) {
    throw new Error(`Transcript fetch failed (${response.status})`);
  }

  // Only successful responses are cached; failures are retried next call.
  const payload = await response.json();
  transcriptCache.set(videoId, payload);
  return payload;
}
/**
 * Format a date as MLA style "D Mon. YYYY" (e.g. "15 Jan. 2024").
 *
 * @param {string|number|Date} value - Anything the Date constructor accepts.
 * @returns {*} "" for falsy input, the input unchanged when it cannot be
 *   parsed, otherwise the formatted string.
 */
function formatMlaDate(value) {
  if (!value) return "";
  const parsed = new Date(value);
  if (Number.isNaN(parsed.valueOf())) {
    return value;
  }
  const months = [
    "Jan.", "Feb.", "Mar.", "Apr.", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
  ];
  // A date-only ISO string ("YYYY-MM-DD") is parsed as UTC midnight. Reading
  // it back with local getters shifts it to the previous day in any timezone
  // west of UTC, so such values are read back in UTC. Everything else
  // (Date objects, date-times) keeps the local-time reading.
  const isDateOnly = typeof value === "string" && /^\d{4}-\d{2}-\d{2}$/.test(value);
  const day = isDateOnly ? parsed.getUTCDate() : parsed.getDate();
  const month = isDateOnly ? parsed.getUTCMonth() : parsed.getMonth();
  const year = isDateOnly ? parsed.getUTCFullYear() : parsed.getFullYear();
  return `${day} ${months[month]} ${year}`;
}
/**
 * Build an MLA-style citation string for a search result.
 *
 * @param {Object} item - Result with optional channel_name, channel_id,
 *   title, url and date fields.
 * @returns {string} Citation ending with today's access date.
 */
function buildMlaCitation(item) {
  const channel = (item.channel_name || item.channel_id || "Unknown channel").trim();
  const title = (item.title || "Untitled").trim();
  const url = item.url || "";
  const publishDate = formatMlaDate(item.date) || "n.d.";
  // Pass the Date object itself. Going through toISOString() yields the UTC
  // calendar date, which is then re-read in local time and can land on the
  // wrong day.
  const accessed = formatMlaDate(new Date());
  // Leave the URL segment out entirely when there is none, so the citation
  // does not contain a dangling ", .".
  const location = url ? `${publishDate}, ${url}.` : `${publishDate}.`;
  return `${channel}. "${title}." YouTube, uploaded by ${channel}, ${location} Accessed ${accessed}.`;
}
/**
 * Produce a display timestamp for one transcript segment.
 *
 * A ready-made `timestamp` wins. Otherwise the first of start_seconds,
 * start, offset, time that is present and numeric is run through
 * formatTimestamp.
 *
 * @param {Object|null} segment
 * @returns {*} The timestamp, or "" when none can be derived.
 */
function formatSegmentTimestamp(segment) {
  if (!segment) return "";
  if (segment.timestamp) return segment.timestamp;

  const startFields = [
    segment.start_seconds,
    segment.start,
    segment.offset,
    segment.time,
  ];
  const firstNumeric = startFields.find(
    (candidate) => candidate != null && !Number.isNaN(parseFloat(candidate))
  );
  return firstNumeric === undefined ? "" : formatTimestamp(parseFloat(firstNumeric));
}
/**
 * Render one transcript as plain text under a heading.
 *
 * Non-blank `fullText` is used as-is (trimmed). Otherwise each entry of
 * `parts` becomes a line, prefixed with "[timestamp] " when one is available.
 *
 * @param {string} label - Heading line.
 * @param {Array|null} parts - Transcript segments.
 * @param {string|null} fullText - Pre-joined transcript text.
 * @returns {string} "label\ncontent\n", or "" when there is no content.
 */
function serializeTranscriptSection(label, parts, fullText) {
  const renderSegment = (segment) => {
    const stamp = formatSegmentTimestamp(segment);
    const spoken = segment && segment.text ? segment.text : "";
    return stamp ? `[${stamp}] ${spoken}` : spoken;
  };

  let body = "";
  if (typeof fullText === "string" && fullText.trim()) {
    body = fullText.trim();
  } else if (Array.isArray(parts) && parts.length) {
    body = parts.map(renderSegment).join("\n").trim();
  }

  return body ? `${label}\n${body}\n` : "";
}
/**
 * Assemble the text file offered by "Download transcript".
 *
 * Layout: a metadata header (title, then channel, publish date and URL when
 * present), a blank line, then the primary and secondary transcript sections,
 * or a placeholder line when neither has content.
 *
 * @param {Object} item - Search result the transcript belongs to.
 * @param {Object} transcriptData - Payload with transcript_parts/_full and
 *   transcript_secondary_parts/_full fields.
 * @returns {string} File contents, always ending with a single newline.
 */
function buildTranscriptDownloadText(item, transcriptData) {
  const header = [`Title: ${item.title || "Untitled"}`];
  if (item.channel_name) header.push(`Channel: ${item.channel_name}`);
  if (item.date) header.push(`Published: ${item.date}`);
  if (item.url) header.push(`URL: ${item.url}`);

  const primarySection = serializeTranscriptSection(
    "Primary Transcript",
    transcriptData.transcript_parts,
    transcriptData.transcript_full
  );
  const secondarySection = serializeTranscriptSection(
    "Secondary Transcript",
    transcriptData.transcript_secondary_parts,
    transcriptData.transcript_secondary_full
  );

  // Empty sections serialize to "", so filtering on truthiness drops them.
  const sections = [primarySection, secondarySection].filter(Boolean);
  const body = sections.length ? sections : ["No transcript available."];

  return [...header, "", ...body].join("\n").trim() + "\n";
}
/**
 * Temporarily replace a button's label with a short status message.
 *
 * The label to return to is stored in data-original-label the first time, so
 * a flash started while another is still showing restores the real label
 * rather than the previous message.
 *
 * @param {HTMLElement|null} button - Button to update; ignored when falsy.
 * @param {string} message - Text to show.
 * @param {number} [duration=1800] - Milliseconds before the label is restored.
 */
function flashButtonMessage(button, message, duration = 1800) {
  if (!button) return;

  const restingLabel = button.dataset.originalLabel || button.textContent;
  button.dataset.originalLabel = restingLabel;
  button.textContent = message;

  const restoreLabel = () => {
    button.textContent = button.dataset.originalLabel || restingLabel;
  };
  setTimeout(restoreLabel, duration);
}
/**
 * Download a result's transcript as "<video_id>.txt".
 *
 * The button is disabled while the transcript is fetched and re-enabled
 * afterwards. Any failure is logged once and reported with an alert.
 *
 * @param {Object} item - Search result; nothing happens without item.video_id.
 * @param {HTMLButtonElement|null} button - Button that triggered the download.
 * @returns {Promise<void>}
 */
async function handleTranscriptDownload(item, button) {
  if (!item.video_id) return;
  if (button) button.disabled = true;
  try {
    const data = await getTranscriptData(item.video_id);
    if (!data) {
      throw new Error("Transcript unavailable");
    }
    const text = buildTranscriptDownloadText(item, data);
    const blob = new Blob([text], { type: "text/plain" });
    const url = URL.createObjectURL(blob);
    const link = document.createElement("a");
    link.href = url;
    link.download = `${item.video_id}.txt`;
    document.body.appendChild(link);
    try {
      link.click();
    } finally {
      // Always detach the temporary link and release the blob URL, even if
      // click() throws.
      document.body.removeChild(link);
      URL.revokeObjectURL(url);
    }
    flashButtonMessage(button, "Downloaded");
  } catch (err) {
    console.error("Download failed", err);
    alert("Unable to download transcript right now.");
  } finally {
    if (button) button.disabled = false;
  }
}
/**
 * Copy a result's MLA citation to the clipboard.
 *
 * Uses the async Clipboard API when it is present and the page is a secure
 * context; otherwise falls back to a hidden textarea and
 * document.execCommand("copy"). If copying fails either way, the citation is
 * shown in an alert so it can be copied by hand.
 *
 * @param {Object} item - Search result to cite.
 * @param {HTMLButtonElement|null} button - Button that receives the "Copied!" flash.
 * @returns {Promise<void>}
 */
async function handleCopyCitation(item, button) {
  const citation = buildMlaCitation(item);
  try {
    if (navigator.clipboard && window.isSecureContext) {
      await navigator.clipboard.writeText(citation);
    } else {
      const textarea = document.createElement("textarea");
      textarea.value = citation;
      textarea.style.position = "fixed";
      textarea.style.opacity = "0";
      document.body.appendChild(textarea);
      let copied = false;
      try {
        textarea.focus();
        textarea.select();
        // execCommand reports failure through its return value, not by throwing.
        copied = document.execCommand("copy");
      } finally {
        // Never leave the helper textarea behind.
        document.body.removeChild(textarea);
      }
      if (!copied) {
        throw new Error("Copy command was rejected");
      }
    }
    flashButtonMessage(button, "Copied!");
  } catch (err) {
    console.error("Citation copy failed", err);
    alert(citation);
  }
}
/**
 * Read a result's video_status as a lower-case string.
 *
 * @param {Object|null} item
 * @returns {string} Lower-cased status, or "" when item or its status is missing.
 */
function getVideoStatus(item) {
  const rawStatus = item ? item.video_status : null;
  return rawStatus ? String(rawStatus).toLowerCase() : "";
}
/**
 * Tell whether a result's status marks the video as deleted.
 *
 * @param {Object|null} item
 * @returns {boolean} True when the normalised status is exactly "deleted".
 */
function isLikelyDeleted(item) {
  const status = getVideoStatus(item);
  return status === "deleted";
}
function formatTimestamp(seconds) { function formatTimestamp(seconds) {
if (!seconds && seconds !== 0) return "00:00"; if (!seconds && seconds !== 0) return "00:00";
const hours = Math.floor(seconds / 3600); const hours = Math.floor(seconds / 3600);
@ -621,7 +759,65 @@
}, 3000); }, 3000);
} }
async function fetchAndDisplayTranscript(videoId, videoUrl, containerElement, button, highlightText = null) { const COMMON_STOP_WORDS = new Set([
"the","and","that","this","with","for","are","but","not","you","your","they","their",
"have","from","was","been","has","had","were","about","what","when","where","which",
"will","would","there","here","into","them","then","than","also","more","some","just",
"like","said","because","make","made","could","should","might"
]);
/**
 * Split text into lower-case search tokens.
 *
 * Anything other than a-z / 0-9 is a separator. Tokens of two characters or
 * fewer and words in COMMON_STOP_WORDS are dropped; at most 20 tokens are
 * returned.
 *
 * @param {string} text
 * @returns {string[]}
 */
const tokenizeContent = (text) => {
  if (!text) return [];

  const isMeaningful = (word) => word.length > 2 && !COMMON_STOP_WORDS.has(word);
  const words = text.toLowerCase().split(/[^a-z0-9]+/g);
  return words.filter(isMeaningful).slice(0, 20);
};
/**
 * Gather search tokens from a result's highlight snippets.
 *
 * Each entry is either an HTML string or an object with `html` / `text`.
 * The text returned by extractMarkedText is tokenized when there is any;
 * otherwise the whole snippet with markup stripped is used.
 *
 * @param {Array|null} entries
 * @returns {string[]} Tokens in encounter order (duplicates kept).
 */
function collectHighlightTokens(entries) {
  if (!Array.isArray(entries)) return [];

  const tokens = [];
  for (const entry of entries) {
    const snippet = typeof entry === "string" ? entry : entry?.html || entry?.text || "";
    if (!snippet) continue;

    const tokenSource = extractMarkedText(snippet) || stripHtmlAndNormalize(snippet);
    tokens.push(...tokenizeContent(tokenSource));
  }
  return tokens;
}
/**
 * Tokenize the user's search query (at most 20 tokens).
 *
 * @param {string|null} query
 * @returns {string[]}
 */
function buildQueryTokens(query) {
  const source = query || "";
  const tokens = tokenizeContent(source);
  return tokens.slice(0, 20);
}
/**
 * Mark transcript segments that contain any highlight or query token.
 *
 * Every ".transcript-segment" inside transcriptDiv has the
 * "transcript-segment--matched" class switched on or off depending on
 * whether its data-text contains one of the tokens. When there are no tokens
 * at all the segments are left untouched.
 *
 * @param {Element|null} transcriptDiv - Rendered transcript container.
 * @param {Array|null} entries - Highlight snippets for the result.
 * @param {string} searchQuery - Current search box text.
 */
function highlightTranscriptMatches(transcriptDiv, entries, searchQuery) {
  if (!transcriptDiv) return;

  const tokens = new Set([
    ...collectHighlightTokens(entries),
    ...buildQueryTokens(searchQuery),
  ]);
  if (tokens.size === 0) return;

  const needles = [...tokens];
  for (const segment of transcriptDiv.querySelectorAll(".transcript-segment")) {
    const haystack = segment.dataset.text || "";
    const matched = needles.some((needle) => haystack.includes(needle));
    segment.classList.toggle("transcript-segment--matched", matched);
  }
}
async function fetchAndDisplayTranscript(
videoId,
videoUrl,
containerElement,
button,
highlightText = null,
allHighlights = null,
searchQuery = ""
) {
const existingTranscript = containerElement.querySelector('.full-transcript'); const existingTranscript = containerElement.querySelector('.full-transcript');
if (existingTranscript && !highlightText) { if (existingTranscript && !highlightText) {
existingTranscript.remove(); existingTranscript.remove();
@ -631,6 +827,7 @@
// If transcript exists and we have highlight text, just scroll to it // If transcript exists and we have highlight text, just scroll to it
if (existingTranscript && highlightText) { if (existingTranscript && highlightText) {
highlightTranscriptMatches(existingTranscript, allHighlights, searchQuery);
const segment = findMatchingSegment(existingTranscript, highlightText); const segment = findMatchingSegment(existingTranscript, highlightText);
if (segment) { if (segment) {
scrollToSegment(segment); scrollToSegment(segment);
@ -728,6 +925,7 @@
} }
}, 100); }, 100);
} }
highlightTranscriptMatches(transcriptDiv, allHighlights, searchQuery);
} catch (err) { } catch (err) {
console.error('Error fetching transcript:', err); console.error('Error fetching transcript:', err);
button.textContent = 'View Full Transcript'; button.textContent = 'View Full Transcript';
@ -797,7 +995,8 @@ function clearFrequency(message) {
} }
} }
function renderFrequencyChart(buckets, channelTotals) {
function renderFrequencyChart(buckets, channelTotals) {
if (!freqChart || typeof d3 === "undefined") { if (!freqChart || typeof d3 === "undefined") {
return; return;
} }
@ -807,6 +1006,26 @@ function renderFrequencyChart(buckets, channelTotals) {
return; return;
} }
const channelNameFallback = new Map();
(channelTotals || []).forEach((entry) => {
if (!entry || !entry.id) return;
if (entry.name) {
channelNameFallback.set(entry.id, entry.name);
}
});
buckets.forEach((bucket) => {
(bucket.channels || []).forEach((entry) => {
if (entry && entry.id && entry.name && !channelNameFallback.has(entry.id)) {
channelNameFallback.set(entry.id, entry.name);
}
});
});
const getChannelLabel = (id) => {
if (!id) return "";
return channelMap.get(id) || channelNameFallback.get(id) || id;
};
let channelsOrder = let channelsOrder =
(channelTotals && channelTotals.length (channelTotals && channelTotals.length
? channelTotals.map((entry) => entry.id) ? channelTotals.map((entry) => entry.id)
@ -929,7 +1148,7 @@ function renderFrequencyChart(buckets, channelTotals) {
.text(function (d) { .text(function (d) {
const group = this.parentNode ? this.parentNode.parentNode : null; const group = this.parentNode ? this.parentNode.parentNode : null;
const key = group ? d3.select(group).datum().key : undefined; const key = group ? d3.select(group).datum().key : undefined;
const label = key ? channelMap.get(key) || key : key || ''; const label = key ? getChannelLabel(key) : key || '';
return `${dateKeyFormat(d.data.date)}: ${d[1] - d[0]}${label ? " (" + label + ")" : ''}`; return `${dateKeyFormat(d.data.date)}: ${d[1] - d[0]}${label ? " (" + label + ")" : ''}`;
}); });
@ -942,7 +1161,7 @@ function renderFrequencyChart(buckets, channelTotals) {
swatch.className = "freq-legend-swatch"; swatch.className = "freq-legend-swatch";
swatch.style.backgroundColor = color(key); swatch.style.backgroundColor = color(key);
const label = document.createElement("span"); const label = document.createElement("span");
label.textContent = channelMap.get(key) || key; label.textContent = getChannelLabel(key) || key;
item.appendChild(swatch); item.appendChild(swatch);
item.appendChild(label); item.appendChild(label);
legend.appendChild(item); legend.appendChild(item);
@ -1027,12 +1246,15 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
item.descriptionHtml || escapeHtml(item.description || ""); item.descriptionHtml || escapeHtml(item.description || "");
const header = document.createElement("div"); const header = document.createElement("div");
header.className = "result-header";
const headerMain = document.createElement("div");
headerMain.className = "result-header-main";
const badgeDefs = []; const badgeDefs = [];
if (item.highlightSource && item.highlightSource.primary) { if (item.highlightSource && item.highlightSource.primary) {
badgeDefs.push({ label: "primary transcript" }); badgeDefs.push({ label: "primary transcript", badgeType: "transcript-primary" });
} }
if (item.highlightSource && item.highlightSource.secondary) { if (item.highlightSource && item.highlightSource.secondary) {
badgeDefs.push({ label: "secondary transcript" }); badgeDefs.push({ label: "secondary transcript", badgeType: "transcript-secondary" });
} }
// Add reference count badges // Add reference count badges
@ -1068,13 +1290,47 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
}); });
} }
header.innerHTML = ` const titleEl = document.createElement("strong");
<strong>${titleHtml}</strong> titleEl.innerHTML = titleHtml;
<div class="muted">${escapeHtml(item.channel_name || "")} ${fmtDate( headerMain.appendChild(titleEl);
item.date
)}</div> const metaLine = document.createElement("div");
<div class="muted"><a href="${item.url}" target="_blank" rel="noopener">Open on YouTube</a></div> metaLine.className = "muted result-meta";
`; const channelLabel = item.channel_name || "";
const dateLabel = fmtDate(item.date);
let durationSeconds = null;
if (typeof item.duration === "number") {
durationSeconds = item.duration;
} else if (typeof item.duration === "string" && item.duration.trim()) {
const parsed = parseFloat(item.duration);
if (!Number.isNaN(parsed)) {
durationSeconds = parsed;
}
}
const durationLabel = durationSeconds != null ? `${formatTimestamp(durationSeconds)}` : "";
metaLine.textContent = channelLabel
? `${channelLabel}${dateLabel}${durationLabel}`
: `${dateLabel}${durationLabel}`;
if (isLikelyDeleted(item)) {
metaLine.appendChild(document.createTextNode(" "));
const statusEl = document.createElement("span");
statusEl.className = "result-status result-status--deleted";
statusEl.textContent = "Likely deleted";
statusEl.title = "YouTube reported this video as unavailable when we last checked.";
metaLine.appendChild(statusEl);
}
headerMain.appendChild(metaLine);
const linkLine = document.createElement("div");
linkLine.className = "muted";
const openLink = document.createElement("a");
openLink.href = item.url;
openLink.target = "_blank";
openLink.rel = "noopener";
openLink.textContent = "Open on YouTube";
linkLine.appendChild(openLink);
headerMain.appendChild(linkLine);
header.appendChild(headerMain);
if (badgeDefs.length) { if (badgeDefs.length) {
const badgeRow = document.createElement("div"); const badgeRow = document.createElement("div");
badgeRow.className = "badge-row"; badgeRow.className = "badge-row";
@ -1086,6 +1342,9 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
if (badge.title) { if (badge.title) {
badgeEl.title = badge.title; badgeEl.title = badge.title;
} }
if (badge.badgeType) {
badgeEl.classList.add(`badge--${badge.badgeType}`);
}
if (badge.query) { if (badge.query) {
badgeEl.classList.add("badge-clickable"); badgeEl.classList.add("badge-clickable");
badgeEl.setAttribute("role", "button"); badgeEl.setAttribute("role", "button");
@ -1110,7 +1369,45 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
badgeRow.appendChild(badgeEl); badgeRow.appendChild(badgeEl);
}); });
if (badgeRow.childElementCount) { if (badgeRow.childElementCount) {
header.appendChild(badgeRow); headerMain.appendChild(badgeRow);
}
}
if (item.video_id) {
const actions = document.createElement("div");
actions.className = "result-actions";
const downloadBtn = document.createElement("button");
downloadBtn.type = "button";
downloadBtn.className = "result-action-btn";
downloadBtn.textContent = "Download transcript";
downloadBtn.addEventListener("click", () => handleTranscriptDownload(item, downloadBtn));
actions.appendChild(downloadBtn);
const citationBtn = document.createElement("button");
citationBtn.type = "button";
citationBtn.className = "result-action-btn";
citationBtn.textContent = "Copy citation";
citationBtn.addEventListener("click", () => handleCopyCitation(item, citationBtn));
actions.appendChild(citationBtn);
if (graphOverlay) {
const graphBtn = document.createElement("button");
graphBtn.type = "button";
graphBtn.className = "result-action-btn graph-launch-btn";
graphBtn.textContent = "Graph";
if (graphUiAvailable()) {
graphBtn.title = "Open reference graph";
} else {
graphBtn.disabled = true;
graphBtn.title = "Reference graph is still loading…";
graphBtn.dataset.awaitGraphReady = "1";
}
graphBtn.addEventListener("click", () => openGraphModal(item.video_id));
actions.appendChild(graphBtn);
}
if (actions.childElementCount) {
header.appendChild(actions);
} }
} }
el.appendChild(header); el.appendChild(header);
@ -1128,9 +1425,25 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
item.toHighlight.forEach((entry) => { item.toHighlight.forEach((entry) => {
const html = typeof entry === "string" ? entry : entry?.html; const html = typeof entry === "string" ? entry : entry?.html;
if (!html) return; if (!html) return;
const source = entry && typeof entry === "object" ? entry.source : null;
const row = document.createElement("div"); const row = document.createElement("div");
row.className = "highlight-row"; row.className = "highlight-row";
row.innerHTML = html; if (source === "primary") {
row.classList.add("highlight-row--primary");
} else if (source === "secondary") {
row.classList.add("highlight-row--secondary");
}
const textBlock = document.createElement("div");
textBlock.className = "highlight-text";
textBlock.innerHTML = html;
row.appendChild(textBlock);
if (source) {
const indicator = document.createElement("span");
indicator.className = `highlight-source-indicator highlight-source-indicator--${source}`;
indicator.title =
source === "primary" ? "Highlight from primary transcript" : "Highlight from secondary transcript";
row.appendChild(indicator);
}
row.title = "Click to jump to this location in the transcript"; row.title = "Click to jump to this location in the transcript";
// Make highlight clickable // Make highlight clickable
@ -1138,7 +1451,15 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
const transcriptBtn = el.querySelector(".transcript-toggle"); const transcriptBtn = el.querySelector(".transcript-toggle");
if (transcriptBtn && item.video_id) { if (transcriptBtn && item.video_id) {
const highlightText = stripHtmlAndNormalize(html); const highlightText = stripHtmlAndNormalize(html);
fetchAndDisplayTranscript(item.video_id, item.url, el, transcriptBtn, highlightText); fetchAndDisplayTranscript(
item.video_id,
item.url,
el,
transcriptBtn,
highlightText,
item.toHighlight,
qInput.value
);
} }
}; };
@ -1154,7 +1475,15 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
transcriptBtn.className = "transcript-toggle"; transcriptBtn.className = "transcript-toggle";
transcriptBtn.textContent = "View Full Transcript"; transcriptBtn.textContent = "View Full Transcript";
transcriptBtn.onclick = () => { transcriptBtn.onclick = () => {
fetchAndDisplayTranscript(item.video_id, item.url, el, transcriptBtn); fetchAndDisplayTranscript(
item.video_id,
item.url,
el,
transcriptBtn,
null,
item.toHighlight,
qInput.value
);
}; };
el.appendChild(transcriptBtn); el.appendChild(transcriptBtn);
} }
@ -1223,10 +1552,28 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
updateFrequencyChart(q, channels, year, queryMode, { exact, fuzzy, phrase }); updateFrequencyChart(q, channels, year, queryMode, { exact, fuzzy, phrase });
} }
searchBtn.addEventListener("click", () => runSearch(0)); searchBtn.addEventListener("click", () => runSearch(0));
if (aboutBtn && aboutPanel) {
aboutBtn.addEventListener("click", () => {
const isHidden = aboutPanel.hasAttribute("hidden");
toggleAboutPanel(isHidden);
});
}
if (aboutCloseBtn) {
aboutCloseBtn.addEventListener("click", () => toggleAboutPanel(false));
}
qInput.addEventListener("keypress", (e) => { qInput.addEventListener("keypress", (e) => {
if (e.key === "Enter") runSearch(0); if (e.key === "Enter") runSearch(0);
}); });
if (channelSelect) {
channelSelect.addEventListener("change", () => {
pendingChannelSelection = channelSelect.value || "";
if (channelsReady) {
runSearch(0);
}
});
}
yearSel.addEventListener("change", () => runSearch(0)); yearSel.addEventListener("change", () => runSearch(0));
sortSel.addEventListener("change", () => runSearch(0)); sortSel.addEventListener("change", () => runSearch(0));
sizeSel.addEventListener("change", () => runSearch(0)); sizeSel.addEventListener("change", () => runSearch(0));

85
static/graph.html Normal file
View File

@ -0,0 +1,85 @@
<!doctype html>
<!-- Standalone reference-graph page. The element ids below (graphForm,
     graphVideoId, graphDepth, graphMaxNodes, graphLabelSize, graphStatus,
     graphContainer) are looked up by /static/graph.js; keep them in sync. -->
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>TLC Reference Graph</title>
    <link rel="icon" href="/static/favicon.png" type="image/png" />
    <link rel="stylesheet" href="https://unpkg.com/xp.css" />
    <link rel="stylesheet" href="/static/style.css" />
    <!-- d3 must load before graph.js, which uses the global `d3`. -->
    <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
  </head>
  <body>
    <div class="window graph-window" style="max-width: 1100px; margin: 20px auto;">
      <div class="title-bar">
        <div class="title-bar-text">Reference Graph</div>
        <div class="title-bar-controls">
          <a class="title-bar-link" href="/">⬅ Search</a>
        </div>
      </div>
      <div class="window-body">
        <p>
          Explore how videos reference each other. Enter a <code>video_id</code> to see its immediate
          neighbors (referenced and referencing videos). Choose a larger depth to expand the graph.
        </p>
        <form id="graphForm" class="graph-controls">
          <div class="field-group">
            <label for="graphVideoId">Video ID</label>
            <input
              id="graphVideoId"
              name="video_id"
              type="text"
              placeholder="e.g. dQw4w9WgXcQ"
              required
            />
          </div>
          <div class="field-group">
            <label for="graphDepth">Depth</label>
            <select id="graphDepth" name="depth">
              <option value="1">1 hop</option>
              <option value="2">2 hops</option>
              <option value="3">3 hops</option>
            </select>
          </div>
          <div class="field-group">
            <label for="graphMaxNodes">Max nodes</label>
            <select id="graphMaxNodes" name="max_nodes">
              <option value="100">100</option>
              <option value="150">150</option>
              <option value="200" selected>200</option>
              <option value="300">300</option>
            </select>
          </div>
          <div class="field-group">
            <label for="graphLabelSize">Labels</label>
            <select id="graphLabelSize" name="label_size">
              <option value="off">Off</option>
              <option value="tiny" selected>Tiny</option>
              <option value="small">Small</option>
              <option value="normal">Normal</option>
              <option value="medium">Medium</option>
              <option value="large">Large</option>
              <option value="xlarge">Extra large</option>
            </select>
          </div>
          <button type="submit">Build graph</button>
        </form>
        <!-- Updated by script as graphs load or fail; announced politely to
             assistive technology. -->
        <div id="graphStatus" class="graph-status" role="status" aria-live="polite">Enter a video ID to begin.</div>
        <div id="graphContainer" class="graph-container"></div>
      </div>
      <div class="status-bar">
        <p class="status-bar-field">Click nodes to open the video on YouTube</p>
        <p class="status-bar-field">Colors represent channels</p>
      </div>
    </div>
    <script src="/static/graph.js"></script>
  </body>
</html>

670
static/graph.js Normal file
View File

@ -0,0 +1,670 @@
(() => {
// Reuse an existing window.GraphUI object if one exists, otherwise create it.
// `ready` starts false here.
const global = window;
const GraphUI = (global.GraphUI = global.GraphUI || {});
GraphUI.ready = false;
// Form controls and output areas, looked up by id.
const form = document.getElementById("graphForm");
const videoInput = document.getElementById("graphVideoId");
const depthInput = document.getElementById("graphDepth");
const maxNodesInput = document.getElementById("graphMaxNodes");
const labelSizeInput = document.getElementById("graphLabelSize");
const statusEl = document.getElementById("graphStatus");
const container = document.getElementById("graphContainer");
// Truthy only when the container carries data-embedded="true".
const isEmbedded =
container && container.dataset && container.dataset.embedded === "true";
// Abort the whole script when any required control is absent. statusEl is
// deliberately not in this list; code that uses it checks for it separately.
if (!form || !videoInput || !depthInput || !maxNodesInput || !labelSizeInput || !container) {
console.error("Graph: required DOM elements missing.");
return;
}
// Ordinal colour scale over d3's Tableau10 scheme; paletteSizeDefault is the
// number of colours in its range, falling back to 10.
const color = d3.scaleOrdinal(d3.schemeTableau10);
const colorRange = typeof color.range === "function" ? color.range() : [];
const paletteSizeDefault = colorRange.length || 10;
// Fill patterns that can be layered over a base colour. "none" means a plain
// fill; ADDITIONAL_PATTERNS is every entry except "none".
const PATTERN_TYPES = [
{ key: "none", legendClass: "none" },
{ key: "diag-forward", legendClass: "diag-forward" },
{ key: "diag-back", legendClass: "diag-back" },
{ key: "cross", legendClass: "cross" },
{ key: "dots", legendClass: "dots" },
];
const ADDITIONAL_PATTERNS = PATTERN_TYPES.filter((pattern) => pattern.key !== "none");
/**
 * Coerce a depth value to an integer in the range 0..3.
 *
 * @param {*} value - Raw value (typically a select's string value).
 * @returns {number} Clamped depth; 1 when the value is not a number.
 */
const sanitizeDepth = (value) => {
  const depth = Number.parseInt(value, 10);
  return Number.isNaN(depth) ? 1 : Math.min(Math.max(depth, 0), 3);
};
/**
 * Coerce a node-limit value to an integer in the range 10..400.
 *
 * @param {*} value - Raw value (typically a select's string value).
 * @returns {number} Clamped limit; 200 when the value is not a number.
 */
const sanitizeMaxNodes = (value) => {
  const limit = Number.parseInt(value, 10);
  return Number.isNaN(limit) ? 200 : Math.min(Math.max(limit, 10), 400);
};
// Every label-size option accepted by the UI.
const LABEL_SIZE_VALUES = ["off", "tiny", "small", "normal", "medium", "large", "xlarge"];
// Font size for each label size; "off" has no entry.
const LABEL_FONT_SIZES = {
tiny: "7px",
small: "8px",
normal: "9px",
medium: "10px",
large: "11px",
xlarge: "13px",
};
// Used whenever a supplied label size is not one of LABEL_SIZE_VALUES.
const DEFAULT_LABEL_SIZE = "tiny";
// True when value is one of the recognised label sizes.
const isValidLabelSize = (value) => LABEL_SIZE_VALUES.includes(value);
/**
 * Read the label size currently selected in the form.
 *
 * @returns {string} The select's value when it is a recognised size,
 *   otherwise DEFAULT_LABEL_SIZE (also used when the select is missing).
 */
const getLabelSize = () => {
  if (!labelSizeInput) return DEFAULT_LABEL_SIZE;
  const selected = labelSizeInput.value;
  if (isValidLabelSize(selected)) {
    return selected;
  }
  return DEFAULT_LABEL_SIZE;
};
/**
 * Set the label-size select, substituting DEFAULT_LABEL_SIZE for
 * unrecognised values. Does nothing when the select is missing.
 *
 * @param {string} value - Requested label size.
 */
function setLabelSizeInput(value) {
  if (!labelSizeInput) return;
  let nextSize = DEFAULT_LABEL_SIZE;
  if (isValidLabelSize(value)) {
    nextSize = value;
  }
  labelSizeInput.value = nextSize;
}
/**
 * Pick the channel label for a graph node: channel_name, then channel_id,
 * then the literal "Unknown".
 *
 * @param {Object|null} node
 * @returns {*}
 */
const getChannelLabel = (node) => {
  if (!node) return "Unknown";
  return node.channel_name || node.channel_id || "Unknown";
};
/**
 * Fill an 8x8 SVG <pattern> with a solid base colour plus an optional
 * overlay selected by patternKey.
 *
 * @param {Object} pattern - d3 selection of the <pattern> element.
 * @param {string} baseColor - Fill for the background rect.
 * @param {string} patternKey - "diag-forward", "diag-back", "cross" or
 *   "dots"; any other value leaves just the solid background.
 */
function appendPatternContent(pattern, baseColor, patternKey) {
  const strokeColor = "#1f1f1f";
  const strokeOpacity = 0.35;

  // Solid background tile.
  pattern.append("rect").attr("width", 8).attr("height", 8).attr("fill", baseColor);

  // Shared recipe for both diagonal hatches; only the path data differs.
  const drawHatch = (pathData) => {
    pattern
      .append("path")
      .attr("d", pathData)
      .attr("stroke", strokeColor)
      .attr("stroke-width", 1)
      .attr("stroke-opacity", strokeOpacity)
      .attr("fill", "none");
  };
  const drawForward = () => drawHatch("M-2,6 L2,2 M0,8 L8,0 M6,10 L10,4");
  const drawBackward = () => drawHatch("M-2,2 L2,6 M0,0 L8,8 M6,-2 L10,2");
  const drawDot = () => {
    pattern
      .append("circle")
      .attr("cx", 4)
      .attr("cy", 4)
      .attr("r", 1.5)
      .attr("fill", strokeColor)
      .attr("fill-opacity", strokeOpacity);
  };

  // Overlay steps per pattern key, applied in order.
  const overlays = new Map([
    ["diag-forward", [drawForward]],
    ["diag-back", [drawBackward]],
    ["cross", [drawForward, drawBackward]],
    ["dots", [drawDot]],
  ]);
  const steps = overlays.get(patternKey) || [];
  steps.forEach((draw) => draw());
}
/**
 * Build the style record for one channel.
 *
 * @param {string} label - Channel label (not used in the returned record).
 * @param {string} baseColor - Base fill colour for the channel.
 * @param {string} patternKey - Requested pattern; unknown keys fall back to
 *   the first entry of PATTERN_TYPES.
 * @returns {{baseColor: string, hatch: string, legendClass: string}}
 */
function createChannelStyle(label, baseColor, patternKey) {
  const knownPattern = PATTERN_TYPES.find((candidate) => candidate.key === patternKey);
  const resolved = knownPattern || PATTERN_TYPES[0];

  let hatch = "none";
  let legendClass = "none";
  if (resolved) {
    hatch = resolved.key;
    legendClass = resolved.legendClass;
  }
  return { baseColor, hatch, legendClass };
}
// Mutable state shared by the functions in this file.
// NOTE(review): currentGraphData presumably holds the most recent graph payload; confirm against its writers.
let currentGraphData = null;
// Channel label -> style record; rebuilt on each render.
let currentChannelStyles = new Map();
// Initial depth / node limit come from the form controls, sanitised.
let currentDepth = sanitizeDepth(depthInput.value);
let currentMaxNodes = sanitizeMaxNodes(maxNodesInput.value);
// Simulation of the current render, stopped before a new render starts.
let currentSimulation = null;
/**
 * Show a message in the status line and flag it as an error or not.
 * Does nothing when the status element is missing.
 *
 * @param {string} message - Text to display.
 * @param {boolean} [isError=false] - Whether to apply the "error" class.
 */
function setStatus(message, isError = false) {
  if (!statusEl) return;
  statusEl.textContent = message;
  // Explicit boolean so toggle() always forces the state rather than flipping it.
  statusEl.classList.toggle("error", Boolean(isError));
}
/**
 * Trim surrounding whitespace from an id, treating falsy input as "".
 *
 * @param {string|null|undefined} value
 * @returns {string}
 */
function sanitizeId(value) {
  const raw = value || "";
  return raw.trim();
}
/**
 * Request graph data for a video from /api/graph.
 *
 * @param {string} videoId - Root video id.
 * @param {number} depth - Hop count to send.
 * @param {number} maxNodes - Node limit to send.
 * @returns {Promise<*>} Parsed JSON response body.
 * @throws {Error} On a non-2xx response, using the body's `error` field when
 *   the body is JSON and has one, otherwise a generic status message.
 */
async function fetchGraph(videoId, depth, maxNodes) {
  const query = new URLSearchParams({
    video_id: videoId,
    depth: String(depth),
    max_nodes: String(maxNodes),
  });
  const response = await fetch(`/api/graph?${query.toString()}`);
  if (response.ok) {
    return response.json();
  }

  // The error body may not be JSON at all; treat that as "no details".
  const details = await response.json().catch(() => ({}));
  const fallback = `Graph request failed (${response.status} ${response.statusText})`;
  throw new Error(details.error || fallback);
}
/**
 * Size the graph container to 60% of the viewport height (rounded), but
 * never less than 520px. Does nothing when the container is missing.
 */
function resizeContainer() {
  if (!container) return;
  const MIN_HEIGHT_PX = 520;
  const preferredHeight = Math.round(window.innerHeight * 0.6);
  const heightPx = Math.max(MIN_HEIGHT_PX, preferredHeight);
  container.style.height = `${heightPx}px`;
}
/**
 * Draw `data` into the graph container as an SVG force-directed layout.
 *
 * Any previous drawing is removed, the previously running simulation is
 * stopped, and `currentChannelStyles` is rebuilt from the nodes being drawn.
 * The new simulation is stored in `currentSimulation`.
 *
 * @param {{nodes?: Array<Object>, links?: Array<Object>}} data Graph payload;
 *   a missing `nodes` or `links` list is treated as empty.
 * @param {string} [labelSize="normal"] Label size key passed on to
 *   `applyLabelAppearance`.
 */
function renderGraph(data, labelSize = "normal") {
  if (!container) return;

  // Stop the old layout so it does not keep ticking for elements we remove.
  if (currentSimulation) {
    currentSimulation.stop();
    currentSimulation = null;
  }
  container.innerHTML = "";

  const REFERENCES_COLOR = "#6c83c7";
  const REFERENCED_BY_COLOR = "#c76c6c";

  const width = container.clientWidth || 900;
  const height = container.clientHeight || 600;
  const svg = d3
    .select(container)
    .append("svg")
    .attr("viewBox", [0, 0, width, height])
    .attr("width", "100%")
    .attr("height", height);

  const defs = svg.append("defs");

  // Both arrowheads share one geometry and differ only in id and fill.
  const appendArrowMarker = (id, fill) => {
    defs
      .append("marker")
      .attr("id", id)
      .attr("viewBox", "0 -5 10 10")
      .attr("refX", 18)
      .attr("refY", 0)
      .attr("markerWidth", 6)
      .attr("markerHeight", 6)
      .attr("orient", "auto")
      .append("path")
      .attr("d", "M0,-5L10,0L0,5")
      .attr("fill", fill);
  };
  appendArrowMarker("arrow-references", REFERENCES_COLOR);
  appendArrowMarker("arrow-referenced-by", REFERENCED_BY_COLOR);

  // Layer order matters: links below nodes, labels on top.
  const contentGroup = svg.append("g").attr("class", "graph-content");
  const linkGroup = contentGroup.append("g").attr("class", "graph-links");
  const nodeGroup = contentGroup.append("g").attr("class", "graph-nodes");
  const labelGroup = contentGroup.append("g").attr("class", "graph-labels");

  const links = data.links || [];
  const nodes = data.nodes || [];

  currentChannelStyles = new Map();
  // Deduplicate channel labels in first-appearance order (Set keeps insertion
  // order). The style map was just reset, so it cannot be used for the
  // "already seen" check; doing so listed every node's channel and made the
  // index below a node index instead of a channel index.
  const uniqueChannels = Array.from(
    new Set(nodes.map((node) => getChannelLabel(node)))
  );

  const additionalPatternCount = ADDITIONAL_PATTERNS.length;
  uniqueChannels.forEach((label, idx) => {
    const baseColor = color(label);
    let patternKey = "none";
    // Channels beyond the first `paletteSizeDefault` cycle through the extra
    // hatch patterns.
    if (idx >= paletteSizeDefault && additionalPatternCount > 0) {
      const patternInfo =
        ADDITIONAL_PATTERNS[(idx - paletteSizeDefault) % additionalPatternCount];
      patternKey = patternInfo.key;
    }
    currentChannelStyles.set(label, createChannelStyle(label, baseColor, patternKey));
  });

  const linkSelection = linkGroup
    .selectAll("line")
    .data(links)
    .enter()
    .append("line")
    .attr("stroke-width", 1.2)
    .attr("stroke", (d) =>
      d.relation === "references" ? REFERENCES_COLOR : REFERENCED_BY_COLOR
    )
    .attr("stroke-opacity", 0.7)
    .attr("marker-end", (d) =>
      d.relation === "references" ? "url(#arrow-references)" : "url(#arrow-referenced-by)"
    );

  let nodePatternCounter = 0;
  const nodePatternRefs = new Map();

  // Resolve a node's fill: plain channel color, or a per-node <pattern> when
  // the channel style carries a hatch. The pattern is per node because its
  // transform is moved along with the node on every tick.
  const getNodeFill = (node) => {
    const style = currentChannelStyles.get(getChannelLabel(node));
    if (!style) {
      return color(getChannelLabel(node));
    }
    if (!style.hatch || style.hatch === "none") {
      return style.baseColor;
    }
    const patternId = `node-pattern-${nodePatternCounter++}`;
    const pattern = defs
      .append("pattern")
      .attr("id", patternId)
      .attr("patternUnits", "userSpaceOnUse")
      .attr("width", 8)
      .attr("height", 8);
    appendPatternContent(pattern, style.baseColor, style.hatch);
    pattern.attr("patternTransform", "translate(0,0)");
    nodePatternRefs.set(node.id, pattern);
    return `url(#${patternId})`;
  };

  const nodeSelection = nodeGroup
    .selectAll("circle")
    .data(nodes, (d) => d.id)
    .enter()
    .append("circle")
    .attr("r", (d) => (d.is_root ? 10 : 7))
    .attr("fill", (d) => getNodeFill(d))
    .attr("stroke", "#1f1f1f")
    .attr("stroke-width", (d) => (d.is_root ? 2 : 1))
    .call(
      d3
        .drag()
        // `simulation` is declared further down; these handlers only run
        // after it has been initialised.
        .on("start", (event, d) => {
          if (!event.active) simulation.alphaTarget(0.3).restart();
          d.fx = d.x;
          d.fy = d.y;
        })
        .on("drag", (event, d) => {
          d.fx = event.x;
          d.fy = event.y;
        })
        .on("end", (event, d) => {
          if (!event.active) simulation.alphaTarget(0);
          // Release the pin so the node rejoins the layout.
          d.fx = null;
          d.fy = null;
        })
    )
    // Left click opens the node's URL in a new tab.
    .on("click", (event, d) => {
      if (d.url) {
        window.open(d.url, "_blank", "noopener");
      }
    })
    // Right click re-roots the graph on the clicked node.
    .on("contextmenu", (event, d) => {
      event.preventDefault();
      loadGraph(d.id, currentDepth, currentMaxNodes, { updateInputs: true });
    });

  // Tooltip: title (or id), then channel and date when available.
  nodeSelection.append("title").text((d) => {
    const parts = [];
    parts.push(d.title || d.id);
    if (d.channel_name) {
      parts.push(`Channel: ${d.channel_name}`);
    }
    if (d.date) {
      parts.push(`Date: ${d.date}`);
    }
    return parts.join("\n");
  });

  const labelSelection = labelGroup
    .selectAll("text")
    .data(nodes, (d) => d.id)
    .enter()
    .append("text")
    .attr("class", "graph-node-label")
    .attr("text-anchor", "middle")
    .attr("fill", "#1f1f1f")
    .attr("pointer-events", "none")
    .text((d) => d.title || d.id);
  applyLabelAppearance(labelSelection, labelSize);

  const simulation = d3
    .forceSimulation(nodes)
    .force(
      "link",
      d3
        .forceLink(links)
        .id((d) => d.id)
        .distance(120)
        .strength(0.8)
    )
    .force("charge", d3.forceManyBody().strength(-320))
    .force("center", d3.forceCenter(width / 2, height / 2))
    .force(
      "collide",
      d3.forceCollide().radius((d) => (d.is_root ? 20 : 14)).iterations(2)
    );

  simulation.on("tick", () => {
    linkSelection
      .attr("x1", (d) => d.source.x)
      .attr("y1", (d) => d.source.y)
      .attr("x2", (d) => d.target.x)
      .attr("y2", (d) => d.target.y);
    nodeSelection.attr("cx", (d) => d.x).attr("cy", (d) => d.y);
    labelSelection.attr("x", (d) => d.x).attr("y", (d) => d.y - (d.is_root ? 14 : 12));
    // Keep each hatch pattern anchored to its node; fall back to 0 while the
    // coordinates are not finite numbers.
    nodeSelection.each(function (d) {
      const pattern = nodePatternRefs.get(d.id);
      if (pattern) {
        const safeX = Number.isFinite(d.x) ? d.x : 0;
        const safeY = Number.isFinite(d.y) ? d.y : 0;
        pattern.attr("patternTransform", `translate(${safeX}, ${safeY})`);
      }
    });
  });

  // Pan/zoom transforms the content group, not the <svg> element itself.
  const zoomBehavior = d3
    .zoom()
    .scaleExtent([0.3, 3])
    .on("zoom", (event) => {
      contentGroup.attr("transform", event.transform);
    });
  svg.call(zoomBehavior);

  currentSimulation = simulation;
}
/**
 * Fetch the graph for `videoId` and display it, updating the status line,
 * the legend and (via `updateUrlState`) the page URL.
 *
 * @param {string} videoId Raw video ID; passed through `sanitizeId`.
 * @param {number|string} depth Raw depth; passed through `sanitizeDepth`.
 * @param {number|string} maxNodes Raw node cap; passed through `sanitizeMaxNodes`.
 * @param {{updateInputs?: boolean}} [options] When `updateInputs` is true the
 *   form inputs are overwritten with the sanitized values.
 * @returns {Promise<void>} Resolves once the view has been updated; fetch and
 *   render errors are reported through the status line, not rethrown.
 */
async function loadGraph(videoId, depth, maxNodes, { updateInputs = false } = {}) {
  const sanitizedId = sanitizeId(videoId);
  if (!sanitizedId) {
    setStatus("Please enter a video ID.", true);
    return;
  }
  const safeDepth = sanitizeDepth(depth);
  const safeMaxNodes = sanitizeMaxNodes(maxNodes);
  if (updateInputs) {
    videoInput.value = sanitizedId;
    depthInput.value = String(safeDepth);
    maxNodesInput.value = String(safeMaxNodes);
  }

  // Shared reset for the "no data" and "error" outcomes. The running
  // simulation is stopped as well so it does not keep ticking for elements
  // that were just removed from the container.
  const clearView = () => {
    if (currentSimulation) {
      currentSimulation.stop();
      currentSimulation = null;
    }
    container.innerHTML = "";
    currentGraphData = null;
    currentChannelStyles = new Map();
    renderLegend([]);
  };

  setStatus("Loading graph…");
  try {
    const data = await fetchGraph(sanitizedId, safeDepth, safeMaxNodes);
    if (!data || !data.nodes || data.nodes.length === 0) {
      setStatus("No nodes returned for this video.", true);
      clearView();
      return;
    }
    currentGraphData = data;
    currentDepth = safeDepth;
    currentMaxNodes = safeMaxNodes;
    renderGraph(data, getLabelSize());
    renderLegend(data.nodes);
    // renderGraph accepts a payload without `links`; do the same here instead
    // of throwing after the graph has already been drawn.
    const linkCount = (data.links || []).length;
    const shownDepth = data.depth != null ? data.depth : safeDepth;
    setStatus(
      `Showing ${data.nodes.length} nodes and ${linkCount} links (depth ${shownDepth})`
    );
    updateUrlState(sanitizedId, safeDepth, safeMaxNodes, getLabelSize());
  } catch (err) {
    console.error(err);
    setStatus(err.message || "Failed to build graph.", true);
    clearView();
  }
}
/**
 * Submit handler for the graph form: suppress the native submission and
 * load the graph described by the current input values.
 *
 * @param {Event} event The form's submit event.
 */
async function handleSubmit(event) {
  event.preventDefault();
  const rawId = videoInput.value;
  const rawDepth = depthInput.value;
  const rawMaxNodes = maxNodesInput.value;
  // Write the sanitized values back so the form shows what was requested.
  await loadGraph(rawId, rawDepth, rawMaxNodes, { updateInputs: true });
}
/**
 * Rebuild the legend: a fixed "Edges" section plus a "Channels in view"
 * section generated from `currentChannelStyles`.
 *
 * The legend element (`#graphLegend`) is created on first use, placed right
 * after the status element when that is attached, otherwise before the
 * graph container.
 *
 * @param {Array<Object>} nodes Not read by this function; the channel list
 *   comes from `currentChannelStyles`.
 */
function renderLegend(nodes) {
  // Small element factory; class and text are only assigned when given.
  const make = (tag, className, text) => {
    const element = document.createElement(tag);
    if (className) element.className = className;
    if (text !== undefined) element.textContent = text;
    return element;
  };

  let legend = document.getElementById("graphLegend");
  if (!legend) {
    legend = make("div", "graph-legend");
    legend.id = "graphLegend";
    if (statusEl && statusEl.parentNode) {
      statusEl.insertAdjacentElement("afterend", legend);
    } else {
      container.parentElement?.insertBefore(legend, container);
    }
  }
  legend.innerHTML = "";

  // --- Edges section -------------------------------------------------------
  const edgesSection = make("div", "graph-legend-section");
  edgesSection.appendChild(make("div", "graph-legend-title", "Edges"));
  const edgeRows = [
    ["graph-legend-swatch--references", "Outgoing reference (video references other)"],
    ["graph-legend-swatch--referenced", "Incoming reference (other video references this)"],
  ];
  for (const [swatchClass, caption] of edgeRows) {
    const row = make("div", "graph-legend-row");
    row.appendChild(make("span", `graph-legend-swatch ${swatchClass}`));
    row.appendChild(make("span", "", caption));
    edgesSection.appendChild(row);
  }
  legend.appendChild(edgesSection);

  // --- Channels section ----------------------------------------------------
  const channelSection = make("div", "graph-legend-section");
  channelSection.appendChild(make("div", "graph-legend-title", "Channels in view"));

  const maxChannelItems = 20;
  const sortedChannels = Array.from(currentChannelStyles.entries());
  sortedChannels.sort((left, right) =>
    left[0].localeCompare(right[0], undefined, { sensitivity: "base" })
  );

  const channelList = make("div", "graph-legend-channel-list");
  for (const [label, style] of sortedChannels.slice(0, maxChannelItems)) {
    const item = make(
      "div",
      `graph-legend-channel graph-legend-channel--${style.legendClass || "none"}`
    );
    const swatch = make("span", "graph-legend-swatch graph-legend-channel-swatch");
    swatch.style.backgroundColor = style.baseColor;
    item.appendChild(swatch);
    item.appendChild(make("span", "", label));
    channelList.appendChild(item);
  }

  if (!channelList.childElementCount) {
    channelSection.appendChild(
      make("div", "graph-legend-note", "No channel data available.")
    );
  } else {
    channelSection.appendChild(channelList);
    // Only the first `maxChannelItems` are listed; report how many were cut.
    const hiddenCount = sortedChannels.length - maxChannelItems;
    if (hiddenCount > 0) {
      channelSection.appendChild(
        make("div", "graph-legend-note", `+${hiddenCount} more channels`)
      );
    }
  }
  legend.appendChild(channelSection);
}
/**
 * Show or hide the label selection and set its font size.
 *
 * @param {Object} selection D3 selection of label elements.
 * @param {string} labelSize "off" hides the labels; any other value is looked
 *   up in `LABEL_FONT_SIZES`, falling back to the `normal` entry.
 */
function applyLabelAppearance(selection, labelSize) {
  if (labelSize !== "off") {
    const fontSize = LABEL_FONT_SIZES[labelSize] || LABEL_FONT_SIZES.normal;
    // A null value removes the inline `display` override set by "off".
    selection.style("display", null).attr("font-size", fontSize);
    return;
  }
  selection.style("display", "none");
}
/**
 * Mirror the current graph parameters into the page URL using
 * `history.replaceState` (no new history entry). Skipped when `isEmbedded`
 * is truthy.
 *
 * @param {string} videoId Value for the `video_id` query parameter.
 * @param {number|string} depth Value for `depth`.
 * @param {number|string} maxNodes Value for `max_nodes`.
 * @param {string} labelSize Written as `label_size` unless it is empty or
 *   "normal", in which case the parameter is removed.
 */
function updateUrlState(videoId, depth, maxNodes, labelSize) {
  if (!isEmbedded) {
    const target = new URL(window.location.href);
    const query = target.searchParams;
    query.set("video_id", videoId);
    query.set("depth", String(depth));
    query.set("max_nodes", String(maxNodes));

    const hasCustomLabelSize = Boolean(labelSize) && labelSize !== "normal";
    if (hasCustomLabelSize) {
      query.set("label_size", labelSize);
    } else {
      query.delete("label_size");
    }
    history.replaceState({}, "", target.toString());
  }
}
/**
 * Initialise the form from the page's query string and, when a video ID is
 * present and the UI is not embedded, load that graph.
 */
function initFromQuery() {
  const query = new URLSearchParams(window.location.search);
  const videoId = sanitizeId(query.get("video_id"));
  const depth = sanitizeDepth(query.get("depth") || "");
  const maxNodes = sanitizeMaxNodes(query.get("max_nodes") || "");
  const requestedLabelSize = query.get("label_size");

  if (videoId) {
    videoInput.value = videoId;
  }
  depthInput.value = String(depth);
  maxNodesInput.value = String(maxNodes);

  // Use the URL's label size only when it is a recognised value; otherwise
  // re-apply whatever getLabelSize() currently reports.
  const useRequested = Boolean(requestedLabelSize) && isValidLabelSize(requestedLabelSize);
  setLabelSizeInput(useRequested ? requestedLabelSize : getLabelSize());

  if (videoId && !isEmbedded) {
    loadGraph(videoId, depth, maxNodes, { updateInputs: false });
  }
}
// Size the container once up front, then keep it in sync with the viewport.
resizeContainer();
window.addEventListener("resize", resizeContainer);
form.addEventListener("submit", handleSubmit);
// Changing the label size redraws the loaded graph (if any) and refreshes
// the URL state with the new size.
labelSizeInput.addEventListener("change", function onLabelSizeChange() {
  const size = getLabelSize();
  const loadedGraph = currentGraphData;
  if (loadedGraph) {
    renderGraph(loadedGraph, size);
    renderLegend(loadedGraph.nodes);
  }
  const typedId = sanitizeId(videoInput.value);
  updateUrlState(typedId, currentDepth, currentMaxNodes, size);
});
initFromQuery();
// Public API exposed on the shared `GraphUI` object so other scripts can
// drive the graph view.
Object.assign(GraphUI, {
  /**
   * Load a graph. `depth` / `maxNodes` default to the current values when
   * null or undefined; inputs are updated unless `options.updateInputs` is
   * explicitly `false`.
   */
  load(videoId, depth, maxNodes, options = {}) {
    const targetDepth = depth != null ? depth : currentDepth;
    const targetMax = maxNodes != null ? maxNodes : currentMaxNodes;
    return loadGraph(videoId, targetDepth, targetMax, {
      updateInputs: options.updateInputs !== false,
    });
  },
  /** Set the label-size control and fire its `change` handler. */
  setLabelSize(size) {
    if (!labelSizeInput || !size) return;
    setLabelSizeInput(size);
    labelSizeInput.dispatchEvent(new Event("change", { bubbles: true }));
  },
  /** Store a sanitized depth in the input and in `currentDepth` (no reload). */
  setDepth(value) {
    if (!depthInput) return;
    const safe = sanitizeDepth(value);
    depthInput.value = String(safe);
    currentDepth = safe;
  },
  /** Store a sanitized node cap in the input and in `currentMaxNodes` (no reload). */
  setMaxNodes(value) {
    if (!maxNodesInput) return;
    const safe = sanitizeMaxNodes(value);
    maxNodesInput.value = String(safe);
    currentMaxNodes = safe;
  },
  /** Focus the video ID input and select its content. */
  focusInput() {
    if (videoInput) {
      videoInput.focus();
      videoInput.select();
    }
  },
  /** Stop the running force simulation, if any. */
  stop() {
    if (currentSimulation) {
      currentSimulation.stop();
      currentSimulation = null;
    }
  },
  /**
   * Snapshot of the current view. `nodes` and `links` are shallow copies and
   * are empty arrays when nothing is loaded or the payload lacks the list
   * (renderGraph accepts a payload without `links`, so this must too).
   */
  getState() {
    const graph = currentGraphData;
    return {
      depth: currentDepth,
      maxNodes: currentMaxNodes,
      labelSize: getLabelSize(),
      nodes: graph ? (graph.nodes || []).slice() : [],
      links: graph ? (graph.links || []).slice() : [],
    };
  },
  isEmbedded,
});
GraphUI.ready = true;
// Announce readiness on the next task, after this script has finished running.
setTimeout(() => {
  window.dispatchEvent(new CustomEvent("graph-ui-ready"));
}, 0);
})();

View File

@ -14,6 +14,7 @@
<div class="title-bar"> <div class="title-bar">
<div class="title-bar-text">This Little Corner</div> <div class="title-bar-text">This Little Corner</div>
<div class="title-bar-controls"> <div class="title-bar-controls">
<button id="aboutBtn" aria-label="About">?</button>
<button id="minimizeBtn" aria-label="Minimize"></button> <button id="minimizeBtn" aria-label="Minimize"></button>
<button aria-label="Maximize"></button> <button aria-label="Maximize"></button>
<button aria-label="Close"></button> <button aria-label="Close"></button>
@ -21,6 +22,10 @@
</div> </div>
<div class="window-body"> <div class="window-body">
<p>Enter a phrase to query title, description, and transcript text.</p> <p>Enter a phrase to query title, description, and transcript text.</p>
<p style="font-size: 11px;">
Looking for semantic matches? Try the
<a href="/vector-search">vector search beta</a>.
</p>
<fieldset> <fieldset>
<legend>Search</legend> <legend>Search</legend>
@ -31,13 +36,10 @@
</div> </div>
<div class="field-row" style="margin-bottom: 8px; align-items: center;"> <div class="field-row" style="margin-bottom: 8px; align-items: center;">
<label style="width: 60px;">Channel:</label> <label for="channel" style="width: 60px;">Channel:</label>
<details id="channelDropdown" class="channel-dropdown" style="flex: 1;"> <select id="channel" style="flex: 1;">
<summary id="channelSummary">All Channels</summary> <option value="">All Channels</option>
<div id="channelOptions" class="channel-options"> </select>
<div>Loading channels…</div>
</div>
</details>
<label for="year" style="margin-left: 8px;">Year:</label> <label for="year" style="margin-left: 8px;">Year:</label>
<select id="year"> <select id="year">
@ -64,21 +66,25 @@
<div class="toggle-item toggle-item--first"> <div class="toggle-item toggle-item--first">
<input type="checkbox" id="exactToggle" checked /> <input type="checkbox" id="exactToggle" checked />
<label for="exactToggle">Exact</label> <label for="exactToggle">Exact</label>
<span class="toggle-help">Match all terms exactly.</span>
</div> </div>
<div class="toggle-item"> <div class="toggle-item">
<input type="checkbox" id="fuzzyToggle" checked /> <input type="checkbox" id="fuzzyToggle" checked />
<label for="fuzzyToggle">Fuzzy</label> <label for="fuzzyToggle">Fuzzy</label>
<span class="toggle-help">Allow small typos and variations.</span>
</div> </div>
<div class="toggle-item"> <div class="toggle-item">
<input type="checkbox" id="phraseToggle" checked /> <input type="checkbox" id="phraseToggle" checked />
<label for="phraseToggle">Phrase</label> <label for="phraseToggle">Phrase</label>
<span class="toggle-help">Boost exact phrases inside transcripts.</span>
</div> </div>
<div class="toggle-item"> <div class="toggle-item">
<input type="checkbox" id="queryStringToggle" /> <input type="checkbox" id="queryStringToggle" />
<label for="queryStringToggle">Query string mode</label> <label for="queryStringToggle">Query string mode</label>
<span class="toggle-help">Use raw Lucene syntax (overrides other toggles).</span>
</div> </div>
</div> </div>
</fieldset> </fieldset>
@ -107,11 +113,110 @@
</fieldset> </fieldset>
</div> </div>
<div class="status-bar"> <div class="status-bar">
<p class="status-bar-field">Ready</p> <p class="status-bar-field">Ready</p>
</div>
</div>
<div class="about-panel" id="aboutPanel" hidden>
<div class="about-panel__header">
<strong>About This App</strong>
<button id="aboutCloseBtn" aria-label="Close about panel">×</button>
</div>
<div class="about-panel__body">
<p>Use the toggles to choose exact, fuzzy, or phrase matching. Query string mode accepts raw Lucene syntax.</p>
<p>Results are ranked by your chosen sort order; the timeline summarizes the same query.</p>
<p>You can download transcripts, copy MLA citations, or explore references via the graph button.</p>
</div> </div>
</div> </div>
<!-- Reference-graph modal: full-screen overlay wrapping a dialog window. -->
<!-- NOTE(review): the overlay starts with aria-hidden="true"; presumably a script toggles this together with the .active class. Confirm. -->
<div
id="graphModalOverlay"
class="graph-modal-overlay"
aria-hidden="true"
>
<div
class="window graph-window graph-modal-window"
id="graphModalWindow"
role="dialog"
aria-modal="true"
aria-labelledby="graphModalTitle"
>
<div class="title-bar">
<div class="title-bar-text" id="graphModalTitle">Reference Graph</div>
<div class="title-bar-controls">
<button id="graphModalClose" aria-label="Close"></button>
</div>
</div>
<div class="window-body">
<p>
Explore how this video links with its neighbors. Adjust depth or node cap to expand the graph.
</p>
<!-- Graph controls: the field names match the video_id, depth and max_nodes query parameters sent to /api/graph. -->
<form id="graphForm" class="graph-controls">
<div class="field-group">
<label for="graphVideoId">Video ID</label>
<input
id="graphVideoId"
name="video_id"
type="text"
placeholder="e.g. dQw4w9WgXcQ"
required
/>
</div>
<div class="field-group">
<label for="graphDepth">Depth</label>
<select id="graphDepth" name="depth">
<option value="1" selected>1 hop</option>
<option value="2">2 hops</option>
<option value="3">3 hops</option>
</select>
</div>
<div class="field-group">
<label for="graphMaxNodes">Max nodes</label>
<select id="graphMaxNodes" name="max_nodes">
<option value="100">100</option>
<option value="150">150</option>
<option value="200" selected>200</option>
<option value="300">300</option>
<option value="400">400</option>
</select>
</div>
<!-- Label size options; "off" hides node labels entirely. -->
<div class="field-group">
<label for="graphLabelSize">Labels</label>
<select id="graphLabelSize" name="label_size">
<option value="off">Off</option>
<option value="tiny" selected>Tiny</option>
<option value="small">Small</option>
<option value="normal">Normal</option>
<option value="medium">Medium</option>
<option value="large">Large</option>
<option value="xlarge">Extra large</option>
</select>
</div>
<button type="submit">Build graph</button>
</form>
<!-- Status line for loading, error and summary messages. -->
<div id="graphStatus" class="graph-status">Enter a video ID to begin.</div>
<!-- Graph mount point. NOTE(review): data-embedded="true" presumably feeds graph.js's isEmbedded flag (skips URL updates and auto-load). Confirm. -->
<div
id="graphContainer"
class="graph-container"
data-embedded="true"
></div>
</div>
<div class="status-bar">
<p class="status-bar-field">Right-click a node to set a new root</p>
<p class="status-bar-field">Colors (and hatches) represent channels</p>
</div>
</div>
</div>
<script src="/static/graph.js"></script>
<script src="/static/app.js"></script> <script src="/static/app.js"></script>
</body> </body>
</html> </html>

View File

@ -63,7 +63,7 @@ body.dimmed {
} }
.field-row input[type="text"], .field-row input[type="text"],
.field-row .channel-dropdown { .field-row select#channel {
flex: 1 1 100% !important; flex: 1 1 100% !important;
min-width: 0 !important; min-width: 0 !important;
max-width: 100% !important; max-width: 100% !important;
@ -86,37 +86,18 @@ body.dimmed {
max-width: 100%; max-width: 100%;
min-width: 100%; min-width: 100%;
} }
}
/* Channel dropdown custom styling */ .graph-controls {
.channel-dropdown { flex-direction: column;
position: relative; align-items: stretch;
display: inline-block; }
}
.channel-dropdown summary { .graph-controls .field-group,
list-style: none; .graph-controls input,
cursor: pointer; .graph-controls select {
padding: 3px 4px; width: 100%;
background: ButtonFace; min-width: 0;
border: 1px solid; }
border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight;
min-width: 180px;
text-align: left;
}
.channel-dropdown summary::-webkit-details-marker {
display: none;
}
.channel-dropdown summary::after {
content: ' ▼';
font-size: 8px;
float: right;
}
.channel-dropdown[open] summary::after {
content: ' ▲';
} }
.toggle-row { .toggle-row {
@ -174,32 +155,6 @@ body.dimmed {
overflow-y: auto; overflow-y: auto;
} }
.channel-options {
position: absolute;
margin-top: 2px;
padding: 4px;
background: ButtonFace;
border: 1px solid;
border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight;
max-height: 300px;
overflow-y: auto;
box-shadow: 2px 2px 0 rgba(0, 0, 0, 0.2);
z-index: 100;
min-width: 220px;
}
.channel-option {
display: flex;
align-items: center;
gap: 6px;
margin-bottom: 4px;
font-size: 11px;
}
.channel-option:last-child {
margin-bottom: 0;
}
/* Layout helpers */ /* Layout helpers */
.summary-row { .summary-row {
display: flex; display: flex;
@ -218,6 +173,344 @@ body.dimmed {
min-width: 300px; min-width: 300px;
} }
/* --- Reference graph: window, controls, status line, container, modal --- */
.graph-window {
width: 95%;
}
/* NOTE(review): an earlier block in this stylesheet also styles .graph-controls (align-items: stretch; min-width: 0 on inputs), apparently inside a small-screen media query. These base rules have the same specificity and come later, so align-items and min-width here would win. Confirm the intended order. */
.graph-controls {
display: flex;
flex-wrap: wrap;
gap: 12px;
align-items: flex-end;
margin-bottom: 12px;
}
/* Each control stacks its label above the field. */
.graph-controls .field-group {
display: flex;
flex-direction: column;
gap: 4px;
}
.graph-controls label {
font-size: 11px;
font-weight: bold;
}
.graph-controls input,
.graph-controls select {
min-width: 160px;
}
.graph-status {
font-size: 11px;
margin-bottom: 8px;
color: #1f1f1f;
}
/* Error variant of the status line. */
.graph-status.error {
color: #b00020;
}
/* Drawing surface for the SVG graph. */
.graph-container {
background: Window;
border: 1px solid #919b9c;
box-shadow: inset -1px -1px #0a0a0a, inset 1px 1px #fff;
position: relative;
width: 100%;
min-height: 520px;
height: auto;
overflow: visible;
}
/* Full-viewport backdrop; hidden until the .active class is added. */
.graph-modal-overlay {
position: fixed;
inset: 0;
display: none;
align-items: center;
justify-content: center;
padding: 24px;
background: rgba(0, 0, 0, 0.35);
z-index: 2000;
}
.graph-modal-overlay.active {
display: flex;
}
/* 48px = the overlay's 24px padding on both sides. */
.graph-modal-window {
width: min(960px, 100%);
max-height: calc(100vh - 48px);
}
.graph-modal-window .window-body {
max-height: calc(100vh - 180px);
overflow-y: auto;
}
.graph-modal-window .graph-container {
height: 560px;
}
/* Prevents the page behind an open modal from scrolling. */
body.modal-open {
overflow: hidden;
}
/* --- Search result header: title block on the left, action buttons on the right --- */
.result-header {
display: flex;
justify-content: flex-start;
gap: 6px;
flex-wrap: wrap;
align-items: flex-start;
}
.result-header-main {
flex: 1 1 auto;
min-width: 220px;
}
/* margin-left: auto pushes the actions to the right edge of the row. */
.result-actions {
display: flex;
align-items: flex-start;
gap: 6px;
margin-left: auto;
}
.result-action-btn {
white-space: nowrap;
font-family: "Tahoma", "MS Sans Serif", sans-serif;
font-size: 11px;
padding: 4px 10px;
}
.result-meta {
display: flex;
align-items: center;
flex-wrap: wrap;
gap: 4px;
}
/* Small warning pill shown next to a result's metadata. */
.result-status {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 1px 6px;
border-radius: 3px;
font-size: 10px;
line-height: 1.3;
border: 1px solid #c4a3a3;
background: #fff6f6;
color: #6b1f1f;
}
/* Warning glyph prepended to every status pill. */
.result-status::before {
content: "⚠";
font-size: 10px;
line-height: 1;
}
.result-status--deleted {
border-color: #d1a6a6;
background: #fff8f8;
color: #6b1f1f;
}
.graph-launch-btn {
white-space: nowrap;
}
/* --- Graph node labels and legend --- */
/* Four offset white shadows form a halo so labels stay readable over links. */
.graph-node-label {
text-shadow: -1px -1px 0 #fff, 1px -1px 0 #fff, -1px 1px 0 #fff, 1px 1px 0 #fff;
}
.graph-nodes circle {
cursor: pointer;
}
.graph-legend {
margin: 12px 0;
font-size: 11px;
background: Window;
border: 1px solid #919b9c;
padding: 8px 10px;
display: inline-flex;
flex-direction: column;
gap: 4px;
box-shadow: inset -1px -1px #0a0a0a, inset 1px 1px #fff;
}
.graph-legend-section {
display: flex;
flex-direction: column;
gap: 4px;
}
.graph-legend-title {
font-weight: bold;
color: #1f1f1f;
}
.graph-legend-row {
display: flex;
align-items: center;
gap: 8px;
}
.graph-legend-swatch {
display: inline-block;
width: 18px;
height: 12px;
border: 1px solid #1f1f1f;
}
/* Edge swatch colors match the link stroke colors used by graph.js. */
.graph-legend-swatch--references {
background: #6c83c7;
}
.graph-legend-swatch--referenced {
background: #c76c6c;
}
.graph-legend-channel-list {
display: flex;
flex-wrap: wrap;
gap: 8px;
}
.graph-legend-channel {
display: flex;
align-items: center;
gap: 6px;
}
/* Channel swatch: the base color is set inline by graph.js; the modifier classes below overlay a hatch via background-image. */
.graph-legend-channel-swatch {
width: 14px;
height: 14px;
background-repeat: repeat;
background-position: 0 0;
background-size: 6px 6px;
}
.graph-legend-channel--none .graph-legend-channel-swatch {
background-image: none;
}
/* Diagonal stripes, 45deg. */
.graph-legend-channel--diag-forward .graph-legend-channel-swatch {
background-image: repeating-linear-gradient(
45deg,
rgba(0, 0, 0, 0.35) 0,
rgba(0, 0, 0, 0.35) 2px,
transparent 2px,
transparent 4px
);
background-blend-mode: multiply;
}
/* Diagonal stripes, -45deg. */
.graph-legend-channel--diag-back .graph-legend-channel-swatch {
background-image: repeating-linear-gradient(
-45deg,
rgba(0, 0, 0, 0.35) 0,
rgba(0, 0, 0, 0.35) 2px,
transparent 2px,
transparent 4px
);
background-blend-mode: multiply;
}
/* Cross-hatch: both diagonal directions layered. */
.graph-legend-channel--cross .graph-legend-channel-swatch {
background-image:
repeating-linear-gradient(
45deg,
rgba(0, 0, 0, 0.25) 0,
rgba(0, 0, 0, 0.25) 2px,
transparent 2px,
transparent 4px
),
repeating-linear-gradient(
-45deg,
rgba(0, 0, 0, 0.25) 0,
rgba(0, 0, 0, 0.25) 2px,
transparent 2px,
transparent 4px
);
background-blend-mode: multiply;
}
/* Dot pattern. */
.graph-legend-channel--dots .graph-legend-channel-swatch {
background-image: radial-gradient(rgba(0, 0, 0, 0.35) 30%, transparent 31%);
background-size: 6px 6px;
background-blend-mode: multiply;
}
.graph-legend-note {
font-size: 10px;
color: #555;
font-style: italic;
}
/* --- Title-bar link, About button, toggle help text, About panel --- */
/* Link styled as a raised button using system colors. */
.title-bar-link {
display: inline-block;
color: inherit;
text-decoration: none;
font-size: 11px;
padding: 2px 6px;
border: 1px solid;
border-color: ButtonHighlight ButtonShadow ButtonShadow ButtonHighlight;
background: ButtonFace;
}
.title-bar-controls #aboutBtn {
font-weight: bold;
font-size: 12px;
padding: 0 6px;
margin-right: 6px;
}
.toggle-item {
display: flex;
align-items: center;
gap: 6px;
}
.toggle-help {
font-size: 10px;
color: #555;
}
/* Floating panel pinned to the top-right corner; z-index 2100 is above the graph modal overlay's 2000. */
.about-panel {
position: fixed;
top: 20px;
right: 20px;
width: 280px;
background: Window;
border: 2px solid #919b9c;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.25);
z-index: 2100;
font-size: 11px;
}
.about-panel__header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 6px 8px;
background: #0055aa;
color: #fff;
}
.about-panel__body {
padding: 8px;
background: Window;
color: #000;
}
/* Borderless close button that inherits the header's text color. */
.about-panel__header button {
border: none;
background: transparent;
color: inherit;
font-weight: bold;
cursor: pointer;
}
/* Results styling */ /* Results styling */
#results .item { #results .item {
background: Window; background: Window;
@ -227,6 +520,7 @@ body.dimmed {
max-width: 100%; max-width: 100%;
overflow: hidden; overflow: hidden;
word-wrap: break-word; word-wrap: break-word;
box-sizing: border-box;
box-shadow: 2px 2px 0 rgba(0, 0, 0, 0.15); box-shadow: 2px 2px 0 rgba(0, 0, 0, 0.15);
} }
@ -243,7 +537,9 @@ body.dimmed {
.window-body { .window-body {
max-width: 100%; max-width: 100%;
overflow-x: hidden; overflow-x: hidden;
margin: 1rem; margin: 0;
padding: 1rem;
box-sizing: border-box;
} }
/* Badges */ /* Badges */
@ -267,6 +563,14 @@ body.dimmed {
word-break: keep-all; word-break: keep-all;
} }
/* Transcript source badges; the colors match the .highlight-source-indicator variants. */
.badge--transcript-primary {
background: #0b6efd;
}
.badge--transcript-secondary {
background: #8f4bff;
}
.badge-clickable { .badge-clickable {
cursor: pointer; cursor: pointer;
} }
@ -297,9 +601,14 @@ body.dimmed {
} }
.highlight-row { .highlight-row {
padding: 4px; padding: 4px 6px;
cursor: pointer; cursor: pointer;
border: 1px solid transparent; border: 1px solid transparent;
display: flex;
align-items: flex-start;
gap: 8px;
max-width: 100%;
box-sizing: border-box;
} }
.highlight-row:hover { .highlight-row:hover {
@ -308,6 +617,77 @@ body.dimmed {
border: 1px dotted WindowText; border: 1px dotted WindowText;
} }
/* --- Highlight rows, transcript source indicators, vector chunks --- */
/* Highlight text takes the remaining row width and may break anywhere. */
.highlight-text {
flex: 1 1 auto;
word-break: break-word;
overflow-wrap: anywhere;
}
/* Small colored square at the right end of a highlight row. */
.highlight-source-indicator {
width: 10px;
height: 10px;
border-radius: 2px;
border: 1px solid transparent;
margin-left: auto;
flex: 0 0 auto;
}
.highlight-source-indicator--primary {
background: #0b6efd;
border-color: #084bb5;
}
.highlight-source-indicator--secondary {
background: #8f4bff;
border-color: #5d2db3;
}
.vector-chunk {
margin-top: 8px;
padding: 8px;
background: #f3f7ff;
border: 1px solid #c7d0e2;
font-size: 11px;
line-height: 1.5;
word-break: break-word;
}
/* Narrow screens: stack the result header and highlight rows vertically. */
@media screen and (max-width: 640px) {
.result-header {
flex-direction: column;
gap: 6px;
}
.result-header-main {
flex: 1 1 auto;
min-width: 0;
width: 100%;
}
.result-actions {
width: auto;
align-self: flex-start;
justify-content: flex-start;
flex-wrap: wrap;
gap: 4px;
margin-left: 0;
}
.result-action-btn {
width: 100%;
text-align: left;
}
.highlight-row {
flex-direction: column;
gap: 4px;
}
.highlight-source-indicator {
align-self: flex-end;
}
}
mark { mark {
background: yellow; background: yellow;
color: black; color: black;
@ -334,6 +714,10 @@ mark {
border-bottom: 1px solid ButtonShadow; border-bottom: 1px solid ButtonShadow;
} }
/* Pale yellow background for a transcript segment carrying the --matched modifier. */
.transcript-segment--matched {
background: #fff6cc;
}
.transcript-segment:last-child { .transcript-segment:last-child {
border-bottom: none; border-bottom: none;
margin-bottom: 0; margin-bottom: 0;

46
static/vector.html Normal file
View File

@ -0,0 +1,46 @@
<!doctype html>
<!-- Vector search page: a single query box whose results are rendered by /static/vector.js. -->
<!-- lang is declared so assistive technology and browsers know the page language. -->
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>TLC Vector Search</title>
    <link rel="icon" href="/static/favicon.png" type="image/png" />
    <link rel="stylesheet" href="https://unpkg.com/xp.css" />
    <link rel="stylesheet" href="/static/style.css" />
  </head>
  <body>
    <div class="window" style="max-width: 1200px; margin: 20px auto;">
      <div class="title-bar">
        <div class="title-bar-text">Vector Search (Experimental)</div>
        <div class="title-bar-controls">
          <a class="title-bar-link" href="/">⬅ Back to Search</a>
        </div>
      </div>
      <div class="window-body">
        <p>Enter a natural language prompt; results come from the Qdrant vector index.</p>
        <fieldset>
          <legend>Vector Query</legend>
          <div class="field-row" style="margin-bottom: 8px;">
            <label for="vectorQuery" style="width: 60px;">Query:</label>
            <input id="vectorQuery" type="text" placeholder="Describe what you are looking for" style="flex: 1;" />
            <button id="vectorSearchBtn">Search</button>
          </div>
        </fieldset>
        <!-- The ids vectorMeta and vectorResults are looked up by vector.js. -->
        <div id="vectorMeta" style="margin-top: 12px; font-size: 11px;"></div>
        <fieldset style="margin-top: 16px;">
          <legend>Results</legend>
          <div id="vectorResults"></div>
        </fieldset>
      </div>
      <div class="status-bar">
        <p class="status-bar-field">Experimental mode • Qdrant</p>
      </div>
    </div>
    <script src="/static/vector.js"></script>
  </body>
</html>

423
static/vector.js Normal file
View File

@ -0,0 +1,423 @@
(() => {
const queryInput = document.getElementById("vectorQuery");
const searchBtn = document.getElementById("vectorSearchBtn");
const resultsDiv = document.getElementById("vectorResults");
const metaDiv = document.getElementById("vectorMeta");
const transcriptCache = new Map();
if (!queryInput || !searchBtn || !resultsDiv || !metaDiv) {
console.error("Vector search elements missing");
return;
}
/** Utility helpers **/
/**
 * Escape the five HTML-significant characters (& < > " ') in `str`.
 *
 * Falsy input yields "". Truthy non-string input (e.g. a number) is
 * converted with String() first instead of throwing on `.replace`.
 *
 * @param {*} str Value to escape.
 * @returns {string} Text safe to place in HTML content or quoted attributes.
 */
const escapeHtml = (str) => {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  const text = str ? String(str) : "";
  return text.replace(/[&<>"']/g, (ch) => entities[ch]);
};
/**
 * Return the part of `value` before the first "T" (the date portion of an
 * ISO-style timestamp). Falsy input gives ""; when the value cannot be
 * split, it is returned unchanged.
 */
const fmtDate = (value) => {
  try {
    const pieces = (value || "").split("T");
    return pieces[0];
  } catch {
    return value;
  }
};
/**
 * Format a similarity score with three decimals; anything that is not a
 * real number (including NaN) yields "".
 */
const fmtSimilarity = (score) => {
  const isUsable = typeof score === "number" && !Number.isNaN(score);
  return isUsable ? score.toFixed(3) : "";
};
/** Lower-cased `video_status` of `item`, or "" when the item or field is missing. */
const getVideoStatus = (item) => {
  if (!item || !item.video_status) {
    return "";
  }
  return String(item.video_status).toLowerCase();
};
/** True when the item's normalised status is exactly "deleted". */
const isLikelyDeleted = (item) => {
  const status = getVideoStatus(item);
  return status === "deleted";
};
/**
 * Format a duration in seconds as `m:ss`, or `h:mm:ss` from one hour up.
 *
 * Missing input (null, undefined, "", NaN) yields "00:00", as before.
 * Negative or non-finite values are clamped to zero instead of producing
 * output such as "-1:-5" or "Infinity:NaN:NaN".
 *
 * @param {number|string} seconds Offset in seconds; fractions are floored.
 * @returns {string} Formatted timestamp.
 */
const formatTimestamp = (seconds) => {
  if (!seconds && seconds !== 0) return "00:00";
  const numeric = Number(seconds);
  const total = Number.isFinite(numeric) && numeric > 0 ? numeric : 0;
  const hours = Math.floor(total / 3600);
  const mins = Math.floor((total % 3600) / 60);
  const secs = Math.floor(total % 60);
  const twoDigits = (part) => part.toString().padStart(2, "0");
  if (hours > 0) {
    return `${hours}:${twoDigits(mins)}:${twoDigits(secs)}`;
  }
  return `${mins}:${twoDigits(secs)}`;
};
/**
 * Produce a display timestamp for a transcript segment.
 *
 * A truthy `segment.timestamp` is returned as is. Otherwise the first of
 * `start_seconds`, `start`, `offset`, `time` that parses as a number is
 * formatted with `formatTimestamp`. Returns "" when nothing usable exists.
 */
const formatSegmentTimestamp = (segment) => {
  if (!segment) return "";
  if (segment.timestamp) return segment.timestamp;

  const candidates = [
    segment.start_seconds,
    segment.start,
    segment.offset,
    segment.time,
  ];
  const firstNumeric = candidates
    .filter((value) => value != null)
    .map((value) => parseFloat(value))
    .find((num) => !Number.isNaN(num));
  return firstNumeric === undefined ? "" : formatTimestamp(firstNumeric);
};
/**
 * Render one transcript section as plain text: the label on the first line
 * followed by the content and a trailing newline.
 *
 * A non-blank `fullText` wins; otherwise `parts` is rendered one segment per
 * line, prefixed with `[timestamp]` when a timestamp is available. Returns
 * "" when there is no content.
 */
const serializeTranscriptSection = (label, parts, fullText) => {
  const hasFullText = typeof fullText === "string" && fullText.trim();
  let body = "";
  if (hasFullText) {
    body = fullText.trim();
  } else if (Array.isArray(parts) && parts.length) {
    const rendered = [];
    for (const segment of parts) {
      const stamp = formatSegmentTimestamp(segment);
      const text = segment && segment.text ? segment.text : "";
      rendered.push(stamp ? `[${stamp}] ${text}` : text);
    }
    body = rendered.join("\n").trim();
  }
  return body ? `${label}\n${body}\n` : "";
};
/**
 * Fetch the transcript payload for `videoId` from `/api/transcript`,
 * serving repeat requests from `transcriptCache`.
 *
 * @returns {Promise<Object|null>} Parsed payload, or null for a falsy ID.
 * @throws {Error} When the response status is not OK.
 */
const fetchTranscriptData = async (videoId) => {
  if (!videoId) return null;
  if (!transcriptCache.has(videoId)) {
    const endpoint = `/api/transcript?video_id=${encodeURIComponent(videoId)}`;
    const res = await fetch(endpoint);
    if (!res.ok) {
      throw new Error(`Transcript fetch failed (${res.status})`);
    }
    const payload = await res.json();
    transcriptCache.set(videoId, payload);
  }
  return transcriptCache.get(videoId);
};
/**
 * Build the plain-text transcript download: a metadata header, a blank line,
 * then the primary and secondary transcript sections (or a placeholder when
 * neither has content). The result always ends with exactly one newline.
 */
const buildTranscriptDownloadText = (item, transcriptData) => {
  const header = [`Title: ${item.title || "Untitled"}`];
  const optionalFields = [
    ["Channel", item.channel_name],
    ["Published", item.date],
    ["URL", item.url],
  ];
  optionalFields.forEach(([caption, value]) => {
    if (value) header.push(`${caption}: ${value}`);
  });

  const primaryText = serializeTranscriptSection(
    "Primary Transcript",
    transcriptData.transcript_parts,
    transcriptData.transcript_full
  );
  const secondaryText = serializeTranscriptSection(
    "Secondary Transcript",
    transcriptData.transcript_secondary_parts,
    transcriptData.transcript_secondary_full
  );
  const sections = [primaryText, secondaryText].filter(Boolean);
  const body = sections.length ? sections : ["No transcript available."];

  return [...header, "", ...body].join("\n").trim() + "\n";
};
/**
 * Temporarily replace a button's label with `message`, restoring it after
 * `duration` milliseconds. The first label seen is remembered in
 * `data-original-label` so repeated flashes restore the real label rather
 * than a previous flash message.
 */
const flashButtonMessage = (button, message, duration = 1800) => {
  if (!button) return;
  const rememberedLabel = button.dataset.originalLabel || button.textContent;
  button.dataset.originalLabel = rememberedLabel;
  button.textContent = message;
  const restoreLabel = () => {
    button.textContent = button.dataset.originalLabel || rememberedLabel;
  };
  setTimeout(restoreLabel, duration);
};
/**
 * Click handler for "Download transcript": fetches the transcript for
 * `item.video_id`, serialises it to text and triggers a browser download
 * named `<video_id>.txt`.
 *
 * The button is disabled while the request is in flight and always
 * re-enabled afterwards. Failures are logged and reported with an alert.
 *
 * @param {object} item   Search result; must carry `video_id`.
 * @param {HTMLButtonElement} button  The button that triggered the download.
 */
const handleTranscriptDownload = async (item, button) => {
  if (!item || !item.video_id) return;
  if (button) button.disabled = true;
  try {
    const transcriptData = await fetchTranscriptData(item.video_id);
    if (!transcriptData) throw new Error("Transcript unavailable");
    const text = buildTranscriptDownloadText(item, transcriptData);
    // Declare the charset so non-ASCII transcripts are not mis-decoded.
    const blob = new Blob([text], { type: "text/plain;charset=utf-8" });
    const url = URL.createObjectURL(blob);
    const link = document.createElement("a");
    link.href = url;
    link.download = `${item.video_id}.txt`;
    document.body.appendChild(link);
    try {
      link.click();
    } finally {
      // Always clean up, even if click() throws, so neither the temporary
      // anchor nor the object URL leaks.
      document.body.removeChild(link);
      // Revoke on the next task: revoking synchronously can cancel the
      // download in browsers that start it asynchronously.
      setTimeout(() => URL.revokeObjectURL(url), 0);
    }
    flashButtonMessage(button, "Downloaded");
  } catch (err) {
    console.error("Download failed", err);
    alert("Unable to download transcript right now.");
  } finally {
    if (button) button.disabled = false;
  }
};
/**
 * Format a date in MLA style, e.g. "15 Jan. 2024".
 *
 * - Falsy input returns "n.d." (no date).
 * - Strings that start with an ISO calendar date (`YYYY-MM-DD`, optionally
 *   followed by a time) are rendered from the calendar date as written.
 *   `new Date("2024-01-15")` is parsed as UTC midnight, so reading it back
 *   with local getters shows the previous day in timezones west of UTC.
 * - Any other value is parsed with `new Date(value)` and rendered in local
 *   time; unparseable values are returned unchanged.
 *
 * @param {string|number|Date} value
 * @returns {string}
 */
const formatMlaDate = (value) => {
  if (!value) return "n.d.";
  const months = [
    "Jan.", "Feb.", "Mar.", "Apr.", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
  ];
  const render = (day, monthIndex, year) => `${day} ${months[monthIndex]} ${year}`;

  if (typeof value === "string") {
    const isoDate = /^(\d{4})-(\d{2})-(\d{2})(?:$|[T\s])/.exec(value.trim());
    if (isoDate) {
      const year = Number(isoDate[1]);
      const month = Number(isoDate[2]);
      const day = Number(isoDate[3]);
      if (month >= 1 && month <= 12 && day >= 1 && day <= 31) {
        // Build and read the date entirely in UTC so the local timezone
        // offset can never shift the calendar day.
        const utc = new Date(Date.UTC(year, month - 1, day));
        return render(utc.getUTCDate(), utc.getUTCMonth(), utc.getUTCFullYear());
      }
    }
  }

  const parsed = new Date(value);
  if (Number.isNaN(parsed.valueOf())) return value;
  return render(parsed.getDate(), parsed.getMonth(), parsed.getFullYear());
};
/**
 * Build an MLA-style citation for a YouTube video result.
 *
 * The channel falls back to `channel_id` and then "Unknown"; the title falls
 * back to "Untitled". The URL segment is omitted when the item has no URL so
 * the citation never contains a dangling ", .".
 *
 * @param {object} item  Search result (`channel_name`, `channel_id`,
 *                       `title`, `url`, `date` are read).
 * @returns {string}
 */
const buildMlaCitation = (item) => {
  // String() guards against numeric ids/titles, which have no .trim().
  const channel = String(item.channel_name || item.channel_id || "Unknown").trim();
  const title = String(item.title || "Untitled").trim();
  const url = item.url || "";
  const publishDate = formatMlaDate(item.date);
  // Pass a Date so the access date is the reader's local "today". The UTC
  // date string from toISOString() can name a different calendar day.
  const accessed = formatMlaDate(new Date());
  const location = url ? `${publishDate}, ${url}` : publishDate;
  return `${channel}. "${title}." YouTube, uploaded by ${channel}, ${location}. Accessed ${accessed}.`;
};
/**
 * Click handler for "Copy citation": copies the MLA citation for `item` to
 * the clipboard and flashes "Copied!" on the button.
 *
 * Uses the async Clipboard API in secure contexts and falls back to a hidden
 * textarea with `document.execCommand("copy")` elsewhere. If copying fails
 * the citation is shown in an alert so the user can copy it manually.
 *
 * @param {object} item
 * @param {HTMLButtonElement} button
 */
const handleCopyCitation = async (item, button) => {
  const citation = buildMlaCitation(item);
  try {
    if (navigator.clipboard && window.isSecureContext) {
      await navigator.clipboard.writeText(citation);
    } else {
      const textarea = document.createElement("textarea");
      textarea.value = citation;
      textarea.style.position = "fixed";
      textarea.style.opacity = "0";
      document.body.appendChild(textarea);
      let copied = false;
      try {
        textarea.select();
        // execCommand signals failure by returning false, not by throwing.
        copied = document.execCommand("copy");
      } finally {
        // Remove the helper element even when select()/execCommand throws.
        document.body.removeChild(textarea);
      }
      if (!copied) throw new Error("Copy command was rejected");
    }
    flashButtonMessage(button, "Copied!");
  } catch (err) {
    console.error("Citation copy failed", err);
    alert(citation);
  }
};
/** Rendering helpers **/
/**
 * Build the highlight list for a result, one row per entry.
 * Returns null for a missing or empty `entries` array. Falsy entries are
 * skipped. Each row shows `entry.html`, else `entry.text`, else the entry
 * itself.
 * NOTE: the content is assigned through `innerHTML`, so entries must already
 * be safe markup by the time they reach this function.
 */
const createHighlightRows = (entries) => {
  if (!Array.isArray(entries) || entries.length === 0) return null;

  const list = document.createElement("div");
  list.className = "transcript highlight-list";

  for (const entry of entries) {
    if (!entry) continue;

    const textBlock = document.createElement("div");
    textBlock.className = "highlight-text";
    const markup = entry.html || entry.text || entry;
    textBlock.innerHTML = markup || "";

    const indicator = document.createElement("span");
    indicator.className = "highlight-source-indicator highlight-source-indicator--primary";
    indicator.title = "Vector highlight";

    const row = document.createElement("div");
    row.className = "highlight-row";
    row.appendChild(textBlock);
    row.appendChild(indicator);
    list.appendChild(row);
  }
  return list;
};
/**
 * Build the action bar for a result card: "Download transcript",
 * "Copy citation" and "Graph". The Graph button opens
 * `/graph?video_id=…` in a new tab and is disabled when the item has no
 * `video_id`.
 */
const createActions = (item) => {
  const bar = document.createElement("div");
  bar.className = "result-actions";

  // Create a button, wire its click handler (which receives the button
  // itself) and append it to the bar.
  const addButton = (label, className, onClick) => {
    const btn = document.createElement("button");
    btn.type = "button";
    btn.className = className;
    btn.textContent = label;
    btn.addEventListener("click", () => onClick(btn));
    bar.appendChild(btn);
    return btn;
  };

  addButton("Download transcript", "result-action-btn", (btn) =>
    handleTranscriptDownload(item, btn)
  );
  addButton("Copy citation", "result-action-btn", (btn) =>
    handleCopyCitation(item, btn)
  );
  const graphBtn = addButton("Graph", "result-action-btn graph-launch-btn", () => {
    if (!item.video_id) return;
    const target = `/graph?video_id=${encodeURIComponent(item.video_id)}`;
    window.open(target, "_blank", "noopener");
  });
  graphBtn.disabled = !item.video_id;

  return bar;
};
/**
 * Render a vector-search response into `resultsDiv` (one card per item) and
 * summarise the outcome in `metaDiv`.
 *
 * Each card shows the title, a meta line (channel, date, duration joined by
 * a bullet), an optional "Likely deleted" badge, a YouTube link, the
 * similarity score, the description, the matched chunk with a deep link to
 * its timestamp, and any highlight rows.
 *
 * @param {{items?: object[]}} payload  Parsed `/api/vector-search` response.
 */
const renderVectorResults = (payload) => {
  const META_SEPARATOR = " • ";

  // Accept a numeric duration or a numeric string; anything else is unknown.
  const parseDurationSeconds = (raw) => {
    if (typeof raw === "number") return raw;
    if (typeof raw === "string" && raw.trim()) {
      const parsed = parseFloat(raw);
      if (!Number.isNaN(parsed)) return parsed;
    }
    return null;
  };

  // Normalise a chunk timestamp into a display label and a value for the
  // `t=` URL parameter (null when no usable offset is known). Returns null
  // when there is no timestamp at all.
  const resolveChunkTimestamp = (raw) => {
    if (typeof raw === "number") {
      if (!Number.isFinite(raw)) return null;
      return { label: formatTimestamp(raw), param: Math.floor(raw) };
    }
    if (typeof raw === "string") {
      const trimmed = raw.trim();
      if (!trimmed) return null;
      if (/^\d+(\.\d+)?$/.test(trimmed)) {
        const secs = parseFloat(trimmed);
        return { label: formatTimestamp(secs), param: Math.floor(secs) };
      }
      // Pre-formatted timestamps are shown and passed through unchanged.
      return { label: raw, param: raw };
    }
    if (raw && typeof raw === "object") {
      const label = formatSegmentTimestamp(raw);
      const candidates = [raw.start_seconds, raw.start, raw.offset, raw.time];
      for (const candidate of candidates) {
        if (candidate == null) continue;
        const secs = parseFloat(candidate);
        if (!Number.isNaN(secs)) return { label, param: Math.floor(secs) };
      }
      // No numeric offset: link to the video without a `t=` parameter
      // rather than encoding "[object Object]".
      return { label, param: null };
    }
    return null;
  };

  const withTimeParam = (url, value) => {
    if (value == null) return url;
    const joiner = url.includes("?") ? "&" : "?";
    return `${url}${joiner}t=${encodeURIComponent(value)}`;
  };

  resultsDiv.innerHTML = "";
  const items = (payload && payload.items) || [];
  if (!items.length) {
    metaDiv.textContent = "No vector matches for this prompt.";
    return;
  }
  metaDiv.textContent = `Matches: ${items.length} (vector mode)`;

  items.forEach((item) => {
    const el = document.createElement("div");
    el.className = "item";

    const header = document.createElement("div");
    header.className = "result-header";
    const headerMain = document.createElement("div");
    headerMain.className = "result-header-main";

    const titleEl = document.createElement("strong");
    // titleHtml is inserted as markup; the plain title is escaped first.
    titleEl.innerHTML = item.titleHtml || escapeHtml(item.title || "Untitled");
    headerMain.appendChild(titleEl);

    const metaLine = document.createElement("div");
    metaLine.className = "muted result-meta";
    const channelLabel = item.channel_name || item.channel_id || "Unknown";
    const dateLabel = fmtDate(item.date);
    const durationSeconds = parseDurationSeconds(item.duration);
    const durationLabel = durationSeconds != null ? formatTimestamp(durationSeconds) : "";
    // Join only the labels that exist so they never run together and no
    // separator is left dangling.
    metaLine.textContent = [channelLabel, dateLabel, durationLabel]
      .filter(Boolean)
      .join(META_SEPARATOR);
    if (isLikelyDeleted(item)) {
      metaLine.appendChild(document.createTextNode(" "));
      const statusEl = document.createElement("span");
      statusEl.className = "result-status result-status--deleted";
      statusEl.textContent = "Likely deleted";
      metaLine.appendChild(statusEl);
    }
    headerMain.appendChild(metaLine);

    if (item.url) {
      const linkLine = document.createElement("div");
      linkLine.className = "muted";
      const anchor = document.createElement("a");
      anchor.href = item.url;
      anchor.target = "_blank";
      anchor.rel = "noopener";
      anchor.textContent = "Open on YouTube";
      linkLine.appendChild(anchor);
      headerMain.appendChild(linkLine);
    }

    if (typeof item.distance === "number") {
      const scoreLine = document.createElement("div");
      scoreLine.className = "muted";
      scoreLine.textContent = `Similarity score: ${fmtSimilarity(item.distance)}`;
      headerMain.appendChild(scoreLine);
    }

    header.appendChild(headerMain);
    header.appendChild(createActions(item));
    el.appendChild(header);

    if (item.descriptionHtml || item.description) {
      const desc = document.createElement("div");
      desc.className = "muted description-block";
      desc.innerHTML = item.descriptionHtml || escapeHtml(item.description);
      el.appendChild(desc);
    }

    if (item.chunkText) {
      const chunkBlock = document.createElement("div");
      chunkBlock.className = "vector-chunk";
      const stamp = resolveChunkTimestamp(item.chunkTimestamp);
      if (stamp && item.url) {
        const tsLink = document.createElement("a");
        tsLink.href = withTimeParam(item.url, stamp.param);
        tsLink.target = "_blank";
        tsLink.rel = "noopener";
        tsLink.textContent = stamp.label ? `[${stamp.label}]` : "[timestamp]";
        chunkBlock.appendChild(tsLink);
        chunkBlock.appendChild(document.createTextNode(" "));
      }
      const chunkTextSpan = document.createElement("span");
      chunkTextSpan.textContent = item.chunkText;
      chunkBlock.appendChild(chunkTextSpan);
      el.appendChild(chunkBlock);
    }

    const highlights = createHighlightRows(item.toHighlight);
    if (highlights) {
      el.appendChild(highlights);
    }
    resultsDiv.appendChild(el);
  });
};
/** Search handler **/
/**
 * Run a vector search for the text in `queryInput` and render the results.
 *
 * POSTs `{ query }` to `/api/vector-search`. While the request is in flight
 * the search button is disabled; that flag is also used to ignore re-entrant
 * calls (e.g. repeated Enter presses) so overlapping responses cannot render
 * out of order. Any failure shows "Vector search unavailable." in `metaDiv`.
 */
const runVectorSearch = async () => {
  // A disabled button means a search is already running.
  if (searchBtn.disabled) return;
  const query = queryInput.value.trim();
  if (!query) {
    alert("Please enter a query.");
    return;
  }
  metaDiv.textContent = "Searching vector index…";
  resultsDiv.innerHTML = "";
  searchBtn.disabled = true;
  try {
    const res = await fetch("/api/vector-search", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ query }),
    });
    if (!res.ok) {
      throw new Error(`Vector search failed (${res.status})`);
    }
    const data = await res.json();
    if (data.error) {
      // Keep the backend's reason in the console; the UI stays generic.
      console.error("Vector search error:", data.error);
      metaDiv.textContent = "Vector search unavailable.";
      return;
    }
    renderVectorResults(data);
  } catch (err) {
    console.error(err);
    metaDiv.textContent = "Vector search unavailable.";
  } finally {
    searchBtn.disabled = false;
  }
};
// Wire the search triggers: button click and Enter inside the query box.
searchBtn.addEventListener("click", runVectorSearch);
// `keydown` replaces the deprecated `keypress` event.
queryInput.addEventListener("keydown", (event) => {
  // Ignore Enter while an IME composition is being confirmed, and the
  // auto-repeat events fired while the key is held down.
  if (event.key !== "Enter" || event.isComposing || event.repeat) return;
  runVectorSearch();
});
})();

188
sync_qdrant_channels.py Normal file
View File

@ -0,0 +1,188 @@
"""
Utility to backfill channel titles/names inside the Qdrant payloads.

Usage:
    python -m python_app.sync_qdrant_channels \
        --batch-size 512 \
        --max-points 10000 \
        --dry-run
"""
from __future__ import annotations
import argparse
import logging
from typing import Dict, Iterable, List, Optional, Set, Tuple
import time
import requests
from .config import CONFIG
from .search_app import _ensure_client
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
def chunked(iterable: Iterable, size: int):
    """Yield successive lists of up to ``size`` items taken from ``iterable``.

    Every chunk except possibly the last holds ``size`` items; a final,
    shorter chunk carries the remainder. Nothing is yielded for an empty
    iterable.
    """
    pending: List = []
    for element in iterable:
        pending.append(element)
        if len(pending) < size:
            continue
        yield pending
        # Start a fresh list so chunks already handed out are not mutated.
        pending = []
    if pending:
        yield pending
def resolve_channels(channel_ids: Iterable[str]) -> Dict[str, str]:
    """Look up channel names for the given channel ids in Elasticsearch.

    Args:
        channel_ids: Channel ids to resolve; duplicates are ignored.

    Returns:
        Mapping of channel id to channel name. Ids for which no document
        with a non-empty ``channel_name`` exists are omitted.
    """
    # De-duplicate while keeping a stable order for the terms query.
    ids = list(dict.fromkeys(channel_ids))
    if not ids:
        return {}
    client = _ensure_client(CONFIG)
    body = {
        # Collapsing returns at most one hit per channel, so one slot per id
        # is enough. Without it a channel with many documents could fill the
        # whole page and leave the other channels unresolved.
        "size": len(ids),
        "_source": ["channel_id", "channel_name"],
        "query": {
            "bool": {
                "filter": [
                    {"terms": {"channel_id.keyword": ids}},
                    # Skip documents that cannot supply a name at all.
                    {"exists": {"field": "channel_name"}},
                ]
            }
        },
        "collapse": {"field": "channel_id.keyword"},
    }
    response = client.search(index=CONFIG.elastic.index, body=body)
    resolved: Dict[str, str] = {}
    for hit in response.get("hits", {}).get("hits", []):
        source = hit.get("_source") or {}
        cid = source.get("channel_id")
        cname = source.get("channel_name")
        if cid and cname and cid not in resolved:
            resolved[cid] = cname
    return resolved
def upsert_channel_payload(
    qdrant_url: str,
    collection: str,
    channel_id: str,
    channel_name: str,
    *,
    dry_run: bool = False,
) -> bool:
    """Set channel_name/channel_title for all vectors with this channel_id.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        collection: Name of the collection to update.
        channel_id: Payload value used to select the points to update.
        channel_name: Name written to both ``channel_name`` and
            ``channel_title``.
        dry_run: When true, only log the intended update and report success.

    Returns:
        True when the update was accepted (or skipped by ``dry_run``), False
        when the request failed or Qdrant answered with an error status.
    """
    payload = {"channel_name": channel_name, "channel_title": channel_name}
    body = {
        "payload": payload,
        "filter": {"must": [{"key": "channel_id", "match": {"value": channel_id}}]},
    }
    LOGGER.info("Updating channel_id=%s -> %s", channel_id, channel_name)
    if dry_run:
        return True
    # Tolerate a trailing slash in the configured base URL.
    endpoint = f"{qdrant_url.rstrip('/')}/collections/{collection}/points/payload"
    try:
        resp = requests.post(endpoint, json=body, timeout=120)
    except requests.RequestException as exc:
        # Report network failures through the boolean result, like HTTP
        # errors, so one unreachable call does not abort the whole backfill.
        LOGGER.error("Failed to update %s: %s", channel_id, exc)
        return False
    if resp.status_code >= 400:
        LOGGER.error("Failed to update %s: %s", channel_id, resp.text)
        return False
    return True
def scroll_missing_payloads(
    qdrant_url: str,
    collection: str,
    batch_size: int,
    *,
    max_points: Optional[int] = None,
    max_retries: int = 5,
) -> Iterable[List[Tuple[str, Dict[str, object]]]]:
    """Yield batches of (point_id, payload) missing channel names.

    Pages through the collection with Qdrant's scroll endpoint, selecting
    points whose ``channel_name`` payload field is empty.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        collection: Name of the collection to scan.
        batch_size: Number of points requested per page. After an HTTP error
            the page size is halved (never below 5) and the page retried.
        max_points: Stop once at least this many points have been yielded;
            ``None`` or 0 means no limit. The last batch is not truncated.
        max_retries: How many consecutive transient (non-HTTP) request
            failures are retried per page before the error is re-raised.

    Yields:
        Lists of ``(point_id, payload)`` tuples, one list per page.

    Raises:
        requests.HTTPError: A page still fails at the minimum page size.
        requests.RequestException: Transient failures exceed ``max_retries``.
    """
    fetched = 0
    next_page = None
    while True:
        current_limit = batch_size
        transient_failures = 0
        while True:
            body = {
                "limit": current_limit,
                "with_payload": True,
                "filter": {"must": [{"is_empty": {"key": "channel_name"}}]},
            }
            if next_page is not None:
                body["offset"] = next_page
            try:
                resp = requests.post(
                    f"{qdrant_url}/collections/{collection}/points/scroll",
                    json=body,
                    timeout=120,
                )
                resp.raise_for_status()
                break
            except requests.HTTPError as exc:
                LOGGER.warning(
                    "Scroll request failed at limit=%s: %s", current_limit, exc
                )
                if current_limit <= 5:
                    raise
                current_limit = max(5, current_limit // 2)
                LOGGER.info("Reducing scroll batch size to %s", current_limit)
                time.sleep(2)
            except requests.RequestException as exc:  # type: ignore[attr-defined]
                transient_failures += 1
                LOGGER.warning("Transient scroll error: %s", exc)
                # Bound the retries so a permanently unreachable server
                # fails the run instead of looping forever.
                if transient_failures > max_retries:
                    raise
                time.sleep(2)
        # `result` may be missing or null; treat both as an empty page.
        payload = resp.json().get("result") or {}
        points = payload.get("points", [])
        if not points:
            break
        batch: List[Tuple[str, Dict[str, object]]] = [
            (point.get("id"), point.get("payload") or {}) for point in points
        ]
        yield batch
        fetched += len(points)
        if max_points and fetched >= max_points:
            break
        next_page = payload.get("next_page_offset")
        # The scroll is finished when no further offset is returned.
        if next_page is None:
            break
def main() -> None:
    """Backfill ``channel_name``/``channel_title`` in Qdrant from Elasticsearch.

    Scrolls over points whose ``channel_name`` is empty, resolves their
    channel ids to names and writes each name back with one filtered payload
    update per channel. With ``--dry-run`` the updates are only logged.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    parser = argparse.ArgumentParser(
        description="Backfill missing channel_name/channel_title in Qdrant payloads"
    )
    parser.add_argument("--batch-size", type=int, default=512)
    parser.add_argument(
        "--max-points",
        type=int,
        default=None,
        help="Limit processing to the first N points for testing",
    )
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    q_url = CONFIG.qdrant_url
    collection = CONFIG.qdrant_collection
    total_updates = 0
    # Each update rewrites every vector of a channel, so a channel only needs
    # handling once per run, however many batches its points appear in.
    handled: Set[str] = set()
    for batch in scroll_missing_payloads(
        q_url, collection, args.batch_size, max_points=args.max_points
    ):
        channel_ids: Set[str] = {
            str(payload["channel_id"])
            for _, payload in batch
            if payload.get("channel_id")
        }
        pending = channel_ids - handled
        if not pending:
            continue
        resolved = resolve_channels(pending)
        unresolved = pending - set(resolved)
        if unresolved:
            LOGGER.warning("No channel names resolved for ids: %s", sorted(unresolved))
            # Do not query Elasticsearch again for ids it cannot resolve.
            handled |= unresolved
        for cid, name in resolved.items():
            if upsert_channel_payload(
                q_url, collection, cid, name, dry_run=args.dry_run
            ):
                total_updates += 1
                # Failed updates stay eligible for a retry on a later batch.
                handled.add(cid)
        LOGGER.info("Updated %s channel payloads so far", total_updates)
    LOGGER.info("Finished. Total channel updates attempted: %s", total_updates)
# Run the backfill only when executed as a script or via `python -m`,
# not when this module is imported.
if __name__ == "__main__":
    main()