Disable vector search
This commit is contained in:
parent
8e4c57a93a
commit
6a3d1ee491
@ -4,4 +4,3 @@ youtube-transcript-api>=0.6
|
|||||||
google-api-python-client>=2.0.0
|
google-api-python-client>=2.0.0
|
||||||
python-dotenv>=0.19.0
|
python-dotenv>=0.19.0
|
||||||
requests>=2.31.0
|
requests>=2.31.0
|
||||||
sentence-transformers>=2.7.0
|
|
||||||
|
|||||||
208
search_app.py
208
search_app.py
@ -4,10 +4,8 @@ Flask application exposing search, graph, and transcript endpoints for TLC.
|
|||||||
Routes:
|
Routes:
|
||||||
GET / -> static HTML search page.
|
GET / -> static HTML search page.
|
||||||
GET /graph -> static reference graph UI.
|
GET /graph -> static reference graph UI.
|
||||||
GET /vector-search -> experimental Qdrant vector search UI.
|
|
||||||
GET /api/channels -> channels aggregation.
|
GET /api/channels -> channels aggregation.
|
||||||
GET /api/search -> Elasticsearch keyword search.
|
GET /api/search -> Elasticsearch keyword search.
|
||||||
POST /api/vector-search -> Qdrant vector similarity query.
|
|
||||||
GET /api/graph -> reference graph API.
|
GET /api/graph -> reference graph API.
|
||||||
GET /api/transcript -> transcript JSON payload.
|
GET /api/transcript -> transcript JSON payload.
|
||||||
"""
|
"""
|
||||||
@ -27,13 +25,6 @@ from datetime import datetime
|
|||||||
|
|
||||||
from flask import Flask, jsonify, request, send_from_directory
|
from flask import Flask, jsonify, request, send_from_directory
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
try:
|
|
||||||
from sentence_transformers import SentenceTransformer # type: ignore
|
|
||||||
except ImportError: # pragma: no cover - optional dependency
|
|
||||||
SentenceTransformer = None
|
|
||||||
|
|
||||||
from .config import CONFIG, AppConfig
|
from .config import CONFIG, AppConfig
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -44,14 +35,11 @@ except ImportError: # pragma: no cover - dependency optional
|
|||||||
BadRequestError = Exception # type: ignore
|
BadRequestError = Exception # type: ignore
|
||||||
|
|
||||||
LOGGER = logging.getLogger(__name__)
|
LOGGER = logging.getLogger(__name__)
|
||||||
_EMBED_MODEL = None
|
|
||||||
_EMBED_MODEL_NAME: Optional[str] = None
|
|
||||||
|
|
||||||
# Security constants
|
# Security constants
|
||||||
MAX_QUERY_SIZE = 100
|
MAX_QUERY_SIZE = 100
|
||||||
MAX_OFFSET = 10000
|
MAX_OFFSET = 10000
|
||||||
ALLOWED_QDRANT_FILTER_FIELDS = {"channel_id", "date", "video_status", "external_reference"}
|
DEFAULT_ELASTIC_TIMEOUT = int(os.environ.get("ELASTIC_TIMEOUT_SECONDS", "30"))
|
||||||
|
|
||||||
|
|
||||||
def sanitize_query_string(query: str) -> str:
|
def sanitize_query_string(query: str) -> str:
|
||||||
"""
|
"""
|
||||||
@ -74,47 +62,6 @@ def sanitize_query_string(query: str) -> str:
|
|||||||
return sanitized.strip() or "*"
|
return sanitized.strip() or "*"
|
||||||
|
|
||||||
|
|
||||||
def validate_qdrant_filter(filters: Any) -> Dict[str, Any]:
    """
    Return a copy of ``filters`` restricted to whitelisted field names.

    Anything that is not a dict yields an empty dict. Keys that are not in
    ``ALLOWED_QDRANT_FILTER_FIELDS`` are dropped; the values of the keys
    that remain are passed through unchanged.
    """
    if isinstance(filters, dict):
        return {
            field: condition
            for field, condition in filters.items()
            if field in ALLOWED_QDRANT_FILTER_FIELDS
        }
    return {}
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_embedder(model_name: str) -> "SentenceTransformer":
    """
    Return the cached embedding model for ``model_name``, loading it on demand.

    The model is (re)instantiated whenever nothing is cached yet or the
    cached instance was created under a different model name.

    Raises:
        RuntimeError: if ``SentenceTransformer`` is ``None`` (the optional
            import did not succeed).
    """
    global _EMBED_MODEL, _EMBED_MODEL_NAME
    if SentenceTransformer is None:  # pragma: no cover - optional dependency
        raise RuntimeError(
            "sentence-transformers is required for vector search. Install via pip install sentence-transformers."
        )
    cache_is_current = _EMBED_MODEL is not None and _EMBED_MODEL_NAME == model_name
    if not cache_is_current:
        LOGGER.info("Loading embedding model: %s", model_name)
        _EMBED_MODEL = SentenceTransformer(model_name)
        _EMBED_MODEL_NAME = model_name
    return _EMBED_MODEL
|
|
||||||
|
|
||||||
|
|
||||||
def embed_query(text: str, *, model_name: str, expected_dim: int) -> List[float]:
    """
    Embed ``text`` as a search query and return the vector as a list.

    The text is prefixed with ``"query: "`` before encoding, and the encoder
    is asked for normalized embeddings.

    Raises:
        RuntimeError: if the vector length differs from ``expected_dim``
            (or if the embedder itself cannot be obtained).
    """
    model = _ensure_embedder(model_name)
    prompt = f"query: {text}"
    embeddings = model.encode(
        [prompt],
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    vector = embeddings[0].tolist()
    if len(vector) != expected_dim:
        raise RuntimeError(
            f"Embedding dimension mismatch (expected {expected_dim}, got {len(vector)})"
        )
    return vector
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_client(config: AppConfig) -> "Elasticsearch":
|
def _ensure_client(config: AppConfig) -> "Elasticsearch":
|
||||||
if Elasticsearch is None:
|
if Elasticsearch is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@ -286,7 +233,7 @@ def elastic_metrics_payload(
|
|||||||
"Elasticsearch metrics request: %s",
|
"Elasticsearch metrics request: %s",
|
||||||
json.dumps({"index": index, "body": body}, indent=2),
|
json.dumps({"index": index, "body": body}, indent=2),
|
||||||
)
|
)
|
||||||
response = client.search(index=index, body=body)
|
response = client.search(index=index, body=body, request_timeout=30)
|
||||||
break
|
break
|
||||||
except BadRequestError as exc:
|
except BadRequestError as exc:
|
||||||
last_error = exc
|
last_error = exc
|
||||||
@ -857,7 +804,7 @@ def build_full_graph_payload(
|
|||||||
scroll_id: Optional[str] = None
|
scroll_id: Optional[str] = None
|
||||||
try:
|
try:
|
||||||
body = {"query": query, "_source": source_fields, "sort": ["_doc"]}
|
body = {"query": query, "_source": source_fields, "sort": ["_doc"]}
|
||||||
response = client.search(index=index, body=body, size=batch_size, scroll="1m")
|
response = client.search(index=index, body=body, size=batch_size, scroll="1m", request_timeout=60)
|
||||||
scroll_id = response.get("_scroll_id")
|
scroll_id = response.get("_scroll_id")
|
||||||
stop_fetch = False
|
stop_fetch = False
|
||||||
while not stop_fetch:
|
while not stop_fetch:
|
||||||
@ -957,11 +904,6 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
|
|||||||
|
|
||||||
client = _ensure_client(config)
|
client = _ensure_client(config)
|
||||||
index = config.elastic.index
|
index = config.elastic.index
|
||||||
qdrant_url = config.qdrant_url
|
|
||||||
qdrant_collection = config.qdrant_collection
|
|
||||||
qdrant_vector_name = config.qdrant_vector_name
|
|
||||||
qdrant_vector_size = config.qdrant_vector_size
|
|
||||||
qdrant_embed_model = config.qdrant_embed_model
|
|
||||||
|
|
||||||
@app.route("/")
|
@app.route("/")
|
||||||
def index_page():
|
def index_page():
|
||||||
@ -971,10 +913,6 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
|
|||||||
def graph_page():
|
def graph_page():
|
||||||
return send_from_directory(app.static_folder, "graph.html")
|
return send_from_directory(app.static_folder, "graph.html")
|
||||||
|
|
||||||
@app.route("/vector-search")
def vector_search_page():
    """Serve ``vector.html`` from the application's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "vector.html")
|
|
||||||
|
|
||||||
@app.route("/static/<path:filename>")
|
@app.route("/static/<path:filename>")
|
||||||
def static_files(filename: str):
|
def static_files(filename: str):
|
||||||
return send_from_directory(app.static_folder, filename)
|
return send_from_directory(app.static_folder, filename)
|
||||||
@ -1260,6 +1198,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
|
|||||||
from_=start,
|
from_=start,
|
||||||
size=size,
|
size=size,
|
||||||
body=payload,
|
body=payload,
|
||||||
|
request_timeout=30,
|
||||||
)
|
)
|
||||||
if config.elastic.debug:
|
if config.elastic.debug:
|
||||||
LOGGER.info(
|
LOGGER.info(
|
||||||
@ -1550,145 +1489,6 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
|
|||||||
def frequency_page():
|
def frequency_page():
|
||||||
return send_from_directory(app.static_folder, "frequency.html")
|
return send_from_directory(app.static_folder, "frequency.html")
|
||||||
|
|
||||||
@app.route("/api/vector-search", methods=["POST"])
def api_vector_search():
    """
    Run a Qdrant similarity search described by the JSON request body.

    Request fields read here: ``query`` (text to embed), ``filters`` (passed
    through ``validate_qdrant_filter``), ``size`` (default 10, clamped to
    1..MAX_QUERY_SIZE) and ``offset`` (default 0, clamped to 0..MAX_OFFSET).
    ``client``, ``index`` and the ``qdrant_*`` settings come from the
    enclosing ``create_app`` scope.

    Returns a JSON object with ``items``, ``totalResults`` (number of items
    in this response) and ``offset``. Error responses:
      * 200 with ``error: "empty_query"`` when no usable query text is given,
      * 400 ``invalid_pagination`` when ``size``/``offset`` are not integers,
      * 500 ``embedding_unavailable`` when embedding the query fails,
      * 502 ``vector_search_unavailable`` when the Qdrant request fails.
    """
    # Function-scope import keeps this handler self-contained.
    from html import escape as escape_html

    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array or scalar body has no ``.get``; treat it as empty.
        body = {}

    raw_query = body.get("query")
    query_text = raw_query.strip() if isinstance(raw_query, str) else ""
    filters = validate_qdrant_filter(body.get("filters"))
    try:
        limit = min(max(int(body.get("size", 10)), 1), MAX_QUERY_SIZE)
        offset = min(max(int(body.get("offset", 0)), 0), MAX_OFFSET)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric pagination is a client error, not a server crash.
        return jsonify({"error": "invalid_pagination"}), 400

    if not query_text:
        return jsonify(
            {"items": [], "totalResults": 0, "offset": offset, "error": "empty_query"}
        )

    try:
        query_vector = embed_query(
            query_text, model_name=qdrant_embed_model, expected_dim=qdrant_vector_size
        )
    except Exception as exc:  # pragma: no cover - runtime dependency
        LOGGER.error("Embedding failed: %s", exc, exc_info=config.elastic.debug)
        return jsonify({"error": "embedding_unavailable"}), 500

    qdrant_vector_payload: Any
    if qdrant_vector_name:
        qdrant_vector_payload = {qdrant_vector_name: query_vector}
    else:
        qdrant_vector_payload = query_vector

    qdrant_body: Dict[str, Any] = {
        "vector": qdrant_vector_payload,
        "limit": limit,
        "offset": offset,
        "with_payload": True,
        "with_vectors": False,
    }
    if filters:
        qdrant_body["filter"] = filters

    try:
        response = requests.post(
            f"{qdrant_url}/collections/{qdrant_collection}/points/search",
            json=qdrant_body,
            timeout=20,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        LOGGER.error("Vector search failed: %s", exc, exc_info=config.elastic.debug)
        return jsonify({"error": "vector_search_unavailable"}), 502

    # ``result`` may be missing or null; both mean "no points".
    points = (data.get("result") or []) if isinstance(data, dict) else []

    def as_html(value: Any) -> Any:
        # The *Html fields are assigned to innerHTML by the client, so stored
        # text must be escaped. NOTE(review): assumes stored titles and
        # descriptions are plain text, not markup - confirm against the indexer.
        return escape_html(value) if isinstance(value, str) else value

    items: List[Dict[str, Any]] = []
    missing_channel_ids: Set[str] = set()
    for point in points:
        if not isinstance(point, dict):
            continue
        # Distinct name: the original reused ``payload`` and shadowed the request body.
        point_payload = point.get("payload", {}) or {}

        raw_highlights = point_payload.get("highlights") or []
        if not isinstance(raw_highlights, (list, tuple)):
            raw_highlights = []
        highlight_entries: List[Dict[str, str]] = []
        for entry in raw_highlights:
            if isinstance(entry, dict):
                html_value = entry.get("html") or entry.get("text")
            elif entry is None:
                continue  # would otherwise render as the string "None"
            else:
                html_value = str(entry)
            if not html_value:
                continue
            highlight_entries.append({"html": html_value, "source": "primary"})

        channel_id = point_payload.get("channel_id")
        channel_label = (
            point_payload.get("channel_name")
            or point_payload.get("channel_title")
            or channel_id
        )
        title = point_payload.get("title")
        description = point_payload.get("description")
        items.append(
            {
                "video_id": point_payload.get("video_id"),
                "channel_id": channel_id,
                "channel_name": channel_label,
                "title": title,
                "titleHtml": as_html(title),
                "description": description,
                "descriptionHtml": as_html(description),
                "date": point_payload.get("date"),
                "url": point_payload.get("url"),
                "chunkText": point_payload.get("text")
                or point_payload.get("chunk_text")
                or point_payload.get("chunk")
                or point_payload.get("content"),
                "chunkTimestamp": point_payload.get("timestamp")
                or point_payload.get("start_seconds")
                or point_payload.get("start"),
                "toHighlight": highlight_entries,
                "highlightSource": {
                    "primary": bool(highlight_entries),
                    "secondary": False,
                },
                "distance": point.get("score"),
                "internal_references_count": point_payload.get("internal_references_count", 0),
                "internal_references": point_payload.get("internal_references", []),
                "referenced_by_count": point_payload.get("referenced_by_count", 0),
                "referenced_by": point_payload.get("referenced_by", []),
                "video_status": point_payload.get("video_status"),
                "duration": point_payload.get("duration"),
            }
        )
        if (not channel_label) and channel_id:
            missing_channel_ids.add(str(channel_id))

    if missing_channel_ids:
        # Best-effort enrichment: a failed lookup leaves channel names empty
        # instead of failing the whole search.
        try:
            es_lookup = client.search(
                index=index,
                body={
                    "size": len(missing_channel_ids) * 2,
                    "_source": ["channel_id", "channel_name"],
                    "query": {"terms": {"channel_id.keyword": list(missing_channel_ids)}},
                },
                request_timeout=30,
            )
            hits = es_lookup.get("hits", {}).get("hits", [])
            channel_lookup: Dict[Any, Any] = {}
            for hit in hits:
                src = hit.get("_source", {}) or {}
                cid = src.get("channel_id")
                cname = src.get("channel_name")
                if cid and cname and cid not in channel_lookup:
                    channel_lookup[cid] = cname
            for item in items:
                if not item.get("channel_name"):
                    cid = item.get("channel_id")
                    if cid and cid in channel_lookup:
                        item["channel_name"] = channel_lookup[cid]
        except Exception as exc:
            LOGGER.debug("Vector channel lookup failed: %s", exc)

    return jsonify(
        {
            "items": items,
            "totalResults": len(items),
            "offset": offset,
        }
    )
|
|
||||||
|
|
||||||
@app.route("/api/transcript")
|
@app.route("/api/transcript")
|
||||||
def transcript():
|
def transcript():
|
||||||
video_id = request.args.get("video_id", type=str)
|
video_id = request.args.get("video_id", type=str)
|
||||||
|
|||||||
@ -1340,10 +1340,12 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
|
|||||||
}
|
}
|
||||||
const el = document.createElement("div");
|
const el = document.createElement("div");
|
||||||
el.className = "item";
|
el.className = "item";
|
||||||
|
const rawTitle = item.title || "Untitled";
|
||||||
|
const rawDescription = item.description || "";
|
||||||
const titleHtml =
|
const titleHtml =
|
||||||
item.titleHtml || escapeHtml(item.title || "Untitled");
|
item.titleHtml || escapeHtml(rawTitle);
|
||||||
const descriptionHtml =
|
const descriptionHtml =
|
||||||
item.descriptionHtml || escapeHtml(item.description || "");
|
item.descriptionHtml || escapeHtml(rawDescription);
|
||||||
|
|
||||||
const header = document.createElement("div");
|
const header = document.createElement("div");
|
||||||
header.className = "result-header";
|
header.className = "result-header";
|
||||||
@ -1395,7 +1397,11 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const titleEl = document.createElement("strong");
|
const titleEl = document.createElement("strong");
|
||||||
|
if (item.titleHtml) {
|
||||||
titleEl.innerHTML = titleHtml;
|
titleEl.innerHTML = titleHtml;
|
||||||
|
} else {
|
||||||
|
titleEl.textContent = rawTitle;
|
||||||
|
}
|
||||||
headerMain.appendChild(titleEl);
|
headerMain.appendChild(titleEl);
|
||||||
|
|
||||||
const metaLine = document.createElement("div");
|
const metaLine = document.createElement("div");
|
||||||
@ -1519,7 +1525,11 @@ async function updateFrequencyChart(term, channels, year, queryMode, toggles = {
|
|||||||
if (descriptionHtml) {
|
if (descriptionHtml) {
|
||||||
const desc = document.createElement("div");
|
const desc = document.createElement("div");
|
||||||
desc.className = "muted description-block";
|
desc.className = "muted description-block";
|
||||||
|
if (item.descriptionHtml) {
|
||||||
desc.innerHTML = descriptionHtml;
|
desc.innerHTML = descriptionHtml;
|
||||||
|
} else {
|
||||||
|
desc.textContent = rawDescription;
|
||||||
|
}
|
||||||
el.appendChild(desc);
|
el.appendChild(desc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -5,9 +5,9 @@
|
|||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
<title>TLC Search</title>
|
<title>TLC Search</title>
|
||||||
<link rel="icon" href="/static/favicon.png" type="image/png" />
|
<link rel="icon" href="/static/favicon.png" type="image/png" />
|
||||||
<link rel="stylesheet" href="https://unpkg.com/xp.css" />
|
<link rel="stylesheet" href="https://unpkg.com/xp.css" integrity="sha384-isKk8ZXKlU28/m3uIrnyTfuPaamQIF4ONLeGSfsWGEe3qBvaeLU5wkS4J7cTIwxI" crossorigin="anonymous" />
|
||||||
<link rel="stylesheet" href="/static/style.css" />
|
<link rel="stylesheet" href="/static/style.css" />
|
||||||
<script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
|
<script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js" integrity="sha384-CjloA8y00+1SDAUkjs099PVfnY2KmDC2BZnws9kh8D/lX1s46w6EPhpXdqMfjK6i" crossorigin="anonymous"></script>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div class="window" style="max-width: 1200px; margin: 20px auto;">
|
<div class="window" style="max-width: 1200px; margin: 20px auto;">
|
||||||
@ -22,10 +22,6 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="window-body">
|
<div class="window-body">
|
||||||
<p>Enter a phrase to query title, description, and transcript text.</p>
|
<p>Enter a phrase to query title, description, and transcript text.</p>
|
||||||
<p style="font-size: 11px;">
|
|
||||||
Looking for semantic matches? Try the
|
|
||||||
<a href="/vector-search">vector search beta</a>.
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<fieldset>
|
<fieldset>
|
||||||
<legend>Search</legend>
|
<legend>Search</legend>
|
||||||
|
|||||||
@ -1,46 +0,0 @@
|
|||||||
<!doctype html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8" />
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
||||||
<title>TLC Vector Search</title>
|
|
||||||
<link rel="icon" href="/static/favicon.png" type="image/png" />
|
|
||||||
<link rel="stylesheet" href="https://unpkg.com/xp.css" />
|
|
||||||
<link rel="stylesheet" href="/static/style.css" />
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<div class="window" style="max-width: 1200px; margin: 20px auto;">
|
|
||||||
<div class="title-bar">
|
|
||||||
<div class="title-bar-text">Vector Search (Experimental)</div>
|
|
||||||
<div class="title-bar-controls">
|
|
||||||
<a class="title-bar-link" href="/">⬅ Back to Search</a>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div class="window-body">
|
|
||||||
<p>Enter a natural language prompt; results come from the Qdrant vector index.</p>
|
|
||||||
|
|
||||||
<fieldset>
|
|
||||||
<legend>Vector Query</legend>
|
|
||||||
<div class="field-row" style="margin-bottom: 8px;">
|
|
||||||
<label for="vectorQuery" style="width: 60px;">Query:</label>
|
|
||||||
<input id="vectorQuery" type="text" placeholder="Describe what you are looking for" style="flex: 1;" />
|
|
||||||
<button id="vectorSearchBtn">Search</button>
|
|
||||||
</div>
|
|
||||||
</fieldset>
|
|
||||||
|
|
||||||
<div id="vectorMeta" style="margin-top: 12px; font-size: 11px;"></div>
|
|
||||||
|
|
||||||
<fieldset style="margin-top: 16px;">
|
|
||||||
<legend>Results</legend>
|
|
||||||
<div id="vectorResults"></div>
|
|
||||||
</fieldset>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="status-bar">
|
|
||||||
<p class="status-bar-field">Experimental mode • Qdrant</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<script src="/static/vector.js"></script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
423
static/vector.js
423
static/vector.js
@ -1,423 +0,0 @@
|
|||||||
(() => {
|
|
||||||
const queryInput = document.getElementById("vectorQuery");
|
|
||||||
const searchBtn = document.getElementById("vectorSearchBtn");
|
|
||||||
const resultsDiv = document.getElementById("vectorResults");
|
|
||||||
const metaDiv = document.getElementById("vectorMeta");
|
|
||||||
const transcriptCache = new Map();
|
|
||||||
|
|
||||||
if (!queryInput || !searchBtn || !resultsDiv || !metaDiv) {
|
|
||||||
console.error("Vector search elements missing");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Utility helpers **/
|
|
||||||
/**
 * Escape the five HTML-significant characters (& < > " ') so the result can
 * be assigned to innerHTML and still render as plain text.
 * Falsy input yields ""; other non-string input is stringified first.
 */
const escapeHtml = (str) => {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // The character class matches exactly the keys of `entities`.
  return String(str || "").replace(/[&<>"']/g, (ch) => entities[ch]);
};
|
|
||||||
|
|
||||||
/**
 * Return the part of a timestamp string that precedes the first "T".
 * Falsy input yields ""; truthy values without a split() method are
 * returned unchanged.
 */
const fmtDate = (value) => {
  const raw = value || "";
  try {
    const pieces = raw.split("T");
    return pieces[0];
  } catch {
    return value;
  }
};
|
|
||||||
|
|
||||||
/** Format a numeric score with three decimals; anything else (including NaN) gives "". */
const fmtSimilarity = (score) => {
  const usable = typeof score === "number" && !Number.isNaN(score);
  return usable ? score.toFixed(3) : "";
};
|
|
||||||
|
|
||||||
/** Lower-cased `video_status` of an item, or "" when the item or the field is missing. */
const getVideoStatus = (item) => {
  if (!item || !item.video_status) return "";
  return String(item.video_status).toLowerCase();
};
/** True when the item's status is "deleted" (case-insensitive). */
const isLikelyDeleted = (item) => getVideoStatus(item) === "deleted";
|
|
||||||
|
|
||||||
/**
 * Format a number of seconds as "m:ss", or "h:mm:ss" once it reaches an hour.
 *
 * Missing input (null, undefined, "", NaN) keeps the historical "00:00"
 * placeholder. Values that cannot be read as a finite number also give
 * "00:00", and negative values are clamped to zero instead of producing
 * output such as "-1:-5".
 */
const formatTimestamp = (seconds) => {
  if (!seconds && seconds !== 0) return "00:00";
  const numeric = Number(seconds);
  if (!Number.isFinite(numeric)) return "00:00";
  // Flooring first gives the same h/m/s split as flooring each part.
  const total = Math.max(0, Math.floor(numeric));
  const hours = Math.floor(total / 3600);
  const mins = Math.floor((total % 3600) / 60);
  const secs = total % 60;
  const twoDigits = (n) => n.toString().padStart(2, "0");
  if (hours > 0) {
    return `${hours}:${twoDigits(mins)}:${twoDigits(secs)}`;
  }
  return `${mins}:${twoDigits(secs)}`;
};
|
|
||||||
|
|
||||||
/**
 * Produce a display timestamp for a transcript segment.
 * A truthy `timestamp` field is returned as-is; otherwise the first of
 * start_seconds / start / offset / time that parses as a number is
 * formatted with formatTimestamp. Returns "" when nothing usable exists.
 */
const formatSegmentTimestamp = (segment) => {
  if (!segment) return "";
  if (segment.timestamp) return segment.timestamp;
  const candidates = [
    segment.start_seconds,
    segment.start,
    segment.offset,
    segment.time,
  ];
  const usable = candidates.find(
    (candidate) => candidate != null && !Number.isNaN(parseFloat(candidate))
  );
  // find() only returns undefined here when no candidate qualified.
  return usable === undefined ? "" : formatTimestamp(parseFloat(usable));
};
|
|
||||||
|
|
||||||
/**
 * Render one transcript section as "<label>\n<content>\n".
 * A non-blank `fullText` string wins; otherwise `parts` is joined line by
 * line, each line prefixed with "[timestamp] " when one can be derived.
 * Returns "" when there is no content.
 */
const serializeTranscriptSection = (label, parts, fullText) => {
  const renderSegment = (segment) => {
    const stamp = formatSegmentTimestamp(segment);
    const body = segment && segment.text ? segment.text : "";
    return stamp ? `[${stamp}] ${body}` : body;
  };

  let content = "";
  if (typeof fullText === "string" && fullText.trim()) {
    content = fullText.trim();
  } else if (Array.isArray(parts) && parts.length) {
    content = parts.map((segment) => renderSegment(segment)).join("\n").trim();
  }
  return content ? `${label}\n${content}\n` : "";
};
|
|
||||||
|
|
||||||
/**
 * Fetch the transcript JSON for a video from /api/transcript, memoised in
 * `transcriptCache` by video id.
 * Resolves to null for a falsy id; rejects when the response is not ok.
 */
const fetchTranscriptData = async (videoId) => {
  if (!videoId) return null;
  if (transcriptCache.has(videoId)) return transcriptCache.get(videoId);

  const endpoint = `/api/transcript?video_id=${encodeURIComponent(videoId)}`;
  const response = await fetch(endpoint);
  if (!response.ok) {
    throw new Error(`Transcript fetch failed (${response.status})`);
  }
  const transcript = await response.json();
  transcriptCache.set(videoId, transcript);
  return transcript;
};
|
|
||||||
|
|
||||||
/**
 * Build the plain-text file offered for download: a header (title, then
 * channel / published date / URL when present), a blank line, and the
 * primary and secondary transcript sections. Falls back to
 * "No transcript available." when both sections are empty.
 */
const buildTranscriptDownloadText = (item, transcriptData) => {
  const header = [`Title: ${item.title || "Untitled"}`];
  if (item.channel_name) header.push(`Channel: ${item.channel_name}`);
  if (item.date) header.push(`Published: ${item.date}`);
  if (item.url) header.push(`URL: ${item.url}`);
  header.push("");

  const sections = [
    serializeTranscriptSection(
      "Primary Transcript",
      transcriptData.transcript_parts,
      transcriptData.transcript_full
    ),
    serializeTranscriptSection(
      "Secondary Transcript",
      transcriptData.transcript_secondary_parts,
      transcriptData.transcript_secondary_full
    ),
  ].filter(Boolean);

  const body = sections.length ? sections : ["No transcript available."];
  return [...header, ...body].join("\n").trim() + "\n";
};
|
|
||||||
|
|
||||||
/**
 * Temporarily replace a button's label with `message`, restoring it after
 * `duration` ms. The original label is stored in data-original-label so
 * that overlapping flashes restore the real label, not an earlier message.
 */
const flashButtonMessage = (button, message, duration = 1800) => {
  if (!button) return;
  const restoreLabel = button.dataset.originalLabel || button.textContent;
  button.dataset.originalLabel = restoreLabel;
  button.textContent = message;
  const restore = () => {
    button.textContent = button.dataset.originalLabel || restoreLabel;
  };
  setTimeout(restore, duration);
};
|
|
||||||
|
|
||||||
/**
 * Click handler for "Download transcript": fetches the transcript, offers
 * it as "<video_id>.txt" through a temporary object URL, and flashes
 * "Downloaded" on the button. The button is disabled while this runs;
 * failures are logged and reported with an alert.
 */
const handleTranscriptDownload = async (item, button) => {
  if (!item.video_id) return;
  button.disabled = true;
  try {
    const transcriptData = await fetchTranscriptData(item.video_id);
    if (!transcriptData) throw new Error("Transcript unavailable");

    const contents = buildTranscriptDownloadText(item, transcriptData);
    const objectUrl = URL.createObjectURL(
      new Blob([contents], { type: "text/plain" })
    );
    const anchor = document.createElement("a");
    anchor.href = objectUrl;
    anchor.download = `${item.video_id}.txt`;

    // The anchor must be attached to the document for click() to trigger the download.
    document.body.appendChild(anchor);
    anchor.click();
    document.body.removeChild(anchor);
    URL.revokeObjectURL(objectUrl);

    flashButtonMessage(button, "Downloaded");
  } catch (err) {
    console.error("Download failed", err);
    alert("Unable to download transcript right now.");
  } finally {
    button.disabled = false;
  }
};
|
|
||||||
|
|
||||||
/**
 * Format a date as "D Mon. YYYY" using the month abbreviations below.
 * Falsy input yields "n.d."; input that Date cannot parse is returned
 * unchanged.
 *
 * Date-only ISO strings ("YYYY-MM-DD") are parsed by Date as UTC midnight,
 * so they are read back with the UTC getters. The local getters would give
 * the previous day in time zones west of UTC. All other inputs keep using
 * local time.
 */
const formatMlaDate = (value) => {
  if (!value) return "n.d.";
  const parsed = new Date(value);
  if (Number.isNaN(parsed.valueOf())) return value;
  const months = [
    "Jan.", "Feb.", "Mar.", "Apr.", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
  ];
  const isDateOnly = typeof value === "string" && /^\d{4}-\d{2}-\d{2}$/.test(value);
  const day = isDateOnly ? parsed.getUTCDate() : parsed.getDate();
  const month = isDateOnly ? parsed.getUTCMonth() : parsed.getMonth();
  const year = isDateOnly ? parsed.getUTCFullYear() : parsed.getFullYear();
  return `${day} ${months[month]} ${year}`;
};
|
|
||||||
|
|
||||||
/**
 * Build an MLA-style citation string for a result item. Missing fields
 * fall back to "Unknown" (channel), "Untitled" (title) and "" (url); the
 * access date is today's date taken from the ISO (UTC) timestamp.
 */
const buildMlaCitation = (item) => {
  const channelSource = item.channel_name || item.channel_id || "Unknown";
  const titleSource = item.title || "Untitled";
  const channel = channelSource.trim();
  const title = titleSource.trim();
  const url = item.url || "";
  const publishDate = formatMlaDate(item.date);
  const todayIso = new Date().toISOString().split("T")[0];
  const today = formatMlaDate(todayIso);
  return `${channel}. "${title}." YouTube, uploaded by ${channel}, ${publishDate}, ${url}. Accessed ${today}.`;
};
|
|
||||||
|
|
||||||
/**
 * Click handler for "Copy citation": copies the MLA citation with the async
 * Clipboard API when available in a secure context, otherwise through a
 * hidden textarea and document.execCommand("copy").
 *
 * "Copied!" is only flashed when the copy succeeded. On any failure the
 * citation is shown in an alert so the user can copy it by hand.
 */
const handleCopyCitation = async (item, button) => {
  const citation = buildMlaCitation(item);
  try {
    if (navigator.clipboard && window.isSecureContext) {
      await navigator.clipboard.writeText(citation);
    } else {
      const textarea = document.createElement("textarea");
      textarea.value = citation;
      textarea.style.position = "fixed";
      textarea.style.opacity = "0";
      document.body.appendChild(textarea);
      let copied = false;
      try {
        textarea.select();
        // execCommand reports failure through its return value, not an exception.
        copied = document.execCommand("copy");
      } finally {
        // Always detach the helper element, even if select/copy threw.
        document.body.removeChild(textarea);
      }
      if (!copied) throw new Error("execCommand copy was rejected");
    }
    flashButtonMessage(button, "Copied!");
  } catch (err) {
    console.error("Citation copy failed", err);
    alert(citation);
  }
};
|
|
||||||
|
|
||||||
/** Rendering helpers **/
|
|
||||||
/**
 * Build the highlight list element for a result, one row per entry.
 * Returns null when `entries` is not a non-empty array.
 *
 * Only an entry's explicit `html` field is inserted as markup. A plain
 * `text` field or a bare string entry is inserted as text, so stray "<" or
 * "&" characters cannot be interpreted as HTML.
 */
const createHighlightRows = (entries) => {
  if (!Array.isArray(entries) || !entries.length) return null;
  const container = document.createElement("div");
  container.className = "transcript highlight-list";
  entries.forEach((entry) => {
    if (!entry) return;
    const row = document.createElement("div");
    row.className = "highlight-row";

    const textBlock = document.createElement("div");
    textBlock.className = "highlight-text";
    if (typeof entry === "object" && entry.html) {
      textBlock.innerHTML = entry.html;
    } else {
      const plain = typeof entry === "object" ? entry.text : entry;
      textBlock.textContent = plain == null ? "" : String(plain);
    }
    row.appendChild(textBlock);

    const indicator = document.createElement("span");
    indicator.className = "highlight-source-indicator highlight-source-indicator--primary";
    indicator.title = "Vector highlight";
    row.appendChild(indicator);

    container.appendChild(row);
  });
  return container;
};
|
|
||||||
|
|
||||||
/**
 * Build the action bar for one result: "Download transcript",
 * "Copy citation" and "Graph". The graph button opens
 * /graph?video_id=... in a new tab and is disabled when the item has no
 * video_id.
 */
const createActions = (item) => {
  const makeButton = (label, className) => {
    const btn = document.createElement("button");
    btn.type = "button";
    btn.className = className;
    btn.textContent = label;
    return btn;
  };

  const actions = document.createElement("div");
  actions.className = "result-actions";

  const downloadBtn = makeButton("Download transcript", "result-action-btn");
  downloadBtn.addEventListener("click", () => handleTranscriptDownload(item, downloadBtn));

  const citationBtn = makeButton("Copy citation", "result-action-btn");
  citationBtn.addEventListener("click", () => handleCopyCitation(item, citationBtn));

  const graphBtn = makeButton("Graph", "result-action-btn graph-launch-btn");
  graphBtn.disabled = !item.video_id;
  graphBtn.addEventListener("click", () => {
    if (!item.video_id) return;
    const target = `/graph?video_id=${encodeURIComponent(item.video_id)}`;
    window.open(target, "_blank", "noopener");
  });

  [downloadBtn, citationBtn, graphBtn].forEach((btn) => actions.appendChild(btn));
  return actions;
};
|
|
||||||
|
|
||||||
/**
 * Render a vector-search response into the results area.
 *
 * Clears `resultsDiv`, writes a summary line into `metaDiv`, then appends one
 * `.item` card per entry of `payload.items`. Each card contains, in order:
 * the title, a channel/date/duration line (plus a "Likely deleted" badge when
 * `isLikelyDeleted(item)` is true), an optional "Open on YouTube" link, an
 * optional similarity score, the action buttons from `createActions`, an
 * optional description, the matched chunk text (with a timestamp deep link
 * when one can be built), and any rows from `createHighlightRows`.
 *
 * @param {Object} payload - Parsed response body; `payload.items` is the
 *   result list. A missing or empty list shows the "no matches" message.
 * @returns {void}
 */
const renderVectorResults = (payload) => {
  // Accept the duration as a number or a numeric string; otherwise null.
  const readDurationSeconds = (raw) => {
    if (typeof raw === "number") {
      return raw;
    }
    if (typeof raw === "string" && raw.trim()) {
      const parsed = parseFloat(raw);
      if (!Number.isNaN(parsed)) {
        return parsed;
      }
    }
    return null;
  };

  // Reduce a chunk timestamp to something that can go into the `t=` query
  // parameter. Object-shaped timestamps are unwrapped through `.timestamp`,
  // mirroring the `{ timestamp: value }` wrapper used for the label below
  // (NOTE(review): confirm that is the key the backend uses). Returns null
  // when no serialisable value exists, so the caller can link without `t=`
  // instead of emitting "t=[object Object]".
  const readTimestampParam = (raw) => {
    const value = raw !== null && typeof raw === "object" ? raw.timestamp : raw;
    if (typeof value === "number") {
      return Number.isFinite(value) ? Math.floor(value) : null;
    }
    if (value == null || typeof value === "object") {
      return null;
    }
    return value;
  };

  resultsDiv.innerHTML = "";
  const items = payload.items || [];
  if (!items.length) {
    metaDiv.textContent = "No vector matches for this prompt.";
    return;
  }
  metaDiv.textContent = `Matches: ${items.length} (vector mode)`;

  items.forEach((item) => {
    const el = document.createElement("div");
    el.className = "item";

    const header = document.createElement("div");
    header.className = "result-header";
    const headerMain = document.createElement("div");
    headerMain.className = "result-header-main";

    // `titleHtml` is inserted as markup; only the plain-title fallback is
    // escaped here. NOTE(review): presumably `titleHtml` is sanitised
    // highlight markup from the server — confirm.
    const titleEl = document.createElement("strong");
    titleEl.innerHTML = item.titleHtml || escapeHtml(item.title || "Untitled");
    headerMain.appendChild(titleEl);

    const metaLine = document.createElement("div");
    metaLine.className = "muted result-meta";
    const channelLabel = item.channel_name || item.channel_id || "Unknown";
    const dateLabel = fmtDate(item.date);
    const durationSeconds = readDurationSeconds(item.duration);
    const durationLabel = durationSeconds != null ? ` • ${formatTimestamp(durationSeconds)}` : "";
    // channelLabel always has a value thanks to the "Unknown" fallback, so
    // no channel-less variant of this line is needed.
    metaLine.textContent = `${channelLabel} • ${dateLabel}${durationLabel}`;
    if (isLikelyDeleted(item)) {
      metaLine.appendChild(document.createTextNode(" "));
      const statusEl = document.createElement("span");
      statusEl.className = "result-status result-status--deleted";
      statusEl.textContent = "Likely deleted";
      metaLine.appendChild(statusEl);
    }
    headerMain.appendChild(metaLine);

    if (item.url) {
      const linkLine = document.createElement("div");
      linkLine.className = "muted";
      const anchor = document.createElement("a");
      anchor.href = item.url;
      anchor.target = "_blank";
      anchor.rel = "noopener";
      anchor.textContent = "Open on YouTube";
      linkLine.appendChild(anchor);
      headerMain.appendChild(linkLine);
    }

    if (typeof item.distance === "number") {
      const scoreLine = document.createElement("div");
      scoreLine.className = "muted";
      scoreLine.textContent = `Similarity score: ${fmtSimilarity(item.distance)}`;
      headerMain.appendChild(scoreLine);
    }

    header.appendChild(headerMain);
    header.appendChild(createActions(item));
    el.appendChild(header);

    if (item.descriptionHtml || item.description) {
      const desc = document.createElement("div");
      desc.className = "muted description-block";
      desc.innerHTML = item.descriptionHtml || escapeHtml(item.description);
      el.appendChild(desc);
    }

    if (item.chunkText) {
      const chunkBlock = document.createElement("div");
      chunkBlock.className = "vector-chunk";

      const rawTimestamp = item.chunkTimestamp;
      // A chunk that starts at second 0 is still a valid timestamp, so test
      // for it explicitly instead of relying on truthiness alone.
      const hasTimestamp = Boolean(rawTimestamp) || rawTimestamp === 0;
      if (hasTimestamp && item.url) {
        const tsObj =
          typeof rawTimestamp === "object" ? rawTimestamp : { timestamp: rawTimestamp };
        const ts = formatSegmentTimestamp(tsObj);
        const tsLink = document.createElement("a");
        const paramValue = readTimestampParam(rawTimestamp);
        const separator = item.url.includes("?") ? "&" : "?";
        // Fall back to the plain video URL when no usable `t=` value exists.
        tsLink.href =
          paramValue == null
            ? item.url
            : `${item.url}${separator}t=${encodeURIComponent(paramValue)}`;
        tsLink.target = "_blank";
        tsLink.rel = "noopener";
        tsLink.textContent = ts ? `[${ts}]` : "[timestamp]";
        chunkBlock.appendChild(tsLink);
        chunkBlock.appendChild(document.createTextNode(" "));
      }

      // Chunk text is untrusted transcript content: textContent, not innerHTML.
      const chunkTextSpan = document.createElement("span");
      chunkTextSpan.textContent = item.chunkText;
      chunkBlock.appendChild(chunkTextSpan);
      el.appendChild(chunkBlock);
    }

    const highlights = createHighlightRows(item.toHighlight);
    if (highlights) {
      el.appendChild(highlights);
    }

    resultsDiv.appendChild(el);
  });
};
|
|
||||||
|
|
||||||
/** Search handler **/
|
|
||||||
// True while a vector-search request is in flight. Disabling the button only
// blocks clicks; the Enter-key handler on the input would otherwise start
// overlapping requests whose responses could render out of order.
let vectorSearchInFlight = false;

/**
 * Run a vector search for the text currently in `queryInput`.
 *
 * Alerts and returns when the trimmed query is empty. Otherwise POSTs
 * `{ query }` as JSON to `/api/vector-search`, shows a progress message in
 * `metaDiv`, and hands the parsed response to `renderVectorResults`. Any
 * failure (network error, non-2xx status, an `error` field in the response)
 * ends with "Vector search unavailable." in `metaDiv`. The search button is
 * disabled for the duration of the request and always re-enabled afterwards.
 *
 * @returns {Promise<void>}
 */
const runVectorSearch = async () => {
  if (vectorSearchInFlight) {
    return;
  }
  const query = queryInput.value.trim();
  if (!query) {
    alert("Please enter a query.");
    return;
  }

  vectorSearchInFlight = true;
  metaDiv.textContent = "Searching vector index…";
  resultsDiv.innerHTML = "";
  searchBtn.disabled = true;
  try {
    const res = await fetch("/api/vector-search", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ query }),
    });
    if (!res.ok) {
      throw new Error(`Vector search failed (${res.status})`);
    }
    const data = await res.json();
    if (data.error) {
      metaDiv.textContent = "Vector search unavailable.";
      return;
    }
    renderVectorResults(data);
  } catch (err) {
    // Log the detail for debugging; the user only sees a generic message.
    console.error(err);
    metaDiv.textContent = "Vector search unavailable.";
  } finally {
    searchBtn.disabled = false;
    vectorSearchInFlight = false;
  }
};
|
|
||||||
|
|
||||||
// Wire up the two ways of starting a search: clicking the button, or pressing
// Enter inside the query input.
searchBtn.addEventListener("click", runVectorSearch);
// `keydown` replaces the deprecated `keypress` event. The `isComposing` check
// skips the Enter that confirms an IME composition, so that keystroke does
// not trigger a search.
queryInput.addEventListener("keydown", (event) => {
  if (event.key === "Enter" && !event.isComposing) {
    runVectorSearch();
  }
});
|
|
||||||
})();
|
|
||||||
Loading…
x
Reference in New Issue
Block a user