Disable vector search
search_app.py
@@ -4,10 +4,8 @@ Flask application exposing search, graph, and transcript endpoints for TLC.
Routes:
GET / -> static HTML search page.
GET /graph -> static reference graph UI.
GET /vector-search -> experimental Qdrant vector search UI.
GET /api/channels -> channels aggregation.
GET /api/search -> Elasticsearch keyword search.
POST /api/vector-search -> Qdrant vector similarity query.
GET /api/graph -> reference graph API.
GET /api/transcript -> transcript JSON payload.
"""
@@ -27,13 +25,6 @@ from datetime import datetime
from flask import Flask, jsonify, request, send_from_directory

import requests

try:
    from sentence_transformers import SentenceTransformer  # type: ignore
except ImportError:  # pragma: no cover - optional dependency
    SentenceTransformer = None

from .config import CONFIG, AppConfig

try:
@@ -44,14 +35,11 @@ except ImportError: # pragma: no cover - dependency optional
    BadRequestError = Exception  # type: ignore

LOGGER = logging.getLogger(__name__)
_EMBED_MODEL = None
_EMBED_MODEL_NAME: Optional[str] = None

# Security constants
MAX_QUERY_SIZE = 100
MAX_OFFSET = 10000
ALLOWED_QDRANT_FILTER_FIELDS = {"channel_id", "date", "video_status", "external_reference"}

DEFAULT_ELASTIC_TIMEOUT = int(os.environ.get("ELASTIC_TIMEOUT_SECONDS", "30"))

def sanitize_query_string(query: str) -> str:
    """
@@ -74,47 +62,6 @@ def sanitize_query_string(query: str) -> str:
    return sanitized.strip() or "*"


def validate_qdrant_filter(filters: Any) -> Dict[str, Any]:
    """
    Validate and sanitize Qdrant filter objects.
    Only allows whitelisted fields to prevent filter injection.
    """
    if not isinstance(filters, dict):
        return {}
    validated: Dict[str, Any] = {}
    for key, value in filters.items():
        if key in ALLOWED_QDRANT_FILTER_FIELDS:
            validated[key] = value
    return validated


def _ensure_embedder(model_name: str) -> "SentenceTransformer":
    global _EMBED_MODEL, _EMBED_MODEL_NAME
    if SentenceTransformer is None:  # pragma: no cover - optional dependency
        raise RuntimeError(
            "sentence-transformers is required for vector search. Install via pip install sentence-transformers."
        )
    if _EMBED_MODEL is None or _EMBED_MODEL_NAME != model_name:
        LOGGER.info("Loading embedding model: %s", model_name)
        _EMBED_MODEL = SentenceTransformer(model_name)
        _EMBED_MODEL_NAME = model_name
    return _EMBED_MODEL


def embed_query(text: str, *, model_name: str, expected_dim: int) -> List[float]:
    embedder = _ensure_embedder(model_name)
    vector = embedder.encode(
        [f"query: {text}"],
        show_progress_bar=False,
        normalize_embeddings=True,
    )[0].tolist()
    if len(vector) != expected_dim:
        raise RuntimeError(
            f"Embedding dimension mismatch (expected {expected_dim}, got {len(vector)})"
        )
    return vector


def _ensure_client(config: AppConfig) -> "Elasticsearch":
    if Elasticsearch is None:
        raise RuntimeError(
@@ -286,7 +233,7 @@ def elastic_metrics_payload(
"Elasticsearch metrics request: %s",
|
||||
json.dumps({"index": index, "body": body}, indent=2),
|
||||
)
|
||||
response = client.search(index=index, body=body)
|
||||
response = client.search(index=index, body=body, request_timeout=30)
|
||||
break
|
||||
except BadRequestError as exc:
|
||||
last_error = exc
|
||||
@@ -857,7 +804,7 @@ def build_full_graph_payload(
    scroll_id: Optional[str] = None
    try:
        body = {"query": query, "_source": source_fields, "sort": ["_doc"]}
        response = client.search(index=index, body=body, size=batch_size, scroll="1m")
        response = client.search(index=index, body=body, size=batch_size, scroll="1m", request_timeout=60)
        scroll_id = response.get("_scroll_id")
        stop_fetch = False
        while not stop_fetch:
@@ -957,11 +904,6 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
    client = _ensure_client(config)
    index = config.elastic.index
    qdrant_url = config.qdrant_url
    qdrant_collection = config.qdrant_collection
    qdrant_vector_name = config.qdrant_vector_name
    qdrant_vector_size = config.qdrant_vector_size
    qdrant_embed_model = config.qdrant_embed_model

    @app.route("/")
    def index_page():
@@ -971,10 +913,6 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
    def graph_page():
        return send_from_directory(app.static_folder, "graph.html")

    @app.route("/vector-search")
    def vector_search_page():
        return send_from_directory(app.static_folder, "vector.html")

    @app.route("/static/<path:filename>")
    def static_files(filename: str):
        return send_from_directory(app.static_folder, filename)
@@ -1260,6 +1198,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
            from_=start,
            size=size,
            body=payload,
            request_timeout=30,
        )
        if config.elastic.debug:
            LOGGER.info(
@@ -1550,145 +1489,6 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
    def frequency_page():
        return send_from_directory(app.static_folder, "frequency.html")

    @app.route("/api/vector-search", methods=["POST"])
    def api_vector_search():
        payload = request.get_json(silent=True) or {}
        query_text = (payload.get("query") or "").strip()
        filters = validate_qdrant_filter(payload.get("filters"))
        limit = min(max(int(payload.get("size", 10)), 1), MAX_QUERY_SIZE)
        offset = min(max(int(payload.get("offset", 0)), 0), MAX_OFFSET)

        if not query_text:
            return jsonify(
                {"items": [], "totalResults": 0, "offset": offset, "error": "empty_query"}
            )

        try:
            query_vector = embed_query(
                query_text, model_name=qdrant_embed_model, expected_dim=qdrant_vector_size
            )
        except Exception as exc:  # pragma: no cover - runtime dependency
            LOGGER.error("Embedding failed: %s", exc, exc_info=config.elastic.debug)
            return jsonify({"error": "embedding_unavailable"}), 500

        qdrant_vector_payload: Any
        if qdrant_vector_name:
            qdrant_vector_payload = {qdrant_vector_name: query_vector}
        else:
            qdrant_vector_payload = query_vector

        qdrant_body: Dict[str, Any] = {
            "vector": qdrant_vector_payload,
            "limit": limit,
            "offset": offset,
            "with_payload": True,
            "with_vectors": False,
        }
        if filters:
            qdrant_body["filter"] = filters

        try:
            response = requests.post(
                f"{qdrant_url}/collections/{qdrant_collection}/points/search",
                json=qdrant_body,
                timeout=20,
            )
            response.raise_for_status()
            data = response.json()
        except Exception as exc:
            LOGGER.error("Vector search failed: %s", exc, exc_info=config.elastic.debug)
            return jsonify({"error": "vector_search_unavailable"}), 502

        points = data.get("result", []) if isinstance(data, dict) else []
        items: List[Dict[str, Any]] = []
        missing_channel_ids: Set[str] = set()
        for point in points:
            payload = point.get("payload", {}) or {}
            raw_highlights = payload.get("highlights") or []
            highlight_entries: List[Dict[str, str]] = []
            for entry in raw_highlights:
                if isinstance(entry, dict):
                    html_value = entry.get("html") or entry.get("text")
                else:
                    html_value = str(entry)
                if not html_value:
                    continue
                highlight_entries.append({"html": html_value, "source": "primary"})

            channel_label = (
                payload.get("channel_name")
                or payload.get("channel_title")
                or payload.get("channel_id")
            )
            items.append(
                {
                    "video_id": payload.get("video_id"),
                    "channel_id": payload.get("channel_id"),
                    "channel_name": channel_label,
                    "title": payload.get("title"),
                    "titleHtml": payload.get("title"),
                    "description": payload.get("description"),
                    "descriptionHtml": payload.get("description"),
                    "date": payload.get("date"),
                    "url": payload.get("url"),
                    "chunkText": payload.get("text")
                    or payload.get("chunk_text")
                    or payload.get("chunk")
                    or payload.get("content"),
                    "chunkTimestamp": payload.get("timestamp")
                    or payload.get("start_seconds")
                    or payload.get("start"),
                    "toHighlight": highlight_entries,
                    "highlightSource": {
                        "primary": bool(highlight_entries),
                        "secondary": False,
                    },
                    "distance": point.get("score"),
                    "internal_references_count": payload.get("internal_references_count", 0),
                    "internal_references": payload.get("internal_references", []),
                    "referenced_by_count": payload.get("referenced_by_count", 0),
                    "referenced_by": payload.get("referenced_by", []),
                    "video_status": payload.get("video_status"),
                    "duration": payload.get("duration"),
                }
            )
            if (not channel_label) and payload.get("channel_id"):
                missing_channel_ids.add(str(payload.get("channel_id")))

        if missing_channel_ids:
            try:
                es_lookup = client.search(
                    index=index,
                    body={
                        "size": len(missing_channel_ids) * 2,
                        "_source": ["channel_id", "channel_name"],
                        "query": {"terms": {"channel_id.keyword": list(missing_channel_ids)}},
                    },
                )
                hits = es_lookup.get("hits", {}).get("hits", [])
                channel_lookup = {}
                for hit in hits:
                    src = hit.get("_source", {}) or {}
                    cid = src.get("channel_id")
                    cname = src.get("channel_name")
                    if cid and cname and cid not in channel_lookup:
                        channel_lookup[cid] = cname
                for item in items:
                    if not item.get("channel_name"):
                        cid = item.get("channel_id")
                        if cid and cid in channel_lookup:
                            item["channel_name"] = channel_lookup[cid]
            except Exception as exc:
                LOGGER.debug("Vector channel lookup failed: %s", exc)

        return jsonify(
            {
                "items": items,
                "totalResults": len(items),
                "offset": offset,
            }
        )

    @app.route("/api/transcript")
    def transcript():
        video_id = request.args.get("video_id", type=str)
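
For anyone who still needs the behaviour removed here, the Qdrant query path that /api/vector-search wrapped can be reproduced as a standalone script. This is a minimal sketch, not part of this commit: the Qdrant URL, collection name, and embedding model below are illustrative assumptions standing in for the values that AppConfig supplied via qdrant_url, qdrant_collection, and qdrant_embed_model.

# Standalone sketch of the query path the removed /api/vector-search endpoint wrapped.
# QDRANT_URL, COLLECTION, and MODEL_NAME are assumed placeholder values, not project config.
import requests
from sentence_transformers import SentenceTransformer

QDRANT_URL = "http://localhost:6333"
COLLECTION = "tlc_chunks"
MODEL_NAME = "intfloat/multilingual-e5-small"


def vector_search(query: str, limit: int = 10) -> list:
    # Embed the query the same way the removed embed_query() did:
    # an "query: " prefix and L2-normalised output.
    model = SentenceTransformer(MODEL_NAME)
    vector = model.encode([f"query: {query}"], normalize_embeddings=True)[0].tolist()

    body = {
        "vector": vector,  # for a named vector, send {"<vector_name>": vector} instead
        "limit": limit,
        "with_payload": True,
        "with_vectors": False,
    }
    resp = requests.post(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/search",
        json=body,
        timeout=20,
    )
    resp.raise_for_status()
    return resp.json().get("result", [])


if __name__ == "__main__":
    for point in vector_search("council budget meeting"):
        print(point.get("score"), (point.get("payload") or {}).get("title"))

Payload filters would go in a "filter" key of the request body, mirroring the field whitelisting that validate_qdrant_filter enforced.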