Harden search responses

This commit is contained in:
2026-01-08 15:42:21 -05:00
parent 1c95f47766
commit 1ac076e5f2

View File

@@ -1187,7 +1187,7 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
year = request.args.get("year", "", type=str) or None year = request.args.get("year", "", type=str) or None
sort = request.args.get("sort", "relevant", type=str) sort = request.args.get("sort", "relevant", type=str)
page = max(request.args.get("page", 0, type=int), 0) page = max(request.args.get("page", 0, type=int), 0)
size = max(request.args.get("size", 10, type=int), 1) size = min(max(request.args.get("size", 10, type=int), 1), MAX_QUERY_SIZE)
def parse_flag(name: str, default: bool = True) -> bool: def parse_flag(name: str, default: bool = True) -> bool:
value = request.args.get(name) value = request.args.get(name)
@@ -1215,6 +1215,10 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
include_external=include_external, include_external=include_external,
) )
start = page * size start = page * size
if start >= MAX_OFFSET:
return jsonify({"error": "offset_too_large", "maxOffset": MAX_OFFSET}), 400
if start + size > MAX_OFFSET:
size = max(1, MAX_OFFSET - start)
if config.elastic.debug: if config.elastic.debug:
LOGGER.info( LOGGER.info(
"Elasticsearch search request: %s", "Elasticsearch search request: %s",
@@ -1261,14 +1265,10 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
for value in (highlight_map.get("transcript_secondary_full", []) or []) for value in (highlight_map.get("transcript_secondary_full", []) or [])
] ]
title_html = ( title_highlight = highlight_map.get("title") or []
highlight_map.get("title") description_highlight = highlight_map.get("description") or []
or [source.get("title") or "Untitled"] title_html = title_highlight[0] if title_highlight else None
)[0] description_html = description_highlight[0] if description_highlight else None
description_html = (
highlight_map.get("description")
or [source.get("description") or ""]
)[0]
documents.append( documents.append(
{ {
"video_id": source.get("video_id"), "video_id": source.get("video_id"),