From 2efe5e0c799d2a177a520506a22b7fcb037ffe47 Mon Sep 17 00:00:00 2001 From: knight Date: Wed, 5 Nov 2025 00:54:50 -0500 Subject: [PATCH] Fix secondary transcript timestamps by indexing parts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, secondary transcript parts were not being indexed into Elasticsearch, causing the frontend to receive empty arrays and display zero timestamps. Changes: - Add transcript_secondary_parts to Elasticsearch mapping - Include secondary parts in bulk indexing actions - Build secondary_full text from parts if not provided - Match primary transcript structure (nested with start/duration/text) Note: Existing data needs to be re-indexed for this fix to apply to videos that already have secondary transcripts. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- ingest.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ingest.py b/ingest.py index ec61478..a87239c 100644 --- a/ingest.py +++ b/ingest.py @@ -74,6 +74,16 @@ def build_bulk_actions( segment.get("text", "") if isinstance(segment, dict) else str(segment) for segment in parts ).strip() + + # Handle secondary transcript parts + secondary_parts = doc.get("transcript_secondary_parts") or [] + transcript_secondary_full = doc.get("transcript_secondary_full") + if not transcript_secondary_full and isinstance(secondary_parts, list): + transcript_secondary_full = " ".join( + segment.get("text", "") if isinstance(segment, dict) else str(segment) + for segment in secondary_parts + ).strip() + yield { "_id": video_id, "_index": index or CONFIG.elastic.index, @@ -88,8 +98,9 @@ def build_bulk_actions( "url": doc.get("url"), "duration": doc.get("duration"), "transcript_full": transcript_full, - "transcript_secondary_full": doc.get("transcript_secondary_full"), + "transcript_secondary_full": transcript_secondary_full, "transcript_parts": parts, + "transcript_secondary_parts": secondary_parts, }, } @@ -121,6 +132,14 @@ def ensure_index(client: "Elasticsearch", index: str) -> None: "text": {"type": "text"}, }, }, + "transcript_secondary_parts": { + "type": "nested", + "properties": { + "start": {"type": "float"}, + "duration": {"type": "float"}, + "text": {"type": "text"}, + }, + }, } }, )