Fix secondary transcript timestamps by indexing parts

Previously, secondary transcript parts were not indexed into Elasticsearch, so the frontend received empty arrays and displayed zero timestamps.

Changes:
- Add transcript_secondary_parts to the Elasticsearch mapping
- Include secondary parts in bulk indexing actions
- Build the transcript_secondary_full text from the parts if it is not provided
- Match the primary transcript structure (nested, with start/duration/text)

Note: Existing data must be re-indexed for this fix to apply to videos that already have secondary transcripts.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 00:54:50 -05:00
parent b096e2aeeb
commit 2efe5e0c79
1 changed file with 20 additions and 1 deletion
--- a/ingest.py
+++ b/ingest.py
@@ -74,6 +74,16 @@ def build_bulk_actions(
                segment.get("text", "") if isinstance(segment, dict) else str(segment)
                for segment in parts
            ).strip()
+
+        # Handle secondary transcript parts
+        secondary_parts = doc.get("transcript_secondary_parts") or []
+        transcript_secondary_full = doc.get("transcript_secondary_full")
+        if not transcript_secondary_full and isinstance(secondary_parts, list):
+            transcript_secondary_full = " ".join(
+                segment.get("text", "") if isinstance(segment, dict) else str(segment)
+                for segment in secondary_parts
+            ).strip()
+
        yield {
            "_id": video_id,
            "_index": index or CONFIG.elastic.index,
@@ -88,8 +98,9 @@ def build_bulk_actions(
                "url": doc.get("url"),
                "duration": doc.get("duration"),
                "transcript_full": transcript_full,
-                "transcript_secondary_full": doc.get("transcript_secondary_full"),
+                "transcript_secondary_full": transcript_secondary_full,
                "transcript_parts": parts,
+                "transcript_secondary_parts": secondary_parts,
            },
        }

@@ -121,6 +132,14 @@ def ensure_index(client: "Elasticsearch", index: str) -> None:
                        "text": {"type": "text"},
                    },
                },
+                "transcript_secondary_parts": {
+                    "type": "nested",
+                    "properties": {
+                        "start": {"type": "float"},
+                        "duration": {"type": "float"},
+                        "text": {"type": "text"},
+                    },
+                },
            }
        },
    )