diff --git a/ingest.py b/ingest.py index ec61478..a87239c 100644 --- a/ingest.py +++ b/ingest.py @@ -74,6 +74,16 @@ def build_bulk_actions( segment.get("text", "") if isinstance(segment, dict) else str(segment) for segment in parts ).strip() + + # Handle secondary transcript parts + secondary_parts = doc.get("transcript_secondary_parts") or [] + transcript_secondary_full = doc.get("transcript_secondary_full") + if not transcript_secondary_full and isinstance(secondary_parts, list): + transcript_secondary_full = " ".join( + segment.get("text", "") if isinstance(segment, dict) else str(segment) + for segment in secondary_parts + ).strip() + yield { "_id": video_id, "_index": index or CONFIG.elastic.index, @@ -88,8 +98,9 @@ def build_bulk_actions( "url": doc.get("url"), "duration": doc.get("duration"), "transcript_full": transcript_full, - "transcript_secondary_full": doc.get("transcript_secondary_full"), + "transcript_secondary_full": transcript_secondary_full, "transcript_parts": parts, + "transcript_secondary_parts": secondary_parts, }, } @@ -121,6 +132,14 @@ def ensure_index(client: "Elasticsearch", index: str) -> None: "text": {"type": "text"}, }, }, + "transcript_secondary_parts": { + "type": "nested", + "properties": { + "start": {"type": "float"}, + "duration": {"type": "float"}, + "text": {"type": "text"}, + }, + }, } }, )