Fix secondary transcript timestamps by indexing parts

Previously, secondary transcript parts were not being indexed
into Elasticsearch, causing the frontend to receive empty arrays
and display zero timestamps.

Changes:
- Add transcript_secondary_parts to Elasticsearch mapping
- Include secondary parts in bulk indexing actions
- Build transcript_secondary_full text from the secondary parts when it is not provided
- Match primary transcript structure (nested with start/duration/text)

Note: Existing data needs to be re-indexed for this fix to apply
to videos that already have secondary transcripts.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
knight 2025-11-05 00:54:50 -05:00
parent b096e2aeeb
commit 2efe5e0c79

View File

@@ -74,6 +74,16 @@ def build_bulk_actions(
segment.get("text", "") if isinstance(segment, dict) else str(segment)
for segment in parts
).strip()
# Handle secondary transcript parts
secondary_parts = doc.get("transcript_secondary_parts") or []
transcript_secondary_full = doc.get("transcript_secondary_full")
if not transcript_secondary_full and isinstance(secondary_parts, list):
transcript_secondary_full = " ".join(
segment.get("text", "") if isinstance(segment, dict) else str(segment)
for segment in secondary_parts
).strip()
yield {
"_id": video_id,
"_index": index or CONFIG.elastic.index,
@@ -88,8 +98,9 @@ def build_bulk_actions(
"url": doc.get("url"),
"duration": doc.get("duration"),
"transcript_full": transcript_full,
"transcript_secondary_full": doc.get("transcript_secondary_full"),
"transcript_secondary_full": transcript_secondary_full,
"transcript_parts": parts,
"transcript_secondary_parts": secondary_parts,
},
}
@@ -121,6 +132,14 @@ def ensure_index(client: "Elasticsearch", index: str) -> None:
"text": {"type": "text"},
},
},
"transcript_secondary_parts": {
"type": "nested",
"properties": {
"start": {"type": "float"},
"duration": {"type": "float"},
"text": {"type": "text"},
},
},
}
},
)