From 3e939a4bebccb2fef4146adf6ff9eb73c846904b Mon Sep 17 00:00:00 2001 From: knight Date: Sun, 2 Nov 2025 01:26:53 -0400 Subject: [PATCH] Improve highlight-to-transcript matching with multi-strategy algorithm --- static/app.js | 90 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 7 deletions(-) diff --git a/static/app.js b/static/app.js index e1e7cec..240d41c 100644 --- a/static/app.js +++ b/static/app.js @@ -354,11 +354,53 @@ return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' '); } + function extractMarkedText(html) { + // Extract text from <mark> tags as these are the actual search matches + const temp = document.createElement('div'); + temp.innerHTML = html; + const marks = temp.querySelectorAll('mark'); + if (marks.length > 0) { + return Array.from(marks).map(m => m.textContent.trim().toLowerCase()).join(' '); + } + return null; + } + function findMatchingSegment(transcriptDiv, searchText) { - const segments = transcriptDiv.querySelectorAll('.transcript-segment'); + const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment')); const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim(); - // First try exact match + // Strategy 1: Try to match the marked/highlighted words first (most reliable) + const markedText = extractMarkedText(searchText); + if (markedText) { + const markedWords = markedText.split(' ').filter(w => w.length > 2); + let bestMatch = null; + let bestScore = 0; + + for (const segment of segments) { + const segmentText = segment.dataset.text; + if (!segmentText) continue; + + let matchCount = 0; + for (const word of markedWords) { + if (segmentText.includes(word)) { + matchCount++; + } + } + + const score = matchCount / markedWords.length; + if (score > bestScore) { + bestScore = score; + bestMatch = segment; + } + } + + // If we found a good match with marked words, use it + if (bestMatch && bestScore >= 0.7) { + return bestMatch; + } + } + + // Strategy 2: Try exact 
substring match for (const segment of segments) { const segmentText = segment.dataset.text; if (segmentText && segmentText.includes(normalized)) { @@ -366,10 +408,33 @@ } } - // If no exact match, try matching by words (at least 70% of words match) + // Strategy 3: Try matching a sliding window of the search text + // (since highlights may span multiple segments, try smaller chunks) + const words = normalized.split(' '); + if (words.length > 10) { + // Try chunks of 8 consecutive words from the middle (most likely to be in one segment) + const chunkSize = 8; + const startIdx = Math.floor((words.length - chunkSize) / 2); + const chunk = words.slice(startIdx, startIdx + chunkSize).join(' '); + + for (const segment of segments) { + const segmentText = segment.dataset.text; + if (segmentText && segmentText.includes(chunk)) { + return segment; + } + } + } + + // Strategy 4: Fuzzy word matching (at least 50% of words match) const searchWords = normalized.split(' ').filter(w => w.length > 2); if (searchWords.length === 0) return null; + // Take up to 15 most distinctive words (skip very common words) + const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']); + const distinctWords = searchWords + .filter(w => !commonWords.has(w)) + .slice(0, 15); + let bestMatch = null; let bestScore = 0; @@ -378,14 +443,25 @@ if (!segmentText) continue; let matchCount = 0; - for (const word of searchWords) { - if (segmentText.includes(word)) { + let consecutiveMatches = 0; + let maxConsecutive = 0; + + for (let i = 0; i < distinctWords.length; i++) { + if (segmentText.includes(distinctWords[i])) { matchCount++; + consecutiveMatches++; + maxConsecutive = Math.max(maxConsecutive, consecutiveMatches); + } else { + consecutiveMatches = 0; } } - const score = matchCount / searchWords.length; - if (score > bestScore && score >= 0.5) { + // Score considers both match percentage and consecutive 
matches (phrase matches) + const matchScore = matchCount / distinctWords.length; + const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3; + const score = matchScore + consecutiveBonus; + + if (score > bestScore && score >= 0.4) { bestScore = score; bestMatch = segment; }