Improve highlight-to-transcript matching with multi-strategy algorithm

2025-11-02 01:26:53 -04:00 · 2025-11-02 01:26:53 -04:00 · 3e939a4beb
commit 3e939a4beb
parent 69bff7549c
1 changed files with 83 additions and 7 deletions
--- a/static/app.js
+++ b/static/app.js
@ -354,11 +354,53 @@
    return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
  }

+  function extractMarkedText(html) { // html: markup string that may contain <mark> elements
+    // Collect the text inside <mark> tags (the actual search matches); each piece is trimmed and lowercased, pieces are joined with single spaces; returns null when no <mark> is present
+    const temp = document.createElement('div'); // detached scratch element used only for parsing
+    temp.innerHTML = html; // NOTE(review): parses html as markup; confirm callers only pass trusted/sanitized strings
+    const marks = temp.querySelectorAll('mark');
+    if (marks.length > 0) {
+      return Array.from(marks).map(m => m.textContent.trim().toLowerCase()).join(' '); // whitespace inside a single mark is not collapsed
+    }
+    return null; // no highlighted terms found; callers must handle null
+  }
+
  function findMatchingSegment(transcriptDiv, searchText) {
-    const segments = transcriptDiv.querySelectorAll('.transcript-segment');
+    const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment'));
    const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();

-    // First try exact match
+    // Strategy 1: Try to match the marked/highlighted words first (most reliable)
+    const markedText = extractMarkedText(searchText);
+    if (markedText) {
+      const markedWords = markedText.split(' ').filter(w => w.length > 2);
+      let bestMatch = null;
+      let bestScore = 0;
+
+      for (const segment of segments) {
+        const segmentText = segment.dataset.text;
+        if (!segmentText) continue;
+
+        let matchCount = 0;
+        for (const word of markedWords) {
+          if (segmentText.includes(word)) {
+            matchCount++;
+          }
+        }
+
+        const score = matchCount / markedWords.length;
+        if (score > bestScore) {
+          bestScore = score;
+          bestMatch = segment;
+        }
+      }
+
+      // If we found a good match with marked words, use it
+      if (bestMatch && bestScore >= 0.7) {
+        return bestMatch;
+      }
+    }
+
+    // Strategy 2: Try exact substring match
    for (const segment of segments) {
      const segmentText = segment.dataset.text;
      if (segmentText && segmentText.includes(normalized)) {
@ -366,10 +408,33 @@
      }
    }

-    // If no exact match, try matching by words (at least 70% of words match)
+    // Strategy 3: Try matching a single fixed-size window taken from the middle of the search text
+    // (since highlights may span multiple segments, a smaller chunk is more likely to fit in one)
+    const words = normalized.split(' ');
+    if (words.length > 10) {
+      // Try chunks of 8 consecutive words from the middle (most likely to be in one segment)
+      const chunkSize = 8;
+      const startIdx = Math.floor((words.length - chunkSize) / 2);
+      const chunk = words.slice(startIdx, startIdx + chunkSize).join(' ');
+
+      for (const segment of segments) {
+        const segmentText = segment.dataset.text;
+        if (segmentText && segmentText.includes(chunk)) {
+          return segment;
+        }
+      }
+    }
+
+    // Strategy 4: Fuzzy word matching (match ratio plus a consecutive-match bonus must reach a score of 0.4)
    const searchWords = normalized.split(' ').filter(w => w.length > 2);
    if (searchWords.length === 0) return null;

+    // Take up to the first 15 words that are not in the common-word list below
+    const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']);
+    const distinctWords = searchWords
+      .filter(w => !commonWords.has(w))
+      .slice(0, 15);
+
    let bestMatch = null;
    let bestScore = 0;

@ -378,14 +443,25 @@
      if (!segmentText) continue;

      let matchCount = 0;
-      for (const word of searchWords) {
-        if (segmentText.includes(word)) {
+      let consecutiveMatches = 0;
+      let maxConsecutive = 0;
+
+      for (let i = 0; i < distinctWords.length; i++) {
+        if (segmentText.includes(distinctWords[i])) {
          matchCount++;
+          consecutiveMatches++;
+          maxConsecutive = Math.max(maxConsecutive, consecutiveMatches);
+        } else {
+          consecutiveMatches = 0;
        }
      }

-      const score = matchCount / searchWords.length;
-      if (score > bestScore && score >= 0.5) {
+      // Score considers both match percentage and consecutive matches (phrase matches)
+      const matchScore = matchCount / distinctWords.length;
+      const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3;
+      const score = matchScore + consecutiveBonus;
+
+      if (score > bestScore && score >= 0.4) {
        bestScore = score;
        bestMatch = segment;
      }