Improve highlight-to-transcript matching with multi-strategy algorithm

2025-11-02 01:26:53 -04:00 · 2025-11-02 01:26:53 -04:00 · 3e939a4beb
commit 3e939a4beb
parent 69bff7549c
1 changed file with 83 additions and 7 deletions
--- a/static/app.js
+++ b/static/app.js
@ -354,11 +354,53 @@
    return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
  }
  /**
   * Extract the text wrapped in <mark> tags, as these are the actual search matches.
   *
   * Each mark's text is lowercased and has every run of whitespace collapsed to a
   * single space, so the result can be split on ' ' into individual words.
   *
   * NOTE(review): `html` is parsed by assigning to innerHTML on a detached element;
   * confirm callers only pass markup from a trusted source.
   *
   * @param {string} html - HTML snippet that may contain <mark> elements.
   * @returns {string|null} The marked text joined by single spaces, or null when
   *   the snippet has no <mark> element with non-whitespace text.
   */
  function extractMarkedText(html) {
    const temp = document.createElement('div');
    temp.innerHTML = html;

    const markedParts = Array.from(temp.querySelectorAll('mark'))
      // Collapse newlines/tabs/repeated spaces inside a mark; a plain trim()
      // would leave them in place and break word splitting on ' '.
      .map((mark) => mark.textContent.toLowerCase().replace(/\s+/g, ' ').trim())
      // Empty marks would otherwise add stray separators, or yield a
      // whitespace-only (truthy) result with no actual words in it.
      .filter((text) => text.length > 0);

    return markedParts.length > 0 ? markedParts.join(' ') : null;
  }
  function findMatchingSegment(transcriptDiv, searchText) {
-    const segments = transcriptDiv.querySelectorAll('.transcript-segment');
+    const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment'));
    const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();
-    // First try exact match
+    // Strategy 1: Try to match the marked/highlighted words first (most reliable)
    const markedText = extractMarkedText(searchText);
    if (markedText) {
      const markedWords = markedText.split(' ').filter(w => w.length > 2);
      let bestMatch = null;
      let bestScore = 0;
      for (const segment of segments) {
        const segmentText = segment.dataset.text;
        if (!segmentText) continue;
        let matchCount = 0;
        for (const word of markedWords) {
          if (segmentText.includes(word)) {
            matchCount++;
          }
        }
        const score = matchCount / markedWords.length;
        if (score > bestScore) {
          bestScore = score;
          bestMatch = segment;
        }
      }
      // If we found a good match with marked words, use it
      if (bestMatch && bestScore >= 0.7) {
        return bestMatch;
      }
    }
    // Strategy 2: Try exact substring match
    for (const segment of segments) {
      const segmentText = segment.dataset.text;
      if (segmentText && segmentText.includes(normalized)) {
@ -366,10 +408,33 @@
      }
    }
-    // If no exact match, try matching by words (at least 70% of words match)
+    // Strategy 3: Try matching a sliding window of the search text
    // (since highlights may span multiple segments, try smaller chunks)
    const words = normalized.split(' ');
    if (words.length > 10) {
      // Try chunks of 8 consecutive words from the middle (most likely to be in one segment)
      const chunkSize = 8;
      const startIdx = Math.floor((words.length - chunkSize) / 2);
      const chunk = words.slice(startIdx, startIdx + chunkSize).join(' ');
      for (const segment of segments) {
        const segmentText = segment.dataset.text;
        if (segmentText && segmentText.includes(chunk)) {
          return segment;
        }
      }
    }
    // Strategy 4: Fuzzy word matching (at least 50% of words match)
    const searchWords = normalized.split(' ').filter(w => w.length > 2);
    if (searchWords.length === 0) return null;
    // Take up to 15 most distinctive words (skip very common words)
    const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']);
    const distinctWords = searchWords
      .filter(w => !commonWords.has(w))
      .slice(0, 15);
    let bestMatch = null;
    let bestScore = 0;
@ -378,14 +443,25 @@
      if (!segmentText) continue;
      let matchCount = 0;
-      for (const word of searchWords) {
+      let consecutiveMatches = 0;
-        if (segmentText.includes(word)) {
+      let maxConsecutive = 0;
      for (let i = 0; i < distinctWords.length; i++) {
        if (segmentText.includes(distinctWords[i])) {
          matchCount++;
          consecutiveMatches++;
          maxConsecutive = Math.max(maxConsecutive, consecutiveMatches);
        } else {
          consecutiveMatches = 0;
        }
      }
-      const score = matchCount / searchWords.length;
+      // Score considers both match percentage and consecutive matches (phrase matches)
-      if (score > bestScore && score >= 0.5) {
+      const matchScore = matchCount / distinctWords.length;
      const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3;
      const score = matchScore + consecutiveBonus;
      if (score > bestScore && score >= 0.4) {
        bestScore = score;
        bestMatch = segment;
      }