From 3e939a4bebccb2fef4146adf6ff9eb73c846904b Mon Sep 17 00:00:00 2001
From: knight <gitea@ghost.tel>
Date: Sun, 2 Nov 2025 01:26:53 -0400
Subject: [PATCH] Improve highlight-to-transcript matching with multi-strategy
 algorithm

---
 static/app.js | 90 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 7 deletions(-)
diff --git a/static/app.js b/static/app.js
index e1e7cec..240d41c 100644
--- a/static/app.js
+++ b/static/app.js
@@ -354,11 +354,53 @@
     return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
   }
 
+  function extractMarkedText(html) {
+    // Returns the lowercased text of all <mark> elements in `html` (the actual
+    // search hits) joined by single spaces, or null when there is no such text.
+    // Whitespace runs are collapsed because the result is later split on ' '.
+    const temp = document.createElement('div');
+    temp.innerHTML = html;
+    const marked = Array.from(temp.querySelectorAll('mark'), m => m.textContent).join(' ');
+    const text = marked.toLowerCase().replace(/\s+/g, ' ').trim();
+    return text.length > 0 ? text : null;
+  }
+
   function findMatchingSegment(transcriptDiv, searchText) {
-    const segments = transcriptDiv.querySelectorAll('.transcript-segment');
+    const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment'));
     const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();
 
-    // First try exact match
+    // Strategy 1: Try to match the marked/highlighted words first (most reliable)
+    const markedText = extractMarkedText(searchText);
+    if (markedText) {
+      const markedWords = markedText.split(' ').filter(w => w.length > 2);
+      let bestMatch = null;
+      let bestScore = 0;
+
+      for (const segment of segments) {
+        const segmentText = segment.dataset.text;
+        if (!segmentText) continue;
+
+        let matchCount = 0;
+        for (const word of markedWords) {
+          if (segmentText.includes(word)) {
+            matchCount++;
+          }
+        }
+
+        const score = matchCount / markedWords.length;
+        if (score > bestScore) {
+          bestScore = score;
+          bestMatch = segment;
+        }
+      }
+
+      // If we found a good match with marked words, use it
+      if (bestMatch && bestScore >= 0.7) {
+        return bestMatch;
+      }
+    }
+
+    // Strategy 2: Try exact substring match
     for (const segment of segments) {
       const segmentText = segment.dataset.text;
       if (segmentText && segmentText.includes(normalized)) {
@@ -366,10 +408,33 @@
       }
     }
 
-    // If no exact match, try matching by words (at least 70% of words match)
+    // Strategy 3: Try matching one window taken from the middle of the search text
+    // (since highlights may span multiple segments, try smaller chunks)
+    const words = normalized.split(' ');
+    if (words.length > 10) {
+      // Try one chunk of 8 consecutive words from the middle (most likely to be in one segment)
+      const chunkSize = 8;
+      const startIdx = Math.floor((words.length - chunkSize) / 2);
+      const chunk = words.slice(startIdx, startIdx + chunkSize).join(' ');
+
+      for (const segment of segments) {
+        const segmentText = segment.dataset.text;
+        if (segmentText && segmentText.includes(chunk)) {
+          return segment;
+        }
+      }
+    }
+
+    // Strategy 4: Fuzzy word matching (combined score must reach at least 0.4)
     const searchWords = normalized.split(' ').filter(w => w.length > 2);
     if (searchWords.length === 0) return null;
 
+    // Keep the first 15 words that are not very common (stop-word filtering)
+    const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']);
+    const distinctWords = searchWords
+      .filter(w => !commonWords.has(w))
+      .slice(0, 15);
+
     let bestMatch = null;
     let bestScore = 0;
 
@@ -378,14 +443,25 @@
       if (!segmentText) continue;
 
       let matchCount = 0;
-      for (const word of searchWords) {
-        if (segmentText.includes(word)) {
+      let consecutiveMatches = 0;
+      let maxConsecutive = 0;
+
+      for (let i = 0; i < distinctWords.length; i++) {
+        if (segmentText.includes(distinctWords[i])) {
           matchCount++;
+          consecutiveMatches++;
+          maxConsecutive = Math.max(maxConsecutive, consecutiveMatches);
+        } else {
+          consecutiveMatches = 0;
         }
       }
 
-      const score = matchCount / searchWords.length;
-      if (score > bestScore && score >= 0.5) {
+      // Score considers both match percentage and consecutive matches (phrase matches)
+      const matchScore = matchCount / distinctWords.length;
+      const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3;
+      const score = matchScore + consecutiveBonus;
+
+      if (score > bestScore && score >= 0.4) {
         bestScore = score;
         bestMatch = segment;
       }