Improve highlight-to-transcript matching with multi-strategy algorithm

This commit is contained in:
knight 2025-11-02 01:26:53 -04:00
parent 69bff7549c
commit 3e939a4beb

View File

@ -354,11 +354,53 @@
return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
}
/**
 * Collect the text of every <mark> element found in an HTML snippet.
 *
 * The snippet is parsed by assigning it to the innerHTML of a detached <div>,
 * then each <mark> element's text is lowercased and whitespace-normalized
 * (runs of whitespace collapsed to one space, ends trimmed). Marks that are
 * empty after normalization are skipped, so the result never contains
 * leading, trailing or doubled spaces and can be split on ' ' directly.
 *
 * NOTE(review): the snippet goes through innerHTML; confirm callers only pass
 * trusted or sanitized highlight markup.
 *
 * @param {string} html - HTML fragment that may contain <mark> elements.
 * @returns {string|null} The normalized mark texts joined by single spaces,
 *     or null when the snippet has no <mark> element with visible text.
 */
function extractMarkedText(html) {
  const container = document.createElement('div');
  container.innerHTML = html;

  const highlighted = [];
  for (const mark of Array.from(container.querySelectorAll('mark'))) {
    // Collapse newlines/tabs/NBSP inside a mark so multi-word highlights
    // break into clean words when the result is split on single spaces.
    const text = mark.textContent.replace(/\s+/g, ' ').trim().toLowerCase();
    if (text !== '') {
      highlighted.push(text);
    }
  }

  // Whitespace-only marks carry no searchable words; report "no marks"
  // instead of returning a blank (but truthy) string.
  return highlighted.length > 0 ? highlighted.join(' ') : null;
}
function findMatchingSegment(transcriptDiv, searchText) {
const segments = transcriptDiv.querySelectorAll('.transcript-segment');
const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment'));
const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();
// First try exact match
// Strategy 1: Try to match the marked/highlighted words first (most reliable)
const markedText = extractMarkedText(searchText);
if (markedText) {
const markedWords = markedText.split(' ').filter(w => w.length > 2);
let bestMatch = null;
let bestScore = 0;
for (const segment of segments) {
const segmentText = segment.dataset.text;
if (!segmentText) continue;
let matchCount = 0;
for (const word of markedWords) {
if (segmentText.includes(word)) {
matchCount++;
}
}
const score = matchCount / markedWords.length;
if (score > bestScore) {
bestScore = score;
bestMatch = segment;
}
}
// If we found a good match with marked words, use it
if (bestMatch && bestScore >= 0.7) {
return bestMatch;
}
}
// Strategy 2: Try exact substring match
for (const segment of segments) {
const segmentText = segment.dataset.text;
if (segmentText && segmentText.includes(normalized)) {
@ -366,10 +408,33 @@
}
}
// If no exact match, try matching by words (at least 70% of words match)
// Strategy 3: Try matching a sliding window of the search text
// (since highlights may span multiple segments, try smaller chunks)
const words = normalized.split(' ');
if (words.length > 10) {
// Try chunks of 8 consecutive words from the middle (most likely to be in one segment)
const chunkSize = 8;
const startIdx = Math.floor((words.length - chunkSize) / 2);
const chunk = words.slice(startIdx, startIdx + chunkSize).join(' ');
for (const segment of segments) {
const segmentText = segment.dataset.text;
if (segmentText && segmentText.includes(chunk)) {
return segment;
}
}
}
// Strategy 4: Fuzzy word matching (at least 50% of words match)
const searchWords = normalized.split(' ').filter(w => w.length > 2);
if (searchWords.length === 0) return null;
// Take up to 15 most distinctive words (skip very common words)
const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']);
const distinctWords = searchWords
.filter(w => !commonWords.has(w))
.slice(0, 15);
let bestMatch = null;
let bestScore = 0;
@ -378,14 +443,25 @@
if (!segmentText) continue;
let matchCount = 0;
for (const word of searchWords) {
if (segmentText.includes(word)) {
let consecutiveMatches = 0;
let maxConsecutive = 0;
for (let i = 0; i < distinctWords.length; i++) {
if (segmentText.includes(distinctWords[i])) {
matchCount++;
consecutiveMatches++;
maxConsecutive = Math.max(maxConsecutive, consecutiveMatches);
} else {
consecutiveMatches = 0;
}
}
const score = matchCount / searchWords.length;
if (score > bestScore && score >= 0.5) {
// Score considers both match percentage and consecutive matches (phrase matches)
const matchScore = matchCount / distinctWords.length;
const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3;
const score = matchScore + consecutiveBonus;
if (score > bestScore && score >= 0.4) {
bestScore = score;
bestMatch = segment;
}