Improve highlight-to-transcript matching with multi-strategy algorithm
This commit is contained in:
parent
69bff7549c
commit
3e939a4beb
@ -354,11 +354,53 @@
|
||||
return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
|
||||
}
|
||||
|
||||
/**
 * Collect the text wrapped in <mark> tags within an HTML snippet.
 *
 * Each mark's text is lower-cased, has every run of whitespace collapsed to
 * a single space, and is trimmed; the non-empty results are joined with a
 * single space. Callers can therefore split the result on ' ' and never see
 * empty tokens or words glued together by newlines or tabs.
 *
 * @param {string} html - HTML snippet that may contain <mark> elements.
 * @returns {string|null} The normalized marked text, or null when the
 *   snippet has no <mark> element containing non-whitespace text.
 */
function extractMarkedText(html) {
  // A <template> parses markup into an inert fragment, so the snippet cannot
  // trigger resource loads or inline event handlers while being inspected.
  const template = document.createElement('template');
  template.innerHTML = html;

  const markedText = Array.from(template.content.querySelectorAll('mark'), (mark) =>
    mark.textContent.toLowerCase().replace(/\s+/g, ' ').trim(),
  )
    // Drop empty marks so they do not contribute stray separators.
    .filter((text) => text.length > 0)
    .join(' ');

  return markedText.length > 0 ? markedText : null;
}
|
||||
|
||||
function findMatchingSegment(transcriptDiv, searchText) {
|
||||
const segments = transcriptDiv.querySelectorAll('.transcript-segment');
|
||||
const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment'));
|
||||
const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();
|
||||
|
||||
// First try exact match
|
||||
// Strategy 1: Try to match the marked/highlighted words first (most reliable)
|
||||
const markedText = extractMarkedText(searchText);
|
||||
if (markedText) {
|
||||
const markedWords = markedText.split(' ').filter(w => w.length > 2);
|
||||
let bestMatch = null;
|
||||
let bestScore = 0;
|
||||
|
||||
for (const segment of segments) {
|
||||
const segmentText = segment.dataset.text;
|
||||
if (!segmentText) continue;
|
||||
|
||||
let matchCount = 0;
|
||||
for (const word of markedWords) {
|
||||
if (segmentText.includes(word)) {
|
||||
matchCount++;
|
||||
}
|
||||
}
|
||||
|
||||
const score = matchCount / markedWords.length;
|
||||
if (score > bestScore) {
|
||||
bestScore = score;
|
||||
bestMatch = segment;
|
||||
}
|
||||
}
|
||||
|
||||
// If we found a good match with marked words, use it
|
||||
if (bestMatch && bestScore >= 0.7) {
|
||||
return bestMatch;
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Try exact substring match
|
||||
for (const segment of segments) {
|
||||
const segmentText = segment.dataset.text;
|
||||
if (segmentText && segmentText.includes(normalized)) {
|
||||
@ -366,10 +408,33 @@
|
||||
}
|
||||
}
|
||||
|
||||
// If no exact match, try matching by words (at least 70% of words match)
|
||||
// Strategy 3: Try matching a sliding window of the search text
|
||||
// (since highlights may span multiple segments, try smaller chunks)
|
||||
const words = normalized.split(' ');
|
||||
if (words.length > 10) {
|
||||
// Try chunks of 8 consecutive words from the middle (most likely to be in one segment)
|
||||
const chunkSize = 8;
|
||||
const startIdx = Math.floor((words.length - chunkSize) / 2);
|
||||
const chunk = words.slice(startIdx, startIdx + chunkSize).join(' ');
|
||||
|
||||
for (const segment of segments) {
|
||||
const segmentText = segment.dataset.text;
|
||||
if (segmentText && segmentText.includes(chunk)) {
|
||||
return segment;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 4: Fuzzy word matching (at least 50% of words match)
|
||||
const searchWords = normalized.split(' ').filter(w => w.length > 2);
|
||||
if (searchWords.length === 0) return null;
|
||||
|
||||
// Take up to 15 most distinctive words (skip very common words)
|
||||
const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']);
|
||||
const distinctWords = searchWords
|
||||
.filter(w => !commonWords.has(w))
|
||||
.slice(0, 15);
|
||||
|
||||
let bestMatch = null;
|
||||
let bestScore = 0;
|
||||
|
||||
@ -378,14 +443,25 @@
|
||||
if (!segmentText) continue;
|
||||
|
||||
let matchCount = 0;
|
||||
for (const word of searchWords) {
|
||||
if (segmentText.includes(word)) {
|
||||
let consecutiveMatches = 0;
|
||||
let maxConsecutive = 0;
|
||||
|
||||
for (let i = 0; i < distinctWords.length; i++) {
|
||||
if (segmentText.includes(distinctWords[i])) {
|
||||
matchCount++;
|
||||
consecutiveMatches++;
|
||||
maxConsecutive = Math.max(maxConsecutive, consecutiveMatches);
|
||||
} else {
|
||||
consecutiveMatches = 0;
|
||||
}
|
||||
}
|
||||
|
||||
const score = matchCount / searchWords.length;
|
||||
if (score > bestScore && score >= 0.5) {
|
||||
// Score considers both match percentage and consecutive matches (phrase matches)
|
||||
const matchScore = matchCount / distinctWords.length;
|
||||
const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3;
|
||||
const score = matchScore + consecutiveBonus;
|
||||
|
||||
if (score > bestScore && score >= 0.4) {
|
||||
bestScore = score;
|
||||
bestMatch = segment;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user