Improve highlight-to-transcript matching with multi-strategy algorithm
This commit is contained in:
parent
69bff7549c
commit
3e939a4beb
@ -354,11 +354,53 @@
|
|||||||
return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
|
return temp.textContent.trim().toLowerCase().replace(/\s+/g, ' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collect the text of every <mark> element in an HTML fragment.
 *
 * The <mark> tags wrap the actual search matches, so their text is the
 * most specific part of a highlight to look for in the transcript.
 *
 * @param {string} html - HTML fragment that may contain <mark> elements.
 * @returns {string|null} The lower-cased text of all marks joined by single
 *   spaces, with every run of whitespace collapsed to one space; `null` when
 *   the fragment has no <mark> elements or the marks contain no text.
 */
function extractMarkedText(html) {
  // NOTE(review): assigning to innerHTML parses `html` as markup; confirm
  // callers only pass trusted or already-sanitized highlight HTML.
  const temp = document.createElement('div');
  temp.innerHTML = html;

  const marks = temp.querySelectorAll('mark');
  if (marks.length === 0) {
    return null;
  }

  // Collapse whitespace the same way the plain-text normalizer does, so that
  // splitting the result on ' ' yields clean words even when a mark contains
  // newlines, tabs or repeated spaces, and empty marks leave no stray gaps.
  const combined = Array.from(marks, (mark) => mark.textContent)
    .join(' ')
    .toLowerCase()
    .replace(/\s+/g, ' ')
    .trim();

  return combined === '' ? null : combined;
}
|
||||||
|
|
||||||
function findMatchingSegment(transcriptDiv, searchText) {
|
function findMatchingSegment(transcriptDiv, searchText) {
|
||||||
const segments = transcriptDiv.querySelectorAll('.transcript-segment');
|
const segments = Array.from(transcriptDiv.querySelectorAll('.transcript-segment'));
|
||||||
const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();
|
const normalized = searchText.toLowerCase().replace(/\s+/g, ' ').trim();
|
||||||
|
|
||||||
// First try exact match
|
// Strategy 1: Try to match the marked/highlighted words first (most reliable)
|
||||||
|
const markedText = extractMarkedText(searchText);
|
||||||
|
if (markedText) {
|
||||||
|
const markedWords = markedText.split(' ').filter(w => w.length > 2);
|
||||||
|
let bestMatch = null;
|
||||||
|
let bestScore = 0;
|
||||||
|
|
||||||
|
for (const segment of segments) {
|
||||||
|
const segmentText = segment.dataset.text;
|
||||||
|
if (!segmentText) continue;
|
||||||
|
|
||||||
|
let matchCount = 0;
|
||||||
|
for (const word of markedWords) {
|
||||||
|
if (segmentText.includes(word)) {
|
||||||
|
matchCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const score = matchCount / markedWords.length;
|
||||||
|
if (score > bestScore) {
|
||||||
|
bestScore = score;
|
||||||
|
bestMatch = segment;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we found a good match with marked words, use it
|
||||||
|
if (bestMatch && bestScore >= 0.7) {
|
||||||
|
return bestMatch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strategy 2: Try exact substring match
|
||||||
for (const segment of segments) {
|
for (const segment of segments) {
|
||||||
const segmentText = segment.dataset.text;
|
const segmentText = segment.dataset.text;
|
||||||
if (segmentText && segmentText.includes(normalized)) {
|
if (segmentText && segmentText.includes(normalized)) {
|
||||||
@ -366,10 +408,33 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If no exact match, try matching by words (at least 70% of words match)
|
// Strategy 3: Try matching a sliding window of the search text
|
||||||
|
// (since highlights may span multiple segments, try smaller chunks)
|
||||||
|
const words = normalized.split(' ');
|
||||||
|
if (words.length > 10) {
|
||||||
|
// Try chunks of 8 consecutive words from the middle (most likely to be in one segment)
|
||||||
|
const chunkSize = 8;
|
||||||
|
const startIdx = Math.floor((words.length - chunkSize) / 2);
|
||||||
|
const chunk = words.slice(startIdx, startIdx + chunkSize).join(' ');
|
||||||
|
|
||||||
|
for (const segment of segments) {
|
||||||
|
const segmentText = segment.dataset.text;
|
||||||
|
if (segmentText && segmentText.includes(chunk)) {
|
||||||
|
return segment;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strategy 4: Fuzzy word matching (at least 50% of words match)
|
||||||
const searchWords = normalized.split(' ').filter(w => w.length > 2);
|
const searchWords = normalized.split(' ').filter(w => w.length > 2);
|
||||||
if (searchWords.length === 0) return null;
|
if (searchWords.length === 0) return null;
|
||||||
|
|
||||||
|
// Take up to 15 most distinctive words (skip very common words)
|
||||||
|
const commonWords = new Set(['the', 'and', 'that', 'this', 'with', 'for', 'are', 'but', 'not', 'you', 'have', 'from', 'was', 'been', 'has', 'had', 'were']);
|
||||||
|
const distinctWords = searchWords
|
||||||
|
.filter(w => !commonWords.has(w))
|
||||||
|
.slice(0, 15);
|
||||||
|
|
||||||
let bestMatch = null;
|
let bestMatch = null;
|
||||||
let bestScore = 0;
|
let bestScore = 0;
|
||||||
|
|
||||||
@ -378,14 +443,25 @@
|
|||||||
if (!segmentText) continue;
|
if (!segmentText) continue;
|
||||||
|
|
||||||
let matchCount = 0;
|
let matchCount = 0;
|
||||||
for (const word of searchWords) {
|
let consecutiveMatches = 0;
|
||||||
if (segmentText.includes(word)) {
|
let maxConsecutive = 0;
|
||||||
|
|
||||||
|
for (let i = 0; i < distinctWords.length; i++) {
|
||||||
|
if (segmentText.includes(distinctWords[i])) {
|
||||||
matchCount++;
|
matchCount++;
|
||||||
|
consecutiveMatches++;
|
||||||
|
maxConsecutive = Math.max(maxConsecutive, consecutiveMatches);
|
||||||
|
} else {
|
||||||
|
consecutiveMatches = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const score = matchCount / searchWords.length;
|
// Score considers both match percentage and consecutive matches (phrase matches)
|
||||||
if (score > bestScore && score >= 0.5) {
|
const matchScore = matchCount / distinctWords.length;
|
||||||
|
const consecutiveBonus = maxConsecutive / distinctWords.length * 0.3;
|
||||||
|
const score = matchScore + consecutiveBonus;
|
||||||
|
|
||||||
|
if (score > bestScore && score >= 0.4) {
|
||||||
bestScore = score;
|
bestScore = score;
|
||||||
bestMatch = segment;
|
bestMatch = segment;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user