refactor: enhance anchor-based search strategy in Levenshtein match

- Improved the `identifyAnchors` function to return the first and last non-empty lines of the search string.
- Updated the `findAnchorMatch` function to validate anchor uniqueness and context more effectively.
- Removed unused complexity calculations and streamlined the anchor validation process.
This commit is contained in:
Daniel Riccio
2025-01-17 00:40:33 -05:00
parent 815c266fec
commit df654f0284

View File

@@ -271,8 +271,9 @@ export function findLevenshteinMatch(searchStr: string, content: string[], start
const similarity = getDMPSimilarity(searchStr, closestMatch)
const contextSimilarity = validateContextLines(searchStr, closestMatch, confidenceThreshold)
const confidence = Math.min(similarity, contextSimilarity)
console.log(searchStr, closestMatch, index, confidence)
return {
index,
index: confidence === 0 ? -1 : index,
confidence: index !== -1 ? confidence : 0,
strategy: "levenshtein",
}
@@ -281,92 +282,91 @@ export function findLevenshteinMatch(searchStr: string, content: string[], start
return { index: -1, confidence: 0, strategy: "levenshtein" }
}
// Helper function to identify anchor lines based on uniqueness and complexity
function identifyAnchors(searchStr: string, content: string[]): { line: string; index: number; weight: number }[] {
const searchLines = searchStr.split("\n")
const contentStr = content.join("\n")
const anchors: { line: string; index: number; weight: number }[] = []
// Helper function to identify anchor lines
function identifyAnchors(searchStr: string): { first: string | null; last: string | null } {
const searchLines = searchStr.split("\n");
let first: string | null = null;
let last: string | null = null;
for (let i = 0; i < searchLines.length; i++) {
const line = searchLines[i]
if (!line.trim()) {continue} // Skip empty lines
// Calculate line complexity (more special chars = more unique)
const specialChars = (line.match(/[^a-zA-Z0-9\s]/g) || []).length
const complexity = specialChars / line.length
// Count occurrences in content
const regex = new RegExp(line.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "g")
const matches = contentStr.match(regex)
const occurrences = matches ? matches.length : 0
// Calculate uniqueness weight
const uniquenessWeight = occurrences <= 1 ? 1 : 1 / occurrences
const weight = uniquenessWeight * (0.7 + 0.3 * complexity)
if (weight > 0.5) {
// Only consider lines with high enough weight
anchors.push({ line, index: i, weight })
// Find the first non-empty line
for (const line of searchLines) {
if (line.trim()) {
first = line;
break;
}
}
// Sort by weight descending
return anchors.sort((a, b) => b.weight - a.weight)
}
// Helper function to validate anchor positions
function validateAnchorPositions(
anchors: { line: string; index: number }[],
content: string[],
searchLines: string[]
): number {
for (const anchor of anchors) {
const anchorIndex = content.findIndex((line) => line === anchor.line)
if (anchorIndex !== -1) {
// Check if surrounding context matches
const contextBefore = searchLines.slice(Math.max(0, anchor.index - 2), anchor.index).join("\n")
const contextAfter = searchLines.slice(anchor.index + 1, anchor.index + 3).join("\n")
const contentBefore = content.slice(Math.max(0, anchorIndex - 2), anchorIndex).join("\n")
const contentAfter = content.slice(anchorIndex + 1, anchorIndex + 3).join("\n")
const beforeSimilarity = evaluateSimilarity(contextBefore, contentBefore)
const afterSimilarity = evaluateSimilarity(contextAfter, contentAfter)
if (beforeSimilarity > 0.8 && afterSimilarity > 0.8) {
return anchorIndex - anchor.index
}
// Find the last non-empty line
for (let i = searchLines.length - 1; i >= 0; i--) {
if (searchLines[i].trim()) {
last = searchLines[i];
break;
}
}
return -1
return { first, last };
}
// Anchor-based search strategy
export function findAnchorMatch(searchStr: string, content: string[], startIndex: number = 0, confidenceThreshold: number = 0.97): SearchResult {
const searchLines = searchStr.split("\n")
const anchors = identifyAnchors(searchStr, content.slice(startIndex))
const searchLines = searchStr.split("\n");
const { first, last } = identifyAnchors(searchStr);
if (anchors.length === 0) {
return { index: -1, confidence: 0, strategy: "anchor" }
if (!first || !last) {
return { index: -1, confidence: 0, strategy: "anchor" };
}
// Try to validate position using top anchors
const offset = validateAnchorPositions(anchors.slice(0, 3), content.slice(startIndex), searchLines)
let firstIndex = -1;
let lastIndex = -1;
if (offset !== -1) {
const matchPosition = startIndex + offset
const matchedContent = content.slice(matchPosition, matchPosition + searchLines.length).join("\n")
const similarity = getDMPSimilarity(searchStr, matchedContent)
const contextSimilarity = validateContextLines(searchStr, matchedContent, confidenceThreshold)
const confidence = Math.min(similarity, contextSimilarity) * (1 + anchors[0].weight * 0.1) // Boost confidence based on anchor weight
return {
index: matchPosition,
confidence: Math.min(1, confidence), // Cap at 1
strategy: "anchor",
// Check if the first anchor is unique
let firstOccurrences = 0;
for (const contentLine of content) {
if (contentLine === first) {
firstOccurrences++;
}
}
return { index: -1, confidence: 0, strategy: "anchor" }
if (firstOccurrences !== 1) {
return { index: -1, confidence: 0, strategy: "anchor" };
}
// Find the first anchor
for (let i = startIndex; i < content.length; i++) {
if (content[i] === first) {
firstIndex = i;
break;
}
}
// Find the last anchor
for (let i = content.length - 1; i >= startIndex; i--) {
if (content[i] === last) {
lastIndex = i;
break;
}
}
if (firstIndex === -1 || lastIndex === -1 || lastIndex <= firstIndex) {
return { index: -1, confidence: 0, strategy: "anchor" };
}
// Validate the context
const expectedContext = searchLines.slice(searchLines.indexOf(first) + 1, searchLines.indexOf(last)).join("\n");
const actualContext = content.slice(firstIndex + 1, lastIndex).join("\n");
const contextSimilarity = evaluateSimilarity(expectedContext, actualContext);
if (contextSimilarity < getAdaptiveThreshold(content.length, confidenceThreshold)) {
return { index: -1, confidence: 0, strategy: "anchor" };
}
const confidence = 1;
return {
index: firstIndex,
confidence: confidence,
strategy: "anchor",
};
}
// Main search function that tries all strategies