refactor: enhance search strategies with adaptive thresholds and overlapping windows

- Introduced adaptive confidence thresholds based on file size to improve search accuracy. - Implemented overlapping window functionality in search strategies to capture matches more effectively. - Added helper functions for evaluating content uniqueness and creating overlapping windows. - Enhanced existing search functions (exact, similarity, and Levenshtein) to utilize new strategies for better match validation. - Improved logging for search results to facilitate debugging and analysis of search performance.
2026-02-05 03:55:23 -05:00 · 2025-01-14 12:00:29 -05:00
parent f007f64344
commit 258024aa5a
1 changed files with 353 additions and 157 deletions
--- a/src/core/diff/strategies/new-unified/search-strategies.ts
+++ b/src/core/diff/strategies/new-unified/search-strategies.ts
@@ -1,40 +1,71 @@
-import { compareTwoStrings } from 'string-similarity';
+import { compareTwoStrings } from "string-similarity"
-import { closest } from 'fastest-levenshtein';
+import { closest } from "fastest-levenshtein"
-import { diff_match_patch } from 'diff-match-patch';
+import { diff_match_patch } from "diff-match-patch"
-import { Change, Hunk } from './types';
+import { Change, Hunk } from "./types"
 export type SearchResult = {
-  index: number;
+	index: number
-  confidence: number;
+	confidence: number
-  strategy: string;
+	strategy: string
-};
+}
 //TODO: this should be configurable
-const MIN_CONFIDENCE = 0.97;
+const MIN_CONFIDENCE = 0.97
 const MIN_CONFIDENCE_LARGE_FILE = 0.9
 const LARGE_FILE_THRESHOLD = 1000 // lines
 const UNIQUE_CONTENT_BOOST = 0.05
 const DEFAULT_OVERLAP_SIZE = 3 // lines of overlap between windows
 const MAX_WINDOW_SIZE = 500 // maximum lines in a window
 // Helper function to calculate adaptive confidence threshold based on file size
 function getAdaptiveThreshold(contentLength: number): number {
 	if (contentLength <= LARGE_FILE_THRESHOLD) {
 		return MIN_CONFIDENCE
 	}
 	return MIN_CONFIDENCE_LARGE_FILE
 }
 // Helper function to evaluate content uniqueness
 function evaluateContentUniqueness(searchStr: string, content: string[]): number {
 	const searchLines = searchStr.split("\n")
 	const uniqueLines = new Set(searchLines)
 	const contentStr = content.join("\n")
 	// Calculate how many search lines are relatively unique in the content
 	let uniqueCount = 0
 	for (const line of uniqueLines) {
 		const regex = new RegExp(line.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "g")
 		const matches = contentStr.match(regex)
 		if (matches && matches.length <= 2) {
 			// Line appears at most twice
 			uniqueCount++
 		}
 	}
 	return uniqueCount / uniqueLines.size
 }
 // Helper function to prepare search string from context
 export function prepareSearchString(changes: Change[]): string {
-  const lines = changes
+	const lines = changes.filter((c) => c.type === "context" || c.type === "remove").map((c) => c.originalLine)
-    .filter((c) => c.type === 'context' || c.type === 'remove')
+	return lines.join("\n")
    .map((c) => c.content);
  return lines.join('\n');
 }
 // Helper function to evaluate similarity between two texts
 export function evaluateSimilarity(original: string, modified: string): number {
-  return compareTwoStrings(original, modified);
+	return compareTwoStrings(original, modified)
 }
 // Helper function to validate using diff-match-patch
 export function getDMPSimilarity(original: string, modified: string): number {
-  const dmp = new diff_match_patch();
+	const dmp = new diff_match_patch()
-  const diffs = dmp.diff_main(original, modified);
+	const diffs = dmp.diff_main(original, modified)
-  dmp.diff_cleanupSemantic(diffs);
+	dmp.diff_cleanupSemantic(diffs)
-  const patches = dmp.patch_make(original, diffs);
+	const patches = dmp.patch_make(original, diffs)
-  const [expectedText] = dmp.patch_apply(patches, original);
+	const [expectedText] = dmp.patch_apply(patches, original)
-  const similarity = evaluateSimilarity(expectedText, modified);
+	const similarity = evaluateSimilarity(expectedText, modified)
-  return similarity;
+	return similarity
 }
 // Helper function to validate edit results using hunk information
@@ -43,176 +74,341 @@ export function getDMPSimilarity(original: string, modified: string): number {
 // returns 0.1 (0.5 * (1 - 0.8)) to reduce confidence proportionally but with less impact.
 // If similarity >= MIN_CONFIDENCE, returns 0 (no reduction).
 export function validateEditResult(hunk: Hunk, result: string, strategy: string): number {
-  const hunkDeepCopy: Hunk = JSON.parse(JSON.stringify(hunk));
+	const hunkDeepCopy: Hunk = JSON.parse(JSON.stringify(hunk))
-  // Create skeleton of original content (context + removed lines)
+	// Create skeleton of original content (context + removed lines)
-  const originalSkeleton = hunkDeepCopy.changes
+	const originalSkeleton = hunkDeepCopy.changes
-    .filter((change) => change.type === 'context' || change.type === 'remove')
+		.filter((change) => change.type === "context" || change.type === "remove")
-    .map((change) => change.content)
+		.map((change) => change.content)
-    .join('\n');
+		.join("\n")
-  // Create skeleton of expected result (context + added lines)
+	// Create skeleton of expected result (context + added lines)
-  const expectedSkeleton = hunkDeepCopy.changes
+	const expectedSkeleton = hunkDeepCopy.changes
-    .filter((change) => change.type === 'context' || change.type === 'add')
+		.filter((change) => change.type === "context" || change.type === "add")
-    .map((change) => change.content)
+		.map((change) => change.content)
-    .join('\n');
+		.join("\n")
-  // Compare with original content
+	// Compare with original content
-  const originalSimilarity = evaluateSimilarity(originalSkeleton, result);
+	const originalSimilarity = evaluateSimilarity(originalSkeleton, result)
-  console.log('originalSimilarity ', strategy, originalSimilarity);
+	console.log("originalSimilarity ", strategy, originalSimilarity)
  // If original similarity is 1, it means changes weren't applied
  if (originalSimilarity > 0.97) {
    if (originalSimilarity === 1) {
      return 0.5; // Significant confidence reduction
    } else {
      return 0.8;
    }
  }
-  // Compare with expected result
+	// Compare with expected result
-  const expectedSimilarity = evaluateSimilarity(expectedSkeleton, result);
+	const expectedSimilarity = evaluateSimilarity(expectedSkeleton, result)
-  console.log('expectedSimilarity', strategy, expectedSimilarity);
+	console.log("expectedSimilarity", strategy, expectedSimilarity)
 	console.log("result", result)
-  // Scale between 0.98 and 1.0 (4% impact) based on expected similarity
+	// If original similarity is 1 and expected similarity is not 1, it means changes weren't applied
-  const multiplier =
+	if (originalSimilarity > 0.97 && expectedSimilarity !== 1) {
-    expectedSimilarity < MIN_CONFIDENCE ? 0.96 + 0.04 * expectedSimilarity : 1;
+		if (originalSimilarity === 1) {
 			// If original similarity is 1, it means changes weren't applied
 			if (originalSimilarity > 0.97) {
 				if (originalSimilarity === 1) {
 					return 0.5 // Significant confidence reduction
 				} else {
 					return 0.8
 				}
 			}
 		} else {
 			return 0.8
 		}
 	}
-  return multiplier;
+	// Scale between 0.98 and 1.0 (4% impact) based on expected similarity
 	const multiplier = expectedSimilarity < MIN_CONFIDENCE ? 0.96 + 0.04 * expectedSimilarity : 1
 	return multiplier
 }
 // Helper function to validate context lines against original content
 function validateContextLines(searchStr: string, content: string): number {
-  // Extract just the context lines from the search string
+	// Extract just the context lines from the search string
-  const contextLines = searchStr
+	const contextLines = searchStr.split("\n").filter((line) => !line.startsWith("-")) // Exclude removed lines
    .split('\n')
    .filter((line) => !line.startsWith('-')); // Exclude removed lines
-  // Compare context lines with content
+	// Compare context lines with content
-  const similarity = evaluateSimilarity(contextLines.join('\n'), content);
+	const similarity = evaluateSimilarity(contextLines.join("\n"), content)
-  // Context lines must match very closely, or confidence drops significantly
+	// Get adaptive threshold based on content size
-  return similarity < MIN_CONFIDENCE ? similarity * 0.3 : similarity;
+	const threshold = getAdaptiveThreshold(content.split("\n").length)
 	// Calculate uniqueness boost
 	const uniquenessScore = evaluateContentUniqueness(searchStr, content.split("\n"))
 	const uniquenessBoost = uniquenessScore * UNIQUE_CONTENT_BOOST
 	// Adjust confidence based on threshold and uniqueness
 	return similarity < threshold ? similarity * 0.3 + uniquenessBoost : similarity + uniquenessBoost
 }
-// Exact match strategy
+// Helper function to create overlapping windows
-export function findExactMatch(
+function createOverlappingWindows(
-  searchStr: string,
+	content: string[],
-  content: string[],
+	searchSize: number,
-  startIndex: number = 0
+	overlapSize: number = DEFAULT_OVERLAP_SIZE
-): SearchResult {
+): { window: string[]; startIndex: number }[] {
-  const contentStr = content.slice(startIndex).join('\n');
+	const windows: { window: string[]; startIndex: number }[] = []
  const searchLines = searchStr.split('\n');
-  const exactMatch = contentStr.indexOf(searchStr);
+	// Ensure minimum window size is at least searchSize
-  if (exactMatch !== -1) {
+	const effectiveWindowSize = Math.max(searchSize, Math.min(searchSize * 2, MAX_WINDOW_SIZE))
    const matchedContent = content
      .slice(
        startIndex + contentStr.slice(0, exactMatch).split('\n').length - 1,
        startIndex +
          contentStr.slice(0, exactMatch).split('\n').length -
          1 +
          searchLines.length
      )
      .join('\n');
-    const similarity = getDMPSimilarity(searchStr, matchedContent);
+	// Ensure overlap size doesn't exceed window size
-    const contextSimilarity = validateContextLines(searchStr, matchedContent);
+	const effectiveOverlapSize = Math.min(overlapSize, effectiveWindowSize - 1)
    const confidence = Math.min(similarity, contextSimilarity);
-    return {
+	// Calculate step size, ensure it's at least 1
-      index:
+	const stepSize = Math.max(1, effectiveWindowSize - effectiveOverlapSize)
        startIndex + contentStr.slice(0, exactMatch).split('\n').length - 1,
      confidence,
      strategy: 'exact',
    };
  }
-  return { index: -1, confidence: 0, strategy: 'exact' };
+	for (let i = 0; i < content.length; i += stepSize) {
 		const windowContent = content.slice(i, i + effectiveWindowSize)
 		if (windowContent.length >= searchSize) {
 			windows.push({ window: windowContent, startIndex: i })
 		}
 	}
 	return windows
 }
 // Helper function to combine overlapping matches
 function combineOverlappingMatches(
 	matches: (SearchResult & { windowIndex: number })[],
 	overlapSize: number = DEFAULT_OVERLAP_SIZE
 ): SearchResult[] {
 	if (matches.length === 0) {
 		return []
 	}
 	// Sort matches by confidence
 	matches.sort((a, b) => b.confidence - a.confidence)
 	const combinedMatches: SearchResult[] = []
 	const usedIndices = new Set<number>()
 	for (const match of matches) {
 		if (usedIndices.has(match.windowIndex)) {continue}
 		// Find overlapping matches
 		const overlapping = matches.filter(
 			(m) =>
 				Math.abs(m.windowIndex - match.windowIndex) === 1 &&
 				Math.abs(m.index - match.index) <= overlapSize &&
 				!usedIndices.has(m.windowIndex)
 		)
 		if (overlapping.length > 0) {
 			// Boost confidence if we find same match in overlapping windows
 			const avgConfidence =
 				(match.confidence + overlapping.reduce((sum, m) => sum + m.confidence, 0)) / (overlapping.length + 1)
 			const boost = Math.min(0.05 * overlapping.length, 0.1) // Max 10% boost
 			combinedMatches.push({
 				index: match.index,
 				confidence: Math.min(1, avgConfidence + boost),
 				strategy: `${match.strategy}-overlapping`,
 			})
 			usedIndices.add(match.windowIndex)
 			overlapping.forEach((m) => usedIndices.add(m.windowIndex))
 		} else {
 			combinedMatches.push({
 				index: match.index,
 				confidence: match.confidence,
 				strategy: match.strategy,
 			})
 			usedIndices.add(match.windowIndex)
 		}
 	}
 	return combinedMatches
 }
 // Modified search functions to use sliding windows
 export function findExactMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
 	const searchLines = searchStr.split("\n")
 	const windows = createOverlappingWindows(content.slice(startIndex), searchLines.length)
 	const matches: (SearchResult & { windowIndex: number })[] = []
 	windows.forEach((windowData, windowIndex) => {
 		const windowStr = windowData.window.join("\n")
 		const exactMatch = windowStr.indexOf(searchStr)
 		if (exactMatch !== -1) {
 			const matchedContent = windowData.window
 				.slice(
 					windowStr.slice(0, exactMatch).split("\n").length - 1,
 					windowStr.slice(0, exactMatch).split("\n").length - 1 + searchLines.length
 				)
 				.join("\n")
 			const similarity = getDMPSimilarity(searchStr, matchedContent)
 			const contextSimilarity = validateContextLines(searchStr, matchedContent)
 			const confidence = Math.min(similarity, contextSimilarity)
 			matches.push({
 				index: startIndex + windowData.startIndex + windowStr.slice(0, exactMatch).split("\n").length - 1,
 				confidence,
 				strategy: "exact",
 				windowIndex,
 			})
 		}
 	})
 	const combinedMatches = combineOverlappingMatches(matches)
 	return combinedMatches.length > 0 ? combinedMatches[0] : { index: -1, confidence: 0, strategy: "exact" }
 }
 // String similarity strategy
-export function findSimilarityMatch(
+export function findSimilarityMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
-  searchStr: string,
+	const searchLines = searchStr.split("\n")
-  content: string[],
+	let bestScore = 0
-  startIndex: number = 0
+	let bestIndex = -1
-): SearchResult {
+	const minScore = 0.8
  const searchLines = searchStr.split('\n');
  let bestScore = 0;
  let bestIndex = -1;
  const minScore = 0.8;
-  for (let i = startIndex; i < content.length - searchLines.length + 1; i++) {
+	for (let i = startIndex; i < content.length - searchLines.length + 1; i++) {
-    const windowStr = content.slice(i, i + searchLines.length).join('\n');
+		const windowStr = content.slice(i, i + searchLines.length).join("\n")
-    const score = compareTwoStrings(searchStr, windowStr);
+		const score = compareTwoStrings(searchStr, windowStr)
-    if (score > bestScore && score >= minScore) {
+		if (score > bestScore && score >= minScore) {
-      const similarity = getDMPSimilarity(searchStr, windowStr);
+			const similarity = getDMPSimilarity(searchStr, windowStr)
-      const contextSimilarity = validateContextLines(searchStr, windowStr);
+			const contextSimilarity = validateContextLines(searchStr, windowStr)
-      const adjustedScore = Math.min(similarity, contextSimilarity) * score;
+			const adjustedScore = Math.min(similarity, contextSimilarity) * score
-      if (adjustedScore > bestScore) {
+			if (adjustedScore > bestScore) {
-        bestScore = adjustedScore;
+				bestScore = adjustedScore
-        bestIndex = i;
+				bestIndex = i
-      }
+			}
-    }
+		}
-  }
+	}
-  return {
+	return {
-    index: bestIndex,
+		index: bestIndex,
-    confidence: bestIndex !== -1 ? bestScore : 0,
+		confidence: bestIndex !== -1 ? bestScore : 0,
-    strategy: 'similarity',
+		strategy: "similarity",
-  };
+	}
 }
 // Levenshtein strategy
-export function findLevenshteinMatch(
+export function findLevenshteinMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
-  searchStr: string,
+	const searchLines = searchStr.split("\n")
-  content: string[],
+	const candidates = []
  startIndex: number = 0
 ): SearchResult {
  const searchLines = searchStr.split('\n');
  const candidates = [];
-  for (let i = startIndex; i < content.length - searchLines.length + 1; i++) {
+	for (let i = startIndex; i < content.length - searchLines.length + 1; i++) {
-    candidates.push(content.slice(i, i + searchLines.length).join('\n'));
+		candidates.push(content.slice(i, i + searchLines.length).join("\n"))
-  }
+	}
-  if (candidates.length > 0) {
+	if (candidates.length > 0) {
-    const closestMatch = closest(searchStr, candidates);
+		const closestMatch = closest(searchStr, candidates)
-    const index = startIndex + candidates.indexOf(closestMatch);
+		const index = startIndex + candidates.indexOf(closestMatch)
-    const similarity = getDMPSimilarity(searchStr, closestMatch);
+		const similarity = getDMPSimilarity(searchStr, closestMatch)
-    const contextSimilarity = validateContextLines(searchStr, closestMatch);
+		const contextSimilarity = validateContextLines(searchStr, closestMatch)
-    const confidence = Math.min(similarity, contextSimilarity)
+		const confidence = Math.min(similarity, contextSimilarity)
-    return {
+		return {
-      index,
+			index,
-      confidence: index !== -1 ? confidence : 0,
+			confidence: index !== -1 ? confidence : 0,
-      strategy: 'levenshtein',
+			strategy: "levenshtein",
-    };
+		}
-  }
+	}
-  return { index: -1, confidence: 0, strategy: 'levenshtein' };
+	return { index: -1, confidence: 0, strategy: "levenshtein" }
 }
 // Helper function to identify anchor lines based on uniqueness and complexity
 function identifyAnchors(searchStr: string, content: string[]): { line: string; index: number; weight: number }[] {
 	const searchLines = searchStr.split("\n")
 	const contentStr = content.join("\n")
 	const anchors: { line: string; index: number; weight: number }[] = []
 	for (let i = 0; i < searchLines.length; i++) {
 		const line = searchLines[i]
 		if (!line.trim()) {continue} // Skip empty lines
 		// Calculate line complexity (more special chars = more unique)
 		const specialChars = (line.match(/[^a-zA-Z0-9\s]/g) || []).length
 		const complexity = specialChars / line.length
 		// Count occurrences in content
 		const regex = new RegExp(line.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "g")
 		const matches = contentStr.match(regex)
 		const occurrences = matches ? matches.length : 0
 		// Calculate uniqueness weight
 		const uniquenessWeight = occurrences <= 1 ? 1 : 1 / occurrences
 		const weight = uniquenessWeight * (0.7 + 0.3 * complexity)
 		if (weight > 0.5) {
 			// Only consider lines with high enough weight
 			anchors.push({ line, index: i, weight })
 		}
 	}
 	// Sort by weight descending
 	return anchors.sort((a, b) => b.weight - a.weight)
 }
 // Helper function to validate anchor positions
 function validateAnchorPositions(
 	anchors: { line: string; index: number }[],
 	content: string[],
 	searchLines: string[]
 ): number {
 	for (const anchor of anchors) {
 		const anchorIndex = content.findIndex((line) => line === anchor.line)
 		if (anchorIndex !== -1) {
 			// Check if surrounding context matches
 			const contextBefore = searchLines.slice(Math.max(0, anchor.index - 2), anchor.index).join("\n")
 			const contextAfter = searchLines.slice(anchor.index + 1, anchor.index + 3).join("\n")
 			const contentBefore = content.slice(Math.max(0, anchorIndex - 2), anchorIndex).join("\n")
 			const contentAfter = content.slice(anchorIndex + 1, anchorIndex + 3).join("\n")
 			const beforeSimilarity = evaluateSimilarity(contextBefore, contentBefore)
 			const afterSimilarity = evaluateSimilarity(contextAfter, contentAfter)
 			if (beforeSimilarity > 0.8 && afterSimilarity > 0.8) {
 				return anchorIndex - anchor.index
 			}
 		}
 	}
 	return -1
 }
 // Anchor-based search strategy
 export function findAnchorMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
 	const searchLines = searchStr.split("\n")
 	const anchors = identifyAnchors(searchStr, content.slice(startIndex))
 	if (anchors.length === 0) {
 		return { index: -1, confidence: 0, strategy: "anchor" }
 	}
 	// Try to validate position using top anchors
 	const offset = validateAnchorPositions(anchors.slice(0, 3), content.slice(startIndex), searchLines)
 	if (offset !== -1) {
 		const matchPosition = startIndex + offset
 		const matchedContent = content.slice(matchPosition, matchPosition + searchLines.length).join("\n")
 		const similarity = getDMPSimilarity(searchStr, matchedContent)
 		const contextSimilarity = validateContextLines(searchStr, matchedContent)
 		const confidence = Math.min(similarity, contextSimilarity) * (1 + anchors[0].weight * 0.1) // Boost confidence based on anchor weight
 		return {
 			index: matchPosition,
 			confidence: Math.min(1, confidence), // Cap at 1
 			strategy: "anchor",
 		}
 	}
 	return { index: -1, confidence: 0, strategy: "anchor" }
 }
 // Main search function that tries all strategies
-export function findBestMatch(
+export function findBestMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
-  searchStr: string,
+	const strategies = [findExactMatch, findAnchorMatch, findSimilarityMatch, findLevenshteinMatch]
  content: string[],
  startIndex: number = 0
 ): SearchResult {
  const strategies = [
    findExactMatch,
    findSimilarityMatch,
    findLevenshteinMatch,
  ];
-  let bestResult: SearchResult = { index: -1, confidence: 0, strategy: 'none' };
+	let bestResult: SearchResult = { index: -1, confidence: 0, strategy: "none" }
-  for (const strategy of strategies) {
+	for (const strategy of strategies) {
-    const result = strategy(searchStr, content, startIndex);
+		const result = strategy(searchStr, content, startIndex)
-    if (result.confidence > bestResult.confidence) {
+		console.log("Search result:", result)
-      bestResult = result;
+		if (result.confidence > bestResult.confidence) {
-    }
+			bestResult = result
-  }
+		}
 	}
-  return bestResult;
+	return bestResult
 }