Improve strategies and confidence system

2026-03-23 01:36:03 -04:00 · 2025-01-08 12:28:31 -05:00
parent 594481643b
commit 995692c48e
4 changed files with 265 additions and 81 deletions
--- a/src/core/diff/strategies/new-unified/edit-strategies.ts
+++ b/src/core/diff/strategies/new-unified/edit-strategies.ts
@@ -1,8 +1,8 @@
 import { diff_match_patch } from 'diff-match-patch';
 import * as git from 'isomorphic-git';
 import { fs as memfs, vol } from 'memfs';
-import { Hunk } from './types';
-import { getDMPSimilarity } from './search-strategies';
+import { Change, EditResult, Hunk } from './types';
+import { getDMPSimilarity, validateEditResult } from './search-strategies';

 // Helper function to infer indentation
 function inferIndentation(line: string, contextLines: string[], previousIndent: string = ''): string {
@@ -27,12 +27,6 @@ function inferIndentation(line: string, contextLines: string[], previousIndent:
  return previousIndent;
 }

-export type EditResult = {
-  confidence: number;
-  result: string[];
-  strategy: string;
-};
-
 // Context matching edit strategy
 export function applyContextMatching(hunk: Hunk, content: string[], matchPosition: number): EditResult {
  if (matchPosition === -1) {
@@ -43,6 +37,8 @@ export function applyContextMatching(hunk: Hunk, content: string[], matchPositio
  let sourceIndex = matchPosition;
  let previousIndent = '';

+  const hunkChanges = hunk.changes.filter(c => c.type !== 'context');
+
  for (const change of hunk.changes) {
    if (change.type === 'context') {
      newResult.push(change.originalLine || (change.indent + change.content));
@@ -66,10 +62,12 @@ export function applyContextMatching(hunk: Hunk, content: string[], matchPositio
  const similarity = getDMPSimilarity(
    content.slice(matchPosition, matchPosition + hunk.changes.length).join('\n'),
    newResult.slice(matchPosition, matchPosition + hunk.changes.length).join('\n')
-  );
-  
+  )
+
+  const confidence = validateEditResult(hunk, newResult.slice(matchPosition, matchPosition + hunkChanges.length + 1).join('\n'));
+
  return { 
-    confidence: similarity,
+    confidence: similarity * confidence,
    result: newResult,
    strategy: 'context'
  };
@@ -82,41 +80,53 @@ export function applyDMP(hunk: Hunk, content: string[], matchPosition: number):
  }

  const dmp = new diff_match_patch();
-  const currentText = content.join('\n');
-  const contextLines = hunk.changes
-    .filter(c => c.type === 'context')
-    .map(c => c.content);
+  const editRegion = content.slice(matchPosition, matchPosition + hunk.changes.length);
+  const editText = editRegion.join('\n');
+  
+  // Build the target text sequentially like in applyContextMatching
+  let targetText = '';
+  let previousIndent = '';
+  
+  for (const change of hunk.changes) {
+    if (change.type === 'context') {
+      targetText += (change.originalLine || (change.indent + change.content)) + '\n';
+      previousIndent = change.indent;
+    } else if (change.type === 'add') {
+      const indent = change.indent || inferIndentation(change.content, 
+        hunk.changes.filter(c => c.type === 'context').map(c => c.originalLine || ''),
+        previousIndent
+      );
+      targetText += indent + change.content + '\n';
+      previousIndent = indent;
+    }
+    // Skip remove changes as they shouldn't appear in target
+  }

-  // Create a patch from the hunk with proper indentation
-  const patch = dmp.patch_make(
-    currentText,
-    hunk.changes.reduce((acc, change) => {
-      if (change.type === 'add') {
-        const indent = change.indent || inferIndentation(change.content, contextLines);
-        return acc + indent + change.content + '\n';
-      }
-      if (change.type === 'remove') {
-        return acc.replace(change.content + '\n', '');
-      }
-      return acc + change.content + '\n';
-    }, '')
-  );
+  // Trim the trailing newline
+  targetText = targetText.replace(/\n$/, '');

-  const [patchedText] = dmp.patch_apply(patch, currentText);
-  const similarity = getDMPSimilarity(
-    content.slice(matchPosition, matchPosition + hunk.changes.length).join('\n'),
-    patchedText
-  );
+  const patch = dmp.patch_make(editText, targetText);
+  const [patchedText] = dmp.patch_apply(patch, editText);
+  
+  // Construct result with edited portion
+  const newResult = [
+    ...content.slice(0, matchPosition),
+    ...patchedText.split('\n'),
+    ...content.slice(matchPosition + hunk.changes.length)
+  ];
+
+  const similarity = getDMPSimilarity(editText, patchedText)
+  const confidence = validateEditResult(hunk, patchedText);
  
  return { 
-    confidence: similarity,
-    result: patchedText.split('\n'),
+    confidence: similarity * confidence,
+    result: newResult,
    strategy: 'dmp'
  };
 }

-// Git edit strategy
-export async function applyGit(hunk: Hunk, content: string[], matchPosition: number): Promise<EditResult> {
+// Git edit strategy with cherry-pick approach
+async function applyGit(hunk: Hunk, content: string[], matchPosition: number): Promise<EditResult> {
  if (matchPosition === -1) {
    return { confidence: 0, result: content, strategy: 'git' };
  }
@@ -124,26 +134,55 @@ export async function applyGit(hunk: Hunk, content: string[], matchPosition: num
  vol.reset();
  
  try {
+    // Initialize git repo
    await git.init({ fs: memfs, dir: '/' });
    
-    const originalContent = content.join('\n');
-    await memfs.promises.writeFile('/file.txt', originalContent);
-    
+    // Create original content - only use the edit region
+    const editRegion = content.slice(matchPosition, matchPosition + hunk.changes.length);
+    const editText = editRegion.join('\n');
+    await memfs.promises.writeFile('/file.txt', editText);
    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
    await git.commit({
      fs: memfs,
      dir: '/',
      author: { name: 'Temp', email: 'temp@example.com' },
-      message: 'Initial commit'
+      message: 'Original'
    });
+    const originalHash = await git.resolveRef({ fs: memfs, dir: '/', ref: 'HEAD' });

-    await git.branch({ fs: memfs, dir: '/', ref: 'patch-branch' });
-    await git.checkout({ fs: memfs, dir: '/', ref: 'patch-branch' });
-
-    const lines = originalContent.split('\n');
-    const newLines = [...lines];
-    let offset = matchPosition;
+    // Create search content (content with removals)
+    const searchLines = [...editRegion];
+    let offset = 0;
+    for (const change of hunk.changes) {
+      if (change.type === 'remove') {
+        const index = searchLines.findIndex(
+          (line, i) => i >= offset && line.trimLeft() === change.content
+        );
+        if (index !== -1) {
+          searchLines.splice(index, 1);
+        }
+      }
+      if (change.type !== 'add') {
+        offset++;
+      }
+    }
+    
+    // Create search branch and commit
+    await git.branch({ fs: memfs, dir: '/', ref: 'search' });
+    await git.checkout({ fs: memfs, dir: '/', ref: 'search' });
+    await memfs.promises.writeFile('/file.txt', searchLines.join('\n'));
+    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
+    await git.commit({
+      fs: memfs,
+      dir: '/',
+      author: { name: 'Temp', email: 'temp@example.com' },
+      message: 'Search state'
+    });
+    const searchHash = await git.resolveRef({ fs: memfs, dir: '/', ref: 'HEAD' });

+    // Create replace content (with additions)
+    const replaceLines = [...searchLines];
+    offset = 0;
    const contextLines = hunk.changes
      .filter(c => c.type === 'context')
      .map(c => c.content);
@@ -151,42 +190,108 @@ export async function applyGit(hunk: Hunk, content: string[], matchPosition: num
    for (const change of hunk.changes) {
      if (change.type === 'add') {
        const indent = change.indent || inferIndentation(change.content, contextLines);
-        newLines.splice(offset, 0, indent + change.content);
+        replaceLines.splice(offset, 0, indent + change.content);
        offset++;
-      } else if (change.type === 'remove') {
-        const index = newLines.findIndex(
-          (line, i) => i >= offset && line.trimLeft() === change.content
-        );
-        if (index !== -1) {
-          newLines.splice(index, 1);
-        }
-      } else {
+      } else if (change.type !== 'remove') {
        offset++;
      }
    }

-    const modifiedContent = newLines.join('\n');
-    await memfs.promises.writeFile('/file.txt', modifiedContent);
-
+    // Create replace branch and commit
+    await git.branch({ fs: memfs, dir: '/', ref: 'replace' });
+    await git.checkout({ fs: memfs, dir: '/', ref: 'replace' });
+    await memfs.promises.writeFile('/file.txt', replaceLines.join('\n'));
    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
    await git.commit({
      fs: memfs,
      dir: '/',
      author: { name: 'Temp', email: 'temp@example.com' },
-      message: 'Apply changes'
+      message: 'Replace state'
    });
+    const replaceHash = await git.resolveRef({ fs: memfs, dir: '/', ref: 'HEAD' });

-    const similarity = getDMPSimilarity(
-      content.slice(matchPosition, matchPosition + hunk.changes.length).join('\n'),
-      newLines.slice(matchPosition, matchPosition + hunk.changes.length).join('\n')
-    );
+    // Try both strategies:
+    // 1. OSR: Cherry-pick replace onto original
+    // 2. SR-SO: Apply search->replace changes to search->original

-    return { 
-      confidence: similarity,
-      result: newLines,
-      strategy: 'git'
-    };
+    // Strategy 1: OSR
+    await git.checkout({ fs: memfs, dir: '/', ref: originalHash });
+    try {
+      await git.merge({
+        fs: memfs,
+        dir: '/',
+        ours: originalHash,
+        theirs: replaceHash,
+        author: { name: 'Temp', email: 'temp@example.com' },
+        message: 'Cherry-pick OSR'
+      });
+      const osrResult = (await memfs.promises.readFile('/file.txt')).toString();
+      const osrSimilarity = getDMPSimilarity(editText, osrResult)
+
+      const confidence = validateEditResult(hunk, osrResult);
+      
+      if (osrSimilarity * confidence > 0.9) {
+        // Construct result with edited portion
+        const newResult = [
+          ...content.slice(0, matchPosition),
+          ...osrResult.split('\n'),
+          ...content.slice(matchPosition + hunk.changes.length)
+        ];
+        return {
+          confidence: osrSimilarity,
+          result: newResult,
+          strategy: 'git-osr'
+        };
+      }
+    } catch (error) {
+      console.log('OSR strategy failed:', error);
+    }
+
+    // Strategy 2: SR-SO
+    await git.checkout({ fs: memfs, dir: '/', ref: searchHash });
+    try {
+      // First apply original changes
+      await git.merge({
+        fs: memfs,
+        dir: '/',
+        ours: searchHash,
+        theirs: originalHash,
+        author: { name: 'Temp', email: 'temp@example.com' },
+        message: 'Apply original changes'
+      });
+
+      // Then apply replace changes
+      await git.merge({
+        fs: memfs,
+        dir: '/',
+        ours: 'HEAD',
+        theirs: replaceHash,
+        author: { name: 'Temp', email: 'temp@example.com' },
+        message: 'Apply replace changes'
+      });
+
+      const srsoResult = (await memfs.promises.readFile('/file.txt')).toString();
+      const srsoSimilarity = getDMPSimilarity(editText, srsoResult)
+
+      const confidence = validateEditResult(hunk, srsoResult);
+
+      // Construct result with edited portion
+      const newResult = [
+        ...content.slice(0, matchPosition),
+        ...srsoResult.split('\n'),
+        ...content.slice(matchPosition + hunk.changes.length)
+      ];
+      return {
+        confidence: srsoSimilarity * confidence,
+        result: newResult,
+        strategy: 'git-srso'
+      };
+    } catch (error) {
+      console.log('SR-SO strategy failed:', error);
+      return { confidence: 0, result: content, strategy: 'git' };
+    }
  } catch (error) {
+    console.log('Git strategy failed:', error);
    return { confidence: 0, result: content, strategy: 'git' };
  } finally {
    vol.reset();
@@ -195,9 +300,11 @@ export async function applyGit(hunk: Hunk, content: string[], matchPosition: num

 // Main edit function that tries strategies sequentially
 export async function applyEdit(hunk: Hunk, content: string[], matchPosition: number, confidence: number, debug: boolean = false): Promise<EditResult> {
+
  // Don't attempt any edits if confidence is too low and not in debug mode
  const MIN_CONFIDENCE = 0.9;
-  if (confidence < MIN_CONFIDENCE && !debug) {
+  if (confidence < MIN_CONFIDENCE) {
+    console.log(`Search confidence (${confidence}) below minimum threshold (${MIN_CONFIDENCE}), skipping edit`);
    return { confidence: 0, result: content, strategy: 'none' };
  }

@@ -211,15 +318,18 @@ export async function applyEdit(hunk: Hunk, content: string[], matchPosition: nu
  if (debug) {
    // In debug mode, try all strategies and return the first success
    const results = await Promise.all(strategies.map(async strategy => {
+      console.log(`Attempting edit with ${strategy.name} strategy...`);
      const result = await strategy.apply();
+      console.log(`Strategy ${strategy.name} succeeded with confidence ${result.confidence}`);
      return result;
    }));
    
    const successfulResults = results.filter(result => result.confidence > MIN_CONFIDENCE);
    if (successfulResults.length > 0) {
-      return successfulResults.reduce((best, current) => 
+      const bestResult = successfulResults.reduce((best, current) => 
        current.confidence > best.confidence ? current : best
      );
+      return bestResult;
    }
  } else {
    // Normal mode - try strategies sequentially until one succeeds
--- a/src/core/diff/strategies/new-unified/index.ts
+++ b/src/core/diff/strategies/new-unified/index.ts
@@ -162,7 +162,8 @@ Your diff here
 	): Promise<DiffResult> {
 		const MIN_CONFIDENCE = 0.9
 		const parsedDiff = this.parseUnifiedDiff(diffContent)
-		let result = originalContent.split("\n")
+    const originalLines = originalContent.split("\n")
+		let result = [...originalLines]

 		for (const hunk of parsedDiff.hunks) {
 			const contextStr = prepareSearchString(hunk.changes)
--- a/src/core/diff/strategies/new-unified/search-strategies.ts
+++ b/src/core/diff/strategies/new-unified/search-strategies.ts
@@ -1,7 +1,7 @@
 import { compareTwoStrings } from 'string-similarity';
 import { closest } from 'fastest-levenshtein';
 import { diff_match_patch } from 'diff-match-patch';
-import { Change } from './types';
+import { Change, Hunk } from './types';

 export type SearchResult = {
  index: number;
@@ -32,10 +32,70 @@ export function getDMPSimilarity(original: string, modified: string): number {
  dmp.diff_cleanupSemantic(diffs);
  const patches = dmp.patch_make(original, diffs);
  const [expectedText] = dmp.patch_apply(patches, original);
+
  const similarity = evaluateSimilarity(expectedText, modified);
  return similarity;
 }

+// Helper function to validate edit results using hunk information
+// Returns a confidence reduction value between 0 and 1
+// Example: If similarity is 0.8 and MIN_CONFIDENCE is 0.95,
+// returns 0.1 (0.5 * (1 - 0.8)) to reduce confidence proportionally but with less impact.
+// If similarity >= MIN_CONFIDENCE, returns 0 (no reduction).
+export function validateEditResult(hunk: Hunk, result: string): number {
+  const hunkDeepCopy: Hunk = JSON.parse(JSON.stringify(hunk));
+  
+  // Create skeleton of original content (context + removed lines)
+  const originalSkeleton = hunkDeepCopy.changes
+    .filter(change => change.type === 'context' || change.type === 'remove')
+    .map(change => change.content)
+    .join('\n');
+
+  // Create skeleton of expected result (context + added lines)
+  const expectedSkeleton = hunkDeepCopy.changes
+    .filter(change => change.type === 'context' || change.type === 'add')
+    .map(change => change.content)
+    .join('\n');
+
+  // Compare with original content
+  const originalSimilarity = evaluateSimilarity(originalSkeleton, result);
+  
+  // If result is too similar to original, it means changes weren't applied
+  if (originalSimilarity > 0.9) {
+    console.log('Result too similar to original content:', originalSimilarity);
+    return 0.5; // Significant confidence reduction
+  }
+
+  // Compare with expected result
+  const expectedSimilarity = evaluateSimilarity(expectedSkeleton, result);
+  console.log('Original similarity:', originalSimilarity);
+  console.log('Expected similarity:', expectedSimilarity);
+
+  console.log('originalSkeleton:', originalSkeleton);
+  console.log('expectedSkeleton:', expectedSkeleton);
+  console.log('result:', result);
+  
+  // Scale between 0.98 and 1.0 (4% impact) based on expected similarity
+  const multiplier = expectedSimilarity < MIN_CONFIDENCE 
+    ? 0.96 + (0.04 * expectedSimilarity) 
+    : 1;
+
+  return multiplier;
+}
+
+// Helper function to validate context lines against original content
+function validateContextLines(searchStr: string, content: string): number {
+  // Extract just the context lines from the search string
+  const contextLines = searchStr.split('\n')
+    .filter(line => !line.startsWith('-'));  // Exclude removed lines
+  
+  // Compare context lines with content
+  const similarity = evaluateSimilarity(contextLines.join('\n'), content);
+  
+  // Context lines must match very closely, or confidence drops significantly
+  return similarity < MIN_CONFIDENCE ? similarity * 0.3 : similarity;
+}
+
 // Exact match strategy
 export function findExactMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
  const contentStr = content.slice(startIndex).join('\n');
@@ -48,10 +108,13 @@ export function findExactMatch(searchStr: string, content: string[], startIndex:
      startIndex + contentStr.slice(0, exactMatch).split('\n').length - 1 + searchLines.length
    ).join('\n');
    
-    const dmpValid = getDMPSimilarity(searchStr, matchedContent) >= MIN_CONFIDENCE;
+    const similarity = getDMPSimilarity(searchStr, matchedContent);
+    const contextSimilarity = validateContextLines(searchStr, matchedContent);
+    const confidence = Math.min(similarity, contextSimilarity);
+    
    return {
      index: startIndex + contentStr.slice(0, exactMatch).split('\n').length - 1,
-      confidence: dmpValid ? 1.0 : 0.9,
+      confidence,
      strategy: 'exact'
    };
  }
@@ -70,8 +133,9 @@ export function findSimilarityMatch(searchStr: string, content: string[], startI
    const windowStr = content.slice(i, i + searchLines.length).join('\n');
    const score = compareTwoStrings(searchStr, windowStr);
    if (score > bestScore && score >= minScore) {
-      const dmpValid = getDMPSimilarity(searchStr, windowStr) >= MIN_CONFIDENCE;
-      const adjustedScore = dmpValid ? score : score * 0.9;
+      const similarity = getDMPSimilarity(searchStr, windowStr);
+      const contextSimilarity = validateContextLines(searchStr, windowStr);
+      const adjustedScore = Math.min(similarity, contextSimilarity) * score;
      
      if (adjustedScore > bestScore) {
        bestScore = adjustedScore;
@@ -99,10 +163,13 @@ export function findLevenshteinMatch(searchStr: string, content: string[], start
  if (candidates.length > 0) {
    const closestMatch = closest(searchStr, candidates);
    const index = startIndex + candidates.indexOf(closestMatch);
-    const dmpValid = getDMPSimilarity(searchStr, closestMatch) >= MIN_CONFIDENCE;
+    const similarity = getDMPSimilarity(searchStr, closestMatch);
+    const contextSimilarity = validateContextLines(searchStr, closestMatch);
+    const confidence = Math.min(similarity, contextSimilarity) * 0.7;  // Still apply Levenshtein penalty
+    
    return { 
      index, 
-      confidence: dmpValid ? 0.7 : 0.6,
+      confidence,
      strategy: 'levenshtein'
    };
  }
--- a/src/core/diff/strategies/new-unified/types.ts
+++ b/src/core/diff/strategies/new-unified/types.ts
@@ -11,4 +11,10 @@ export type Hunk = {

 export type Diff = {
  hunks: Hunk[];
-}; 
+}; 
+
+export type EditResult = {
+  confidence: number;
+  result: string[];
+  strategy: string;
+};