Add New Unified Diff Strategy Implementation

- Introduced a new unified diff strategy with support for context matching, DMP, and Git-based edits.
- Implemented helper functions for parsing unified diffs and evaluating similarity.
- Added types for changes, hunks, and diffs to enhance type safety.
- Created a main edit function that applies strategies sequentially based on confidence levels.
- Included detailed descriptions and usage examples for the new strategy.
2025-12-20 20:31:37 -05:00 · 2025-01-07 19:01:12 -05:00
parent 2857dd4996
commit 594481643b
4 changed files with 562 additions and 0 deletions
--- a/src/core/diff/strategies/new-unified/edit-strategies.ts
+++ b/src/core/diff/strategies/new-unified/edit-strategies.ts
@@ -0,0 +1,236 @@
+import { diff_match_patch } from 'diff-match-patch';
+import * as git from 'isomorphic-git';
+import { fs as memfs, vol } from 'memfs';
+import { Hunk } from './types';
+import { getDMPSimilarity } from './search-strategies';
+
+/**
+ * Picks the leading whitespace to use for a line that is being inserted.
+ *
+ * Resolution order:
+ *   1. the line's own leading whitespace, when it has any;
+ *   2. the leading whitespace of the first context line that is indented;
+ *   3. `previousIndent`.
+ *
+ * @param line - The line about to be inserted.
+ * @param contextLines - Surrounding context lines, searched in order.
+ * @param previousIndent - Fallback used when neither source yields indentation.
+ * @returns The whitespace prefix to put in front of `line`.
+ */
+function inferIndentation(line: string, contextLines: string[], previousIndent: string = ''): string {
+  const ownIndent = line.match(/^\s+/);
+  if (ownIndent) {
+    return ownIndent[0];
+  }
+
+  // Past this point `line` has no leading whitespace, i.e. depth 0. The former
+  // "line is deeper than the context line" branch compared that 0 against a
+  // context depth of at least 1 and could never run, so it has been removed.
+  for (const contextLine of contextLines) {
+    const contextIndent = contextLine.match(/^\s+/);
+    if (contextIndent) {
+      return contextIndent[0];
+    }
+  }
+
+  return previousIndent;
+}
+
+/**
+ * Outcome of a single edit strategy.
+ */
+export type EditResult = {
+  /** Score the strategy assigns to its own output; `applyEdit` accepts a result only above 0.9. */
+  confidence: number;
+  /** File content as an array of lines (the untouched input when the strategy bails out). */
+  result: string[];
+  /** Name of the strategy that produced this result: 'context', 'dmp', 'git' or 'none'. */
+  strategy: string;
+};
+
+/**
+ * Context matching edit strategy: rebuilds the file line by line, walking the
+ * hunk from `matchPosition`.
+ *
+ * - Context lines keep the text that is already in the file, so a fuzzy match
+ *   never rewrites lines the hunk did not ask to change.
+ * - Added lines are inserted exactly as written in the diff; indentation is
+ *   inferred only for changes that carry no `originalLine`.
+ * - Removed lines are skipped.
+ *
+ * @param hunk - Parsed hunk to apply.
+ * @param content - Current file content, one entry per line. Not mutated.
+ * @param matchPosition - Index in `content` where the hunk starts; a negative value means "no match".
+ * @returns The edited lines with a confidence score, or `content` untouched
+ *          with confidence 0 when there is no match.
+ */
+export function applyContextMatching(hunk: Hunk, content: string[], matchPosition: number): EditResult {
+  if (matchPosition < 0) {
+    return { confidence: 0, result: content, strategy: 'context' };
+  }
+
+  // Loop-invariant: only consulted when an added line has to infer its indentation.
+  const contextLines = hunk.changes
+    .filter(c => c.type === 'context')
+    .map(c => c.originalLine || '');
+
+  const newResult = content.slice(0, matchPosition);
+  let sourceIndex = matchPosition;
+  let previousIndent = '';
+
+  for (const change of hunk.changes) {
+    if (change.type === 'context') {
+      // Prefer the file's own line; fall back to the hunk's text only when the
+      // hunk runs past the end of the file.
+      const hunkLine = change.originalLine || (change.indent + change.content);
+      newResult.push(content[sourceIndex] ?? hunkLine);
+      previousIndent = change.indent;
+      sourceIndex++;
+    } else if (change.type === 'add') {
+      if (change.originalLine !== undefined) {
+        // The diff states the exact text of the new line, including a
+        // deliberately empty indentation, so insert it verbatim.
+        newResult.push(change.originalLine);
+        previousIndent = change.indent;
+      } else {
+        const indent = change.indent || inferIndentation(change.content, contextLines, previousIndent);
+        newResult.push(indent + change.content);
+        previousIndent = indent;
+      }
+    } else if (change.type === 'remove') {
+      sourceIndex++;
+    }
+  }
+
+  newResult.push(...content.slice(sourceIndex));
+
+  // Score the edit on the window the hunk covers.
+  const windowEnd = matchPosition + hunk.changes.length;
+  const similarity = getDMPSimilarity(
+    content.slice(matchPosition, windowEnd).join('\n'),
+    newResult.slice(matchPosition, windowEnd).join('\n')
+  );
+
+  return {
+    confidence: similarity,
+    result: newResult,
+    strategy: 'context'
+  };
+}
+
+/**
+ * DMP edit strategy: turns the hunk into a diff-match-patch patch and applies
+ * it to the slice of the file the hunk was matched against.
+ *
+ * The patch goes from the hunk's "before" text (context + removed lines) to
+ * its "after" text (context + added lines) and is applied only to the matched
+ * window, so lines outside the hunk are carried over untouched. Building the
+ * patch from the whole file to the hunk text, as was done previously, replaces
+ * the entire file with the hunk.
+ *
+ * @param hunk - Parsed hunk to apply.
+ * @param content - Current file content, one entry per line. Not mutated.
+ * @param matchPosition - Index in `content` where the hunk starts; a negative value means "no match".
+ * @returns The edited lines with a confidence score. Confidence is 0 and
+ *          `content` is returned untouched when there is no match or when any
+ *          part of the patch could not be applied.
+ */
+export function applyDMP(hunk: Hunk, content: string[], matchPosition: number): EditResult {
+  if (matchPosition < 0) {
+    return { confidence: 0, result: content, strategy: 'dmp' };
+  }
+
+  const contextLines = hunk.changes
+    .filter(c => c.type === 'context')
+    .map(c => c.originalLine || '');
+
+  // Full text of a hunk line: verbatim when the diff recorded it, otherwise
+  // rebuilt from indent + content (inferring the indent for added lines).
+  const lineText = (change: Hunk['changes'][number]): string => {
+    if (change.originalLine !== undefined) {
+      return change.originalLine;
+    }
+    const indent = change.type === 'add'
+      ? change.indent || inferIndentation(change.content, contextLines)
+      : change.indent;
+    return indent + change.content;
+  };
+
+  const beforeLines = hunk.changes.filter(c => c.type !== 'add').map(lineText);
+  const afterLines = hunk.changes.filter(c => c.type !== 'remove').map(lineText);
+
+  const dmp = new diff_match_patch();
+  const patches = dmp.patch_make(beforeLines.join('\n'), afterLines.join('\n'));
+
+  const windowEnd = matchPosition + beforeLines.length;
+  const targetText = content.slice(matchPosition, windowEnd).join('\n');
+  const [patchedText, applied] = dmp.patch_apply(patches, targetText);
+
+  // patch_apply reports one flag per patch; a false flag means that part of
+  // the edit was dropped, so the result cannot be trusted.
+  if (!applied.every(Boolean)) {
+    return { confidence: 0, result: content, strategy: 'dmp' };
+  }
+
+  // ''.split('\n') yields [''], which would leave a stray blank line behind
+  // when the hunk removes everything it matched.
+  const patchedLines = afterLines.length === 0 && patchedText === '' ? [] : patchedText.split('\n');
+
+  return {
+    confidence: getDMPSimilarity(targetText, patchedText),
+    result: [...content.slice(0, matchPosition), ...patchedLines, ...content.slice(windowEnd)],
+    strategy: 'dmp'
+  };
+}
+
+/**
+ * Git edit strategy: applies the hunk by splicing lines starting at
+ * `matchPosition`, and records the file before and after the edit as two
+ * commits in an in-memory repository.
+ *
+ * The repository lives in `vol`, which is imported from memfs at module level
+ * and therefore shared by every call; it is reset on entry and again on exit.
+ * NOTE(review): overlapping calls would wipe each other's scratch repository.
+ *
+ * @param hunk - Parsed hunk to apply.
+ * @param content - Current file content, one entry per line. Not mutated.
+ * @param matchPosition - Index in `content` where the hunk starts; a negative value means "no match".
+ * @returns The edited lines with a confidence score. Confidence is 0 and
+ *          `content` is returned untouched when there is no match, when a line
+ *          marked for removal cannot be found, or when any git/fs call throws.
+ */
+export async function applyGit(hunk: Hunk, content: string[], matchPosition: number): Promise<EditResult> {
+  const failure: EditResult = { confidence: 0, result: content, strategy: 'git' };
+  if (matchPosition < 0) {
+    return failure;
+  }
+
+  vol.reset();
+
+  try {
+    await git.init({ fs: memfs, dir: '/' });
+
+    const originalContent = content.join('\n');
+    await memfs.promises.writeFile('/file.txt', originalContent);
+
+    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
+    await git.commit({
+      fs: memfs,
+      dir: '/',
+      author: { name: 'Temp', email: 'temp@example.com' },
+      message: 'Initial commit'
+    });
+
+    await git.branch({ fs: memfs, dir: '/', ref: 'patch-branch' });
+    await git.checkout({ fs: memfs, dir: '/', ref: 'patch-branch' });
+
+    const newLines = originalContent.split('\n');
+    let offset = matchPosition;
+
+    // Untrimmed context lines, so indentation inference has something to find.
+    const contextLines = hunk.changes
+      .filter(c => c.type === 'context')
+      .map(c => c.originalLine || '');
+
+    for (const change of hunk.changes) {
+      if (change.type === 'add') {
+        // Insert the line as written in the diff; infer indentation only when
+        // the change does not carry its original text.
+        const text = change.originalLine !== undefined
+          ? change.originalLine
+          : (change.indent || inferIndentation(change.content, contextLines)) + change.content;
+        newLines.splice(offset, 0, text);
+        offset++;
+      } else if (change.type === 'remove') {
+        const index = newLines.findIndex(
+          (line, i) => i >= offset && line.trimStart() === change.content
+        );
+        if (index === -1) {
+          // Skipping the removal would leave a half-applied hunk (additions in,
+          // old lines still present), so give up instead.
+          return failure;
+        }
+        newLines.splice(index, 1);
+      } else {
+        offset++;
+      }
+    }
+
+    const modifiedContent = newLines.join('\n');
+    await memfs.promises.writeFile('/file.txt', modifiedContent);
+
+    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
+    await git.commit({
+      fs: memfs,
+      dir: '/',
+      author: { name: 'Temp', email: 'temp@example.com' },
+      message: 'Apply changes'
+    });
+
+    const windowEnd = matchPosition + hunk.changes.length;
+    const similarity = getDMPSimilarity(
+      content.slice(matchPosition, windowEnd).join('\n'),
+      newLines.slice(matchPosition, windowEnd).join('\n')
+    );
+
+    return {
+      confidence: similarity,
+      result: newLines,
+      strategy: 'git'
+    };
+  } catch {
+    // Best-effort fallback strategy: any git/fs failure is reported as
+    // "could not apply" rather than thrown to the caller.
+    return failure;
+  } finally {
+    vol.reset();
+  }
+}
+
+/**
+ * Entry point for applying one hunk. Runs the context, DMP and Git strategies
+ * and returns a result whose confidence exceeds 0.9.
+ *
+ * - Normal mode: the search `confidence` must be at least 0.9, otherwise no
+ *   strategy is tried. Strategies then run one after another and the first
+ *   acceptable result wins.
+ * - Debug mode: the search-confidence gate is skipped, all strategies are
+ *   started, and the acceptable result with the highest confidence wins.
+ *
+ * @param hunk - Parsed hunk to apply.
+ * @param content - Current file content, one entry per line.
+ * @param matchPosition - Index in `content` where the hunk was located.
+ * @param confidence - Confidence reported by the search step for `matchPosition`.
+ * @param debug - Enables debug mode as described above.
+ * @returns The winning result, or `{ confidence: 0, result: content, strategy: 'none' }`
+ *          when nothing qualifies.
+ */
+export async function applyEdit(hunk: Hunk, content: string[], matchPosition: number, confidence: number, debug: boolean = false): Promise<EditResult> {
+  const MIN_CONFIDENCE = 0.9;
+  const failure = (): EditResult => ({ confidence: 0, result: content, strategy: 'none' });
+
+  // Outside debug mode a weak search match is rejected before any strategy runs.
+  if (!debug && confidence < MIN_CONFIDENCE) {
+    return failure();
+  }
+
+  const attempts: Array<() => EditResult | Promise<EditResult>> = [
+    () => applyContextMatching(hunk, content, matchPosition),
+    () => applyDMP(hunk, content, matchPosition),
+    () => applyGit(hunk, content, matchPosition)
+  ];
+
+  if (!debug) {
+    // Normal mode: stop at the first strategy that clears the threshold.
+    for (const attempt of attempts) {
+      const outcome = await attempt();
+      if (outcome.confidence > MIN_CONFIDENCE) {
+        return outcome;
+      }
+    }
+    return failure();
+  }
+
+  // Debug mode: run every strategy, then keep the highest-scoring acceptable
+  // result (the earliest one wins a tie).
+  const outcomes = await Promise.all(attempts.map(async attempt => attempt()));
+  let best: EditResult | undefined;
+  for (const outcome of outcomes) {
+    if (outcome.confidence > MIN_CONFIDENCE && (best === undefined || outcome.confidence > best.confidence)) {
+      best = outcome;
+    }
+  }
+  return best ?? failure();
+}
--- a/src/core/diff/strategies/new-unified/index.ts
+++ b/src/core/diff/strategies/new-unified/index.ts
@@ -0,0 +1,181 @@
+import { Diff, Hunk } from "./types"
+import { findBestMatch, prepareSearchString } from "./search-strategies"
+import { applyEdit } from "./edit-strategies"
+import { DiffResult, DiffStrategy } from "../../types"
+
+/**
+ * Diff strategy that applies unified-diff style hunks located by their content
+ * rather than by line numbers.
+ *
+ * Each hunk is located with `findBestMatch` and applied with `applyEdit`;
+ * hunks are processed in order, each against the output of the previous one.
+ */
+export class NewUnifiedDiffStrategy implements DiffStrategy {
+	/**
+	 * Splits raw diff text into hunks.
+	 *
+	 * Everything before the first `@@` line (file headers, for example) is
+	 * ignored. Inside a hunk, lines starting with a space, `+` or `-` become
+	 * context, add and remove changes; lines with any other first character
+	 * are ignored. Hunks without any change line are dropped.
+	 *
+	 * @param diff - Diff text as produced by the model.
+	 * @returns The parsed hunks, in the order they appear.
+	 */
+	private parseUnifiedDiff(diff: string): Diff {
+		const hunks: Hunk[] = []
+		let currentHunk: Hunk | null = null
+
+		for (const line of diff.split("\n")) {
+			if (line.startsWith("@@")) {
+				// Same rule as for the final hunk below: a header that was not
+				// followed by any change line carries no edit.
+				if (currentHunk && currentHunk.changes.length > 0) {
+					hunks.push(currentHunk)
+				}
+				currentHunk = { changes: [] }
+				continue
+			}
+
+			// Not inside a hunk yet: still in the file-header part of the diff.
+			if (!currentHunk) {
+				continue
+			}
+
+			let type: "context" | "add" | "remove"
+			if (line.startsWith(" ")) {
+				type = "context"
+			} else if (line.startsWith("+")) {
+				type = "add"
+			} else if (line.startsWith("-")) {
+				type = "remove"
+			} else {
+				continue
+			}
+
+			// Drop the diff marker, then separate leading whitespace from the text.
+			const content = line.slice(1)
+			const indent = /^\s*/.exec(content)?.[0] ?? ""
+
+			currentHunk.changes.push({
+				type,
+				content: content.slice(indent.length),
+				indent,
+				originalLine: content,
+			})
+		}
+
+		if (currentHunk && currentHunk.changes.length > 0) {
+			hunks.push(currentHunk)
+		}
+
+		return { hunks }
+	}
+
+	/**
+	 * Returns the prompt text that describes the `apply_diff` tool.
+	 *
+	 * @param cwd - Current working directory, interpolated into the `path` parameter description.
+	 */
+	getToolDescription(cwd: string): string {
+		return `## apply_diff
+Description: Apply a unified diff to a file at the specified path. This tool is useful when you need to make specific modifications to a file based on a set of changes provided in unified diff format (diff -U0).
+
+Make sure you include the first 2 lines with the file paths.
+Don't include timestamps with the file paths.
+
+Start each hunk of changes with a \`@@ ... @@\` line.
+Don't include line numbers like \`diff -U0\` does.
+The user's patch tool doesn't need them.
+
+Indentation matters in the diffs!
+
+Start a new hunk for each section of the file that needs changes.
+
+Only output hunks that specify changes with \`+\` or \`-\` lines.
+Skip any hunks that are entirely unchanging \` \` lines.
+
+The user's patch tool needs CORRECT patches that apply cleanly against the current contents of the file!
+Think carefully and make sure you include and mark all lines that need to be removed or changed as \`-\` lines.
+Make sure you mark all new or modified lines with \`+\`.
+Don't leave out any lines or the diff patch won't apply correctly.
+
+Output hunks in whatever order makes the most sense.
+Hunks don't need to be in any particular order.
+
+The hunks do not need line numbers.
+
+When editing a function, method, loop, etc use a hunk to replace the *entire* code block.
+Delete the entire existing version with \`-\` lines and then add a new, updated version with \`+\` lines.
+This will help you generate correct code and correct diffs.
+
+To move code within a file, use 2 hunks: 1 to delete it from its current location, 1 to insert it in the new location.
+
+Parameters:
+- path: (required) The path of the file to apply the diff to (relative to the current working directory ${cwd})
+- diff: (required) The diff content in unified format to apply to the file.
+
+For each file that needs to be changed, write out the changes similar to a unified diff like \`diff -U0\` would produce.
+
+
+Example:
+\`\`\`diff
+--- mathweb/flask/app.py
++++ mathweb/flask/app.py
+@@ ... @@
+-class MathWeb:
++import sympy
++
++class MathWeb:
+@@ ... @@
+-def is_prime(x):
+-    if x < 2:
+-        return False
+-    for i in range(2, int(math.sqrt(x)) + 1):
+-        if x % i == 0:
+-            return False
+-    return True
+@@ ... @@
+-@app.route('/prime/<int:n>')
+-def nth_prime(n):
+-    count = 0
+-    num = 1
+-    while count < n:
+-        num += 1
+-        if is_prime(num):
+-            count += 1
+-    return str(num)
++@app.route('/prime/<int:n>')
++def nth_prime(n):
++    count = 0
++    num = 1
++    while count < n:
++        num += 1
++        if sympy.isprime(num):
++            count += 1
++    return str(num)
+\`\`\`
+
+Usage:
+<apply_diff>
+<path>File path here</path>
+<diff>
+Your diff here
+</diff>
+</apply_diff>`
+	}
+
+	/**
+	 * Applies every hunk of `diffContent` to `originalContent`.
+	 *
+	 * Hunks are applied in order, each against the result of the previous
+	 * one. The first hunk that cannot be applied with confidence above 0.9
+	 * aborts the whole operation.
+	 *
+	 * @param originalContent - Current text of the file.
+	 * @param diffContent - Diff text to apply.
+	 * @param startLine - Not used by this strategy.
+	 * @param endLine - Not used by this strategy.
+	 * @returns `{ success: true, content }` with the edited text, or
+	 *          `{ success: false, error }` when the diff contains no hunks or
+	 *          a hunk could not be applied.
+	 */
+	async applyDiff(
+		originalContent: string,
+		diffContent: string,
+		startLine?: number,
+		endLine?: number
+	): Promise<DiffResult> {
+		const MIN_CONFIDENCE = 0.9
+		const parsedDiff = this.parseUnifiedDiff(diffContent)
+
+		// Reporting success here would tell the caller an edit was made when
+		// nothing in the diff could be parsed.
+		if (parsedDiff.hunks.length === 0) {
+			return { success: false, error: "No hunks with changes found in diff" }
+		}
+
+		let result = originalContent.split("\n")
+
+		for (const hunk of parsedDiff.hunks) {
+			const contextStr = prepareSearchString(hunk.changes)
+			const { index: matchPosition, confidence } = findBestMatch(contextStr, result)
+
+			const editResult = await applyEdit(hunk, result, matchPosition, confidence)
+			if (editResult.confidence > MIN_CONFIDENCE) {
+				result = editResult.result
+			} else {
+				return { success: false, error: `Failed to apply edit using ${editResult.strategy} strategy` }
+			}
+		}
+
+		return { success: true, content: result.join("\n") }
+	}
+}
--- a/src/core/diff/strategies/new-unified/search-strategies.ts
+++ b/src/core/diff/strategies/new-unified/search-strategies.ts
@@ -0,0 +1,131 @@
+import { compareTwoStrings } from 'string-similarity';
+import { closest } from 'fastest-levenshtein';
+import { diff_match_patch } from 'diff-match-patch';
+import { Change } from './types';
+
+/**
+ * Outcome of a single search strategy.
+ */
+export type SearchResult = {
+  /** Line index in the searched content where the match starts, or -1 when nothing matched. */
+  index: number;
+  /** Score attached to the match; 0 when nothing matched. */
+  confidence: number;
+  /** Name of the strategy that produced this result: 'exact', 'similarity', 'levenshtein' or 'none'. */
+  strategy: string;
+};
+
+// Score that getDMPSimilarity must reach for a candidate match to count as validated.
+// TODO: this should be configurable
+const MIN_CONFIDENCE = 0.95;
+
+/**
+ * Builds the text to look for in the target file: the `content` of every
+ * context and removed change, in hunk order, joined with newlines. Added
+ * lines are left out.
+ *
+ * @param changes - Changes of one hunk.
+ * @returns The newline-joined search text (empty when no change qualifies).
+ */
+export function prepareSearchString(changes: Change[]): string {
+  const wanted: string[] = [];
+  for (const change of changes) {
+    if (change.type === 'context' || change.type === 'remove') {
+      wanted.push(change.content);
+    }
+  }
+  return wanted.join('\n');
+}
+
+/**
+ * Scores how alike two texts are by delegating to `compareTwoStrings`.
+ *
+ * @param original - First text.
+ * @param modified - Second text.
+ * @returns The score returned by `compareTwoStrings`.
+ */
+export function evaluateSimilarity(original: string, modified: string): number {
+  const score = compareTwoStrings(original, modified);
+  return score;
+}
+
+/**
+ * Validation score based on diff-match-patch.
+ *
+ * Diffs `original` against `modified`, turns that diff into patches, applies
+ * the patches back onto `original`, and scores the patched text against
+ * `modified` with `evaluateSimilarity`.
+ *
+ * NOTE(review): the patches are derived from this exact pair of texts, so the
+ * patched text presumably reproduces `modified` and the score would sit at or
+ * near 1 regardless of how different the inputs are. Confirm that is intended.
+ *
+ * @param original - Text the patches are built from and applied to.
+ * @param modified - Text the patched result is compared with.
+ * @returns The similarity score of the patched text versus `modified`.
+ */
+export function getDMPSimilarity(original: string, modified: string): number {
+  const engine = new diff_match_patch();
+  const delta = engine.diff_main(original, modified);
+  engine.diff_cleanupSemantic(delta);
+  const [roundTripped] = engine.patch_apply(engine.patch_make(original, delta), original);
+  return evaluateSimilarity(roundTripped, modified);
+}
+
+/**
+ * Exact match strategy: looks for `searchStr` as a literal substring of the
+ * content from `startIndex` onwards (lines joined with newlines).
+ *
+ * @param searchStr - Text to find; may span several lines.
+ * @param content - File content, one entry per line.
+ * @param startIndex - Line index at which the search begins.
+ * @returns The line index containing the start of the hit, with confidence 1.0
+ *          when the DMP validation passes and 0.9 otherwise; index -1 and
+ *          confidence 0 when the substring is not present.
+ */
+export function findExactMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
+  const haystack = content.slice(startIndex).join('\n');
+  const charOffset = haystack.indexOf(searchStr);
+  if (charOffset === -1) {
+    return { index: -1, confidence: 0, strategy: 'exact' };
+  }
+
+  // Counting the line breaks in front of the hit converts the character
+  // offset into a line index.
+  const lineIndex = startIndex + haystack.slice(0, charOffset).split('\n').length - 1;
+  const lineCount = searchStr.split('\n').length;
+  const matchedContent = content.slice(lineIndex, lineIndex + lineCount).join('\n');
+
+  const validated = getDMPSimilarity(searchStr, matchedContent) >= MIN_CONFIDENCE;
+  return {
+    index: lineIndex,
+    confidence: validated ? 1.0 : 0.9,
+    strategy: 'exact'
+  };
+}
+
+/**
+ * String similarity strategy: slides a window with as many lines as
+ * `searchStr` over the content and keeps the window with the best score.
+ *
+ * A window is considered only when its `compareTwoStrings` score is at least
+ * 0.8 and beats the best score so far. Windows that fail the DMP validation
+ * have their score multiplied by 0.9 before it is compared again.
+ *
+ * @param searchStr - Text to find; may span several lines.
+ * @param content - File content, one entry per line.
+ * @param startIndex - Line index of the first window.
+ * @returns The best window's start index and (adjusted) score, or index -1
+ *          and confidence 0 when no window qualifies.
+ */
+export function findSimilarityMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
+  const minScore = 0.8;
+  const windowSize = searchStr.split('\n').length;
+  const lastStart = content.length - windowSize;
+
+  let bestIndex = -1;
+  let bestScore = 0;
+
+  for (let start = startIndex; start <= lastStart; start++) {
+    const windowStr = content.slice(start, start + windowSize).join('\n');
+    const score = compareTwoStrings(searchStr, windowStr);
+    if (!(score > bestScore && score >= minScore)) {
+      continue;
+    }
+
+    // Penalise windows that do not pass the DMP validation.
+    const validated = getDMPSimilarity(searchStr, windowStr) >= MIN_CONFIDENCE;
+    const adjustedScore = validated ? score : score * 0.9;
+    if (adjustedScore > bestScore) {
+      bestScore = adjustedScore;
+      bestIndex = start;
+    }
+  }
+
+  const confidence = bestIndex !== -1 ? bestScore : 0;
+  return { index: bestIndex, confidence, strategy: 'similarity' };
+}
+
+/**
+ * Levenshtein strategy: builds every window with as many lines as `searchStr`
+ * and lets `closest` choose the one nearest to `searchStr`.
+ *
+ * The confidence is fixed rather than distance-based: 0.7 when the chosen
+ * window passes the DMP validation, 0.6 otherwise.
+ *
+ * @param searchStr - Text to find; may span several lines.
+ * @param content - File content, one entry per line.
+ * @param startIndex - Line index of the first window.
+ * @returns The chosen window's start index, or index -1 and confidence 0 when
+ *          the content is too short to form a window.
+ */
+export function findLevenshteinMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
+  const windowSize = searchStr.split('\n').length;
+  const lastStart = content.length - windowSize;
+
+  const candidates: string[] = [];
+  for (let start = startIndex; start <= lastStart; start++) {
+    candidates.push(content.slice(start, start + windowSize).join('\n'));
+  }
+
+  if (candidates.length === 0) {
+    return { index: -1, confidence: 0, strategy: 'levenshtein' };
+  }
+
+  const closestMatch = closest(searchStr, candidates);
+  const validated = getDMPSimilarity(searchStr, closestMatch) >= MIN_CONFIDENCE;
+  return {
+    // Candidates were collected from startIndex on, so shift back to a file index.
+    index: startIndex + candidates.indexOf(closestMatch),
+    confidence: validated ? 0.7 : 0.6,
+    strategy: 'levenshtein'
+  };
+}
+
+/**
+ * Runs the exact, similarity and Levenshtein strategies and returns the result
+ * with the highest confidence. On a tie the earlier strategy wins.
+ *
+ * @param searchStr - Text to find; may span several lines.
+ * @param content - File content, one entry per line.
+ * @param startIndex - Line index at which each strategy starts searching.
+ * @returns The best result, or `{ index: -1, confidence: 0, strategy: 'none' }`
+ *          when no strategy reports a confidence above 0.
+ */
+export function findBestMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
+  const noMatch: SearchResult = { index: -1, confidence: 0, strategy: 'none' };
+
+  return [findExactMatch, findSimilarityMatch, findLevenshteinMatch]
+    .map(strategy => strategy(searchStr, content, startIndex))
+    .reduce(
+      (best, candidate) => (candidate.confidence > best.confidence ? candidate : best),
+      noMatch
+    );
+}
--- a/src/core/diff/strategies/new-unified/types.ts
+++ b/src/core/diff/strategies/new-unified/types.ts
@@ -0,0 +1,14 @@
+/**
+ * One line of a hunk.
+ */
+export type Change = {
+  /** Whether the line is unchanged context, an addition or a removal. */
+  type: 'context' | 'add' | 'remove';
+  /** Text of the line. NOTE(review): presumably without its leading whitespace — confirm against the parser. */
+  content: string;
+  /** Leading whitespace of the line. */
+  indent: string;
+  /** NOTE(review): presumably the full line as written in the diff, minus the diff marker — confirm against the parser. */
+  originalLine?: string;
+};
+
+/**
+ * A group of changes that is located and applied as one unit.
+ */
+export type Hunk = {
+  /** The hunk's lines, in order. */
+  changes: Change[];
+};
+
+/**
+ * A parsed diff: the list of hunks it contains.
+ */
+export type Diff = {
+  /** Hunks of the diff, in order. */
+  hunks: Hunk[];
+};