Add New Unified Diff Strategy Implementation

- Introduced a new unified diff strategy with support for context matching, DMP, and Git-based edits.
- Implemented helper functions for parsing unified diffs and evaluating similarity.
- Added types for changes, hunks, and diffs to enhance type safety.
- Created a main edit function that applies strategies sequentially based on confidence levels.
- Included detailed descriptions and usage examples for the new strategy.
This commit is contained in:
Daniel Riccio
2025-01-07 19:01:12 -05:00
parent 2857dd4996
commit 594481643b
4 changed files with 562 additions and 0 deletions

View File

@@ -0,0 +1,236 @@
import { diff_match_patch } from 'diff-match-patch';
import * as git from 'isomorphic-git';
import { fs as memfs, vol } from 'memfs';
import { Hunk } from './types';
import { getDMPSimilarity } from './search-strategies';
/**
 * Picks the leading whitespace to use for `line`.
 *
 * Resolution order:
 * 1. the line's own leading whitespace, when it has any;
 * 2. the leading whitespace of the first indented entry in `contextLines`;
 * 3. `previousIndent` when neither of the above yields anything.
 *
 * @param line - The line whose indentation is wanted.
 * @param contextLines - Lines to borrow indentation from.
 * @param previousIndent - Fallback indentation (defaults to the empty string).
 * @returns The whitespace prefix to use.
 */
function inferIndentation(line: string, contextLines: string[], previousIndent: string = ''): string {
  const leadingWhitespace = /^(\s+)/;

  const ownIndent = leadingWhitespace.exec(line);
  if (ownIndent) {
    return ownIndent[1];
  }

  // Past this point `line` has no leading whitespace, so its depth is 0 and can
  // never exceed the depth of an indented context line. The former "one level
  // deeper" branch was therefore unreachable and has been removed; the first
  // indented context line's whitespace is used as-is.
  for (const contextLine of contextLines) {
    const contextIndent = leadingWhitespace.exec(contextLine);
    if (contextIndent) {
      return contextIndent[1];
    }
  }
  return previousIndent;
}
/**
 * Outcome of running one edit strategy against a hunk.
 */
export type EditResult = {
// Score used to accept or reject the edit; the strategies in this file set it
// to 0 on failure and to a getDMPSimilarity score otherwise.
confidence: number;
// Full file content, one entry per line, after the attempted edit (the
// unmodified input lines when the strategy gives up).
result: string[];
// Label of the strategy that produced this result: 'context', 'dmp', 'git' or 'none'.
strategy: string;
};
/**
 * Context-matching edit strategy.
 *
 * Rebuilds the file by copying everything before `matchPosition`, replaying the
 * hunk's changes (context lines are emitted, added lines are emitted with an
 * inferred indent when they carry none, removed lines are skipped in the
 * source), and finally appending the untouched tail of the file.
 *
 * @param hunk - The hunk to apply.
 * @param content - Current file content, one entry per line.
 * @param matchPosition - Line index where the hunk starts, or -1 when no match was found.
 * @returns The edited lines plus a confidence score; confidence 0 and the
 *          original `content` when `matchPosition` is -1.
 */
export function applyContextMatching(hunk: Hunk, content: string[], matchPosition: number): EditResult {
  if (matchPosition === -1) {
    return { confidence: 0, result: content, strategy: 'context' };
  }

  // Loop-invariant: the context lines offered to inferIndentation do not depend
  // on the change being processed, so build the list once instead of once per
  // added line.
  const contextOriginals = hunk.changes
    .filter(c => c.type === 'context')
    .map(c => c.originalLine || '');

  const rebuilt = content.slice(0, matchPosition);
  let sourceIndex = matchPosition;
  let previousIndent = '';

  for (const change of hunk.changes) {
    switch (change.type) {
      case 'context':
        rebuilt.push(change.originalLine || (change.indent + change.content));
        previousIndent = change.indent;
        sourceIndex++;
        break;
      case 'add': {
        const indent = change.indent || inferIndentation(change.content, contextOriginals, previousIndent);
        rebuilt.push(indent + change.content);
        previousIndent = indent;
        break;
      }
      case 'remove':
        // The removed line is consumed from the source without being emitted.
        sourceIndex++;
        break;
    }
  }

  // concat instead of push(...tail): spreading a very large tail into a call
  // can exceed the engine's argument limit and throw a RangeError.
  const newResult = rebuilt.concat(content.slice(sourceIndex));

  // Validate the result by scoring the edited window against the original one.
  const windowEnd = matchPosition + hunk.changes.length;
  const similarity = getDMPSimilarity(
    content.slice(matchPosition, windowEnd).join('\n'),
    newResult.slice(matchPosition, windowEnd).join('\n')
  );

  return {
    confidence: similarity,
    result: newResult,
    strategy: 'context'
  };
}
/**
 * diff-match-patch edit strategy.
 *
 * Builds the "before" text (context + removed lines) and "after" text
 * (context + added lines) of the hunk, derives patches from that pair, and
 * applies them to the region of the file starting at `matchPosition`. The
 * patched region is then spliced back between the untouched head and tail of
 * the file.
 *
 * The previous implementation diffed the entire file against text built from
 * the hunk alone, so applying the patch replaced the whole file with the hunk
 * text (and dropped the indentation of context lines). Restricting the patch
 * to the matched region keeps the rest of the file intact.
 *
 * @param hunk - The hunk to apply.
 * @param content - Current file content, one entry per line.
 * @param matchPosition - Line index where the hunk starts, or -1 when no match was found.
 * @returns The edited lines plus a confidence score; confidence 0 and the
 *          original `content` when there is no match or a patch fails to apply.
 */
export function applyDMP(hunk: Hunk, content: string[], matchPosition: number): EditResult {
  if (matchPosition === -1) {
    return { confidence: 0, result: content, strategy: 'dmp' };
  }

  const dmp = new diff_match_patch();
  const contextLines = hunk.changes
    .filter(c => c.type === 'context')
    .map(c => c.content);

  // Reconstruct both sides of the hunk with their indentation.
  const beforeLines: string[] = [];
  const afterLines: string[] = [];
  for (const change of hunk.changes) {
    if (change.type === 'add') {
      const indent = change.indent || inferIndentation(change.content, contextLines);
      afterLines.push(indent + change.content);
      continue;
    }
    const sourceLine = change.originalLine ?? (change.indent + change.content);
    beforeLines.push(sourceLine);
    if (change.type === 'context') {
      afterLines.push(sourceLine);
    }
  }

  // Patch only the lines the hunk covers in the file.
  const regionEnd = matchPosition + beforeLines.length;
  const regionText = content.slice(matchPosition, regionEnd).join('\n');
  const patches = dmp.patch_make(beforeLines.join('\n'), afterLines.join('\n'));
  const [patchedRegion, applied] = dmp.patch_apply(patches, regionText);
  if (applied.some(ok => !ok)) {
    // At least one patch could not be placed; report failure instead of a partial edit.
    return { confidence: 0, result: content, strategy: 'dmp' };
  }

  // ''.split('\n') yields [''], which would leave a stray blank line behind
  // when the hunk removes everything it matched.
  const patchedLines = afterLines.length === 0 && patchedRegion === '' ? [] : patchedRegion.split('\n');
  const newResult = content.slice(0, matchPosition).concat(patchedLines, content.slice(regionEnd));

  // Score the edited window the same way the other strategies in this file do.
  const windowEnd = matchPosition + hunk.changes.length;
  const similarity = getDMPSimilarity(
    content.slice(matchPosition, windowEnd).join('\n'),
    newResult.slice(matchPosition, windowEnd).join('\n')
  );

  return {
    confidence: similarity,
    result: newResult,
    strategy: 'dmp'
  };
}
/**
 * Git edit strategy.
 *
 * Commits the original content to a throw-away in-memory repository, switches
 * to a `patch-branch`, applies the hunk line by line, and commits the modified
 * file. The returned lines are the ones produced by the in-process line edits;
 * the repository is reset before and after the attempt.
 *
 * @param hunk - The hunk to apply.
 * @param content - Current file content, one entry per line.
 * @param matchPosition - Line index where the hunk starts, or -1 when no match was found.
 * @returns The edited lines plus a confidence score; confidence 0 and the
 *          original `content` when there is no match or any step throws.
 */
export async function applyGit(hunk: Hunk, content: string[], matchPosition: number): Promise<EditResult> {
  if (matchPosition === -1) {
    return { confidence: 0, result: content, strategy: 'git' };
  }

  // Start from an empty in-memory volume.
  vol.reset();

  try {
    await git.init({ fs: memfs, dir: '/' });

    const originalContent = content.join('\n');
    await memfs.promises.writeFile('/file.txt', originalContent);
    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
    await git.commit({
      fs: memfs,
      dir: '/',
      author: { name: 'Temp', email: 'temp@example.com' },
      message: 'Initial commit'
    });

    await git.branch({ fs: memfs, dir: '/', ref: 'patch-branch' });
    await git.checkout({ fs: memfs, dir: '/', ref: 'patch-branch' });

    const newLines = originalContent.split('\n');
    let offset = matchPosition;
    const contextLines = hunk.changes
      .filter(c => c.type === 'context')
      .map(c => c.content);

    for (const change of hunk.changes) {
      if (change.type === 'add') {
        const indent = change.indent || inferIndentation(change.content, contextLines);
        newLines.splice(offset, 0, indent + change.content);
        offset++;
      } else if (change.type === 'remove') {
        // Remove the first line at or after the cursor whose text, ignoring
        // leading whitespace, equals the removed content. The cursor stays put
        // because the following lines shift up.
        const index = newLines.findIndex(
          (line, i) => i >= offset && line.trimStart() === change.content
        );
        if (index !== -1) {
          newLines.splice(index, 1);
        }
      } else {
        // Context line: just step over it.
        offset++;
      }
    }

    const modifiedContent = newLines.join('\n');
    await memfs.promises.writeFile('/file.txt', modifiedContent);
    await git.add({ fs: memfs, dir: '/', filepath: 'file.txt' });
    await git.commit({
      fs: memfs,
      dir: '/',
      author: { name: 'Temp', email: 'temp@example.com' },
      message: 'Apply changes'
    });

    const windowEnd = matchPosition + hunk.changes.length;
    const similarity = getDMPSimilarity(
      content.slice(matchPosition, windowEnd).join('\n'),
      newLines.slice(matchPosition, windowEnd).join('\n')
    );

    return {
      confidence: similarity,
      result: newLines,
      strategy: 'git'
    };
  } catch (error) {
    // Best effort: any failure in the in-memory git workflow is reported as a
    // zero-confidence result so the caller can fall through to its own failure path.
    return { confidence: 0, result: content, strategy: 'git' };
  } finally {
    // Always discard the temporary repository.
    vol.reset();
  }
}
/**
 * Entry point for applying a hunk: runs the context, DMP and git strategies
 * and returns the first acceptable result.
 *
 * - Outside debug mode, nothing is attempted when the incoming match
 *   `confidence` is below 0.9. Otherwise the strategies run one after another
 *   and the first result whose confidence exceeds 0.9 is returned.
 * - In debug mode every strategy is run and the highest-confidence result
 *   above 0.9 is returned (the earliest strategy wins ties).
 *
 * @param hunk - The hunk to apply.
 * @param content - Current file content, one entry per line.
 * @param matchPosition - Line index where the hunk starts, or -1 when no match was found.
 * @param confidence - Confidence of the match that produced `matchPosition`.
 * @param debug - Run all strategies regardless of `confidence`.
 * @returns The winning result, or `{ confidence: 0, result: content, strategy: 'none' }`
 *          when nothing qualifies.
 */
export async function applyEdit(hunk: Hunk, content: string[], matchPosition: number, confidence: number, debug: boolean = false): Promise<EditResult> {
  const MIN_CONFIDENCE = 0.9;
  const failed = (): EditResult => ({ confidence: 0, result: content, strategy: 'none' });

  if (!debug && confidence < MIN_CONFIDENCE) {
    return failed();
  }

  // Ordered list of strategies to try.
  const attempts: Array<() => EditResult | Promise<EditResult>> = [
    () => applyContextMatching(hunk, content, matchPosition),
    () => applyDMP(hunk, content, matchPosition),
    () => applyGit(hunk, content, matchPosition)
  ];

  if (!debug) {
    for (const attempt of attempts) {
      const outcome = await attempt();
      if (outcome.confidence > MIN_CONFIDENCE) {
        return outcome;
      }
    }
    return failed();
  }

  // Debug mode: collect every strategy's result, then keep the best one.
  const outcomes = await Promise.all(attempts.map(async attempt => attempt()));
  let best: EditResult | undefined;
  for (const outcome of outcomes) {
    if (outcome.confidence > MIN_CONFIDENCE && (best === undefined || outcome.confidence > best.confidence)) {
      best = outcome;
    }
  }
  return best === undefined ? failed() : best;
}

View File

@@ -0,0 +1,181 @@
import { Diff, Hunk } from "./types"
import { findBestMatch, prepareSearchString } from "./search-strategies"
import { applyEdit } from "./edit-strategies"
import { DiffResult, DiffStrategy } from "../../types"
/**
 * Diff strategy that applies unified-diff style hunks (without line numbers)
 * by locating each hunk in the file and editing the matched region.
 */
export class NewUnifiedDiffStrategy implements DiffStrategy {
	/**
	 * Splits a unified diff into hunks of context/add/remove changes.
	 *
	 * Everything before the first `@@` line is ignored. Within a hunk only
	 * lines starting with a space, `+` or `-` are recorded; each change keeps
	 * the line without its marker (`originalLine`), its leading whitespace
	 * (`indent`) and the remainder (`content`). Hunks that contain no changes
	 * are dropped, whether they are followed by another `@@` line or end the
	 * input.
	 */
	private parseUnifiedDiff(diff: string): Diff {
		const hunks: Hunk[] = []
		let currentHunk: Hunk | null = null

		for (const line of diff.split("\n")) {
			if (line.startsWith("@@")) {
				// Close the previous hunk, skipping empty ones consistently.
				if (currentHunk && currentHunk.changes.length > 0) {
					hunks.push(currentHunk)
				}
				currentHunk = { changes: [] }
				continue
			}

			// Header lines ahead of the first hunk marker.
			if (!currentHunk) {
				continue
			}

			let type: "context" | "add" | "remove"
			if (line.startsWith(" ")) {
				type = "context"
			} else if (line.startsWith("+")) {
				type = "add"
			} else if (line.startsWith("-")) {
				type = "remove"
			} else {
				continue
			}

			// Extract the complete indentation for each line
			const content = line.slice(1) // Remove the diff marker
			const indentMatch = content.match(/^(\s*)/)
			const indent = indentMatch ? indentMatch[0] : ""

			currentHunk.changes.push({
				type,
				content: content.slice(indent.length),
				indent,
				originalLine: content,
			})
		}

		if (currentHunk && currentHunk.changes.length > 0) {
			hunks.push(currentHunk)
		}

		return { hunks }
	}

	/**
	 * Returns the prompt text describing the `apply_diff` tool.
	 *
	 * @param cwd - Working directory interpolated into the `path` parameter description.
	 */
	getToolDescription(cwd: string): string {
		return `## apply_diff
Description: Apply a unified diff to a file at the specified path. This tool is useful when you need to make specific modifications to a file based on a set of changes provided in unified diff format (diff -U0).
Make sure you include the first 2 lines with the file paths.
Don't include timestamps with the file paths.
Start each hunk of changes with a \`@@ ... @@\` line.
Don't include line numbers like \`diff -U0\` does.
The user's patch tool doesn't need them.
Indentation matters in the diffs!
Start a new hunk for each section of the file that needs changes.
Only output hunks that specify changes with \`+\` or \`-\` lines.
Skip any hunks that are entirely unchanging \` \` lines.
The user's patch tool needs CORRECT patches that apply cleanly against the current contents of the file!
Think carefully and make sure you include and mark all lines that need to be removed or changed as \`-\` lines.
Make sure you mark all new or modified lines with \`+\`.
Don't leave out any lines or the diff patch won't apply correctly.
Output hunks in whatever order makes the most sense.
Hunks don't need to be in any particular order.
The hunks do not need line numbers.
When editing a function, method, loop, etc use a hunk to replace the *entire* code block.
Delete the entire existing version with \`-\` lines and then add a new, updated version with \`+\` lines.
This will help you generate correct code and correct diffs.
To move code within a file, use 2 hunks: 1 to delete it from its current location, 1 to insert it in the new location.
Parameters:
- path: (required) The path of the file to apply the diff to (relative to the current working directory ${cwd})
- diff: (required) The diff content in unified format to apply to the file.
For each file that needs to be changed, write out the changes similar to a unified diff like \`diff -U0\` would produce.
Example:
\`\`\`diff
--- mathweb/flask/app.py
+++ mathweb/flask/app.py
@@ ... @@
-class MathWeb:
+import sympy
+
+class MathWeb:
@@ ... @@
-def is_prime(x):
- if x < 2:
- return False
- for i in range(2, int(math.sqrt(x)) + 1):
- if x % i == 0:
- return False
- return True
@@ ... @@
-@app.route('/prime/<int:n>')
-def nth_prime(n):
- count = 0
- num = 1
- while count < n:
- num += 1
- if is_prime(num):
- count += 1
- return str(num)
+@app.route('/prime/<int:n>')
+def nth_prime(n):
+ count = 0
+ num = 1
+ while count < n:
+ num += 1
+ if sympy.isprime(num):
+ count += 1
+ return str(num)
\`\`\`
Usage:
<apply_diff>
<path>File path here</path>
<diff>
Your diff here
</diff>
</apply_diff>`
	}

	/**
	 * Applies every hunk of `diffContent` to `originalContent`, in order.
	 *
	 * For each hunk the search text is built from its context and removed
	 * lines, the best match is located in the current (already edited)
	 * content, and the edit is applied. Processing stops at the first hunk
	 * whose edit confidence does not exceed 0.9.
	 *
	 * @param originalContent - File content to edit.
	 * @param diffContent - Unified diff text.
	 * @param startLine - Not used by this implementation.
	 * @param endLine - Not used by this implementation.
	 * @returns `{ success: true, content }` with the edited text, or
	 *          `{ success: false, error }` naming the strategy reported by the failed edit.
	 */
	async applyDiff(
		originalContent: string,
		diffContent: string,
		startLine?: number,
		endLine?: number
	): Promise<DiffResult> {
		const MIN_CONFIDENCE = 0.9
		const parsedDiff = this.parseUnifiedDiff(diffContent)
		let result = originalContent.split("\n")

		for (const hunk of parsedDiff.hunks) {
			const contextStr = prepareSearchString(hunk.changes)
			const { index: matchPosition, confidence } = findBestMatch(contextStr, result)

			const editResult = await applyEdit(hunk, result, matchPosition, confidence)
			if (editResult.confidence > MIN_CONFIDENCE) {
				result = editResult.result
			} else {
				return { success: false, error: `Failed to apply edit using ${editResult.strategy} strategy` }
			}
		}

		return { success: true, content: result.join("\n") }
	}
}

View File

@@ -0,0 +1,131 @@
import { compareTwoStrings } from 'string-similarity';
import { closest } from 'fastest-levenshtein';
import { diff_match_patch } from 'diff-match-patch';
import { Change } from './types';
/**
 * Outcome of one search strategy looking for a block of text in file content.
 */
export type SearchResult = {
// Line index in the searched content where the match starts; -1 when nothing was found.
index: number;
// Score the strategy assigns to its match; 0 when nothing was found.
confidence: number;
// Label of the strategy that produced the result: 'exact', 'similarity', 'levenshtein' or 'none'.
strategy: string;
};
// TODO: this should be configurable
// Threshold a getDMPSimilarity score must reach (>=) for a search match to
// count as DMP-validated; the search strategies in this file lower their
// reported confidence when a match falls below it.
const MIN_CONFIDENCE = 0.95;
/**
 * Builds the text to search for in the target file from a hunk's changes.
 *
 * Only changes that describe the file as it currently is (context and removed
 * lines) contribute; their `content` values are joined with newlines.
 *
 * @param changes - The changes of one hunk.
 * @returns The newline-joined search text (empty when no change qualifies).
 */
export function prepareSearchString(changes: Change[]): string {
  const searchLines: string[] = [];
  for (const change of changes) {
    if (change.type === 'context' || change.type === 'remove') {
      searchLines.push(change.content);
    }
  }
  return searchLines.join('\n');
}
/**
 * Scores how alike two texts are by delegating to `compareTwoStrings`.
 *
 * @param original - First text.
 * @param modified - Second text.
 * @returns The score reported by `compareTwoStrings`.
 */
export function evaluateSimilarity(original: string, modified: string): number {
  const score = compareTwoStrings(original, modified);
  return score;
}
/**
 * Validates a pair of texts with diff-match-patch.
 *
 * Computes the diff from `original` to `modified`, turns it into patches,
 * applies those patches back onto `original`, and scores the patched text
 * against `modified` with `evaluateSimilarity`.
 *
 * NOTE(review): the patches are derived from and applied to the same
 * `original`, so the patched text will presumably equal `modified` in most
 * cases — confirm this is the intended validation.
 *
 * @param original - Text the patches are built from and applied to.
 * @param modified - Text the patched result is compared with.
 * @returns The similarity score between the patched text and `modified`.
 */
export function getDMPSimilarity(original: string, modified: string): number {
  const engine = new diff_match_patch();

  const delta = engine.diff_main(original, modified);
  engine.diff_cleanupSemantic(delta);

  const [roundTripped] = engine.patch_apply(engine.patch_make(original, delta), original);
  return evaluateSimilarity(roundTripped, modified);
}
/**
 * Exact-match search strategy.
 *
 * Looks for `searchStr` as a substring of the content (from `startIndex`
 * onward, joined with newlines). On a hit, the reported index is the line on
 * which the substring starts, and the confidence is 1.0 when the matched lines
 * pass DMP validation and 0.9 otherwise.
 *
 * @param searchStr - Text to look for (may span several lines).
 * @param content - File content, one entry per line.
 * @param startIndex - First line to consider.
 * @returns The match, or index -1 with confidence 0 when the text is absent.
 */
export function findExactMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
  const haystack = content.slice(startIndex).join('\n');
  const charOffset = haystack.indexOf(searchStr);
  if (charOffset === -1) {
    return { index: -1, confidence: 0, strategy: 'exact' };
  }

  // Number of newlines before the hit gives the line offset within the haystack.
  const firstLine = startIndex + haystack.slice(0, charOffset).split('\n').length - 1;
  const lineCount = searchStr.split('\n').length;
  const matchedContent = content.slice(firstLine, firstLine + lineCount).join('\n');

  const dmpValid = getDMPSimilarity(searchStr, matchedContent) >= MIN_CONFIDENCE;
  return {
    index: firstLine,
    confidence: dmpValid ? 1.0 : 0.9,
    strategy: 'exact'
  };
}
/**
 * String-similarity search strategy.
 *
 * Slides a window as tall as `searchStr` over the content and scores each
 * window with `compareTwoStrings`. Windows scoring below 0.8, or not above the
 * best score so far, are ignored. A candidate that fails DMP validation has
 * its score multiplied by 0.9 before being compared with the current best.
 *
 * @param searchStr - Text to look for (may span several lines).
 * @param content - File content, one entry per line.
 * @param startIndex - First line to consider.
 * @returns The best window found, or index -1 with confidence 0.
 */
export function findSimilarityMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
  const windowSize = searchStr.split('\n').length;
  const minScore = 0.8;
  let bestIndex = -1;
  let bestScore = 0;

  for (let start = startIndex; start < content.length - windowSize + 1; start++) {
    const windowStr = content.slice(start, start + windowSize).join('\n');
    const score = compareTwoStrings(searchStr, windowStr);
    if (!(score > bestScore && score >= minScore)) {
      continue;
    }

    const dmpValid = getDMPSimilarity(searchStr, windowStr) >= MIN_CONFIDENCE;
    const adjustedScore = dmpValid ? score : score * 0.9;
    if (adjustedScore > bestScore) {
      bestScore = adjustedScore;
      bestIndex = start;
    }
  }

  const confidence = bestIndex !== -1 ? bestScore : 0;
  return { index: bestIndex, confidence, strategy: 'similarity' };
}
/**
 * Levenshtein search strategy.
 *
 * Collects every window as tall as `searchStr` (from `startIndex` onward) and
 * asks `closest` for the one nearest to `searchStr`. The reported confidence
 * is fixed: 0.7 when the chosen window passes DMP validation, 0.6 otherwise.
 *
 * @param searchStr - Text to look for (may span several lines).
 * @param content - File content, one entry per line.
 * @param startIndex - First line to consider.
 * @returns The closest window, or index -1 with confidence 0 when there are no windows.
 */
export function findLevenshteinMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
  const windowSize = searchStr.split('\n').length;

  const candidates: string[] = [];
  for (let start = startIndex; start < content.length - windowSize + 1; start++) {
    candidates.push(content.slice(start, start + windowSize).join('\n'));
  }

  if (candidates.length === 0) {
    return { index: -1, confidence: 0, strategy: 'levenshtein' };
  }

  const closestMatch = closest(searchStr, candidates);
  const dmpValid = getDMPSimilarity(searchStr, closestMatch) >= MIN_CONFIDENCE;
  return {
    // Position of the first window equal to the chosen text, shifted back to file coordinates.
    index: startIndex + candidates.indexOf(closestMatch),
    confidence: dmpValid ? 0.7 : 0.6,
    strategy: 'levenshtein'
  };
}
/**
 * Runs every search strategy (exact, similarity, Levenshtein) and returns the
 * result with the highest confidence.
 *
 * The earliest strategy wins ties, and a result with index -1, confidence 0
 * and strategy 'none' is returned when no strategy reports a confidence above 0.
 *
 * @param searchStr - Text to look for (may span several lines).
 * @param content - File content, one entry per line.
 * @param startIndex - First line to consider.
 * @returns The highest-confidence search result.
 */
export function findBestMatch(searchStr: string, content: string[], startIndex: number = 0): SearchResult {
  const noMatch: SearchResult = { index: -1, confidence: 0, strategy: 'none' };

  return [findExactMatch, findSimilarityMatch, findLevenshteinMatch]
    .map(strategy => strategy(searchStr, content, startIndex))
    .reduce(
      (best, candidate) => (candidate.confidence > best.confidence ? candidate : best),
      noMatch
    );
}

View File

@@ -0,0 +1,14 @@
/**
 * One line of a hunk.
 */
export type Change = {
// Whether the line is unchanged context, an addition, or a removal.
type: 'context' | 'add' | 'remove';
// Line text; the diff parser in this change set stores it without the diff
// marker and without leading whitespace.
content: string;
// Leading whitespace of the line (may be empty).
indent: string;
// The line as it appeared in the diff with only the marker removed, when available.
originalLine?: string;
};
/**
 * A group of changes that belong to one `@@` section of a diff.
 */
export type Hunk = {
// Changes in the order they appear in the diff.
changes: Change[];
};
/**
 * A parsed diff: the list of hunks it contains.
 */
export type Diff = {
// Hunks in the order they appear in the diff.
hunks: Hunk[];
};