Refactor glob

This commit is contained in:
Saoud Rizwan
2024-09-24 11:21:08 -04:00
parent 24bad56785
commit 888b3b7022
4 changed files with 101 additions and 98 deletions

View File

@@ -11,7 +11,8 @@ import { serializeError } from "serialize-error"
import * as vscode from "vscode"
import { ApiHandler, buildApiHandler } from "../api"
import { TerminalManager } from "../integrations/TerminalManager"
import { listFiles, parseSourceCodeForDefinitionsTopLevel } from "../services/tree-sitter"
import { parseSourceCodeForDefinitionsTopLevel } from "../services/tree-sitter"
import { listFiles } from "../services/glob/list-files"
import { ClaudeDevProvider } from "./webviews/ClaudeDevProvider"
import { ApiConfiguration } from "../shared/api"
import { ClaudeRequestResult } from "../shared/ClaudeRequestResult"

View File

@@ -1,6 +1,6 @@
import * as vscode from "vscode"
import * as path from "path"
import { listFiles } from "../services/tree-sitter/index"
import { listFiles } from "../services/glob/list-files"
import { ClaudeDevProvider } from "../core/webviews/ClaudeDevProvider"
// Filesystem path of the first workspace folder; undefined when no workspace folders are available.
const cwd = vscode.workspace.workspaceFolders?.map((folder) => folder.uri.fsPath).at(0)

View File

@@ -0,0 +1,97 @@
import { globby, Options } from "globby"
import os from "os"
import * as path from "path"
import { arePathsEqual } from "../../utils/path-helpers"
/**
 * Lists the entries found in a directory, optionally walking into nested directories.
 *
 * @param dirPath - Directory to list. It is used as the glob working directory and is
 *   resolved to an absolute path for the root/home-directory checks.
 * @param recursive - When true, nested directories are traversed level by level, gitignored
 *   files are skipped and a default set of dependency/build directories is ignored. When
 *   false, only the direct children of `dirPath` are listed.
 * @param limit - Maximum number of entries to return.
 * @returns A tuple `[entries, reachedLimit]`: `entries` are absolute paths (directories are
 *   marked with a trailing `/`), and `reachedLimit` is true when `entries.length >= limit`.
 */
export async function listFiles(dirPath: string, recursive: boolean, limit: number): Promise<[string[], boolean]> {
	const resolvedDir = path.resolve(dirPath)

	// Listing the filesystem root or the home directory is refused: Claude tends to ask for
	// these when the user's prompt is vague. Only the directory itself is reported back.
	const fsRoot = process.platform === "win32" ? path.parse(resolvedDir).root : "/"
	if (arePathsEqual(resolvedDir, fsRoot)) {
		return [[fsRoot], false]
	}
	const userHome = os.homedir()
	if (arePathsEqual(resolvedDir, userHome)) {
		return [[userHome], false]
	}

	const ignoredDirNames = [
		"node_modules",
		"__pycache__",
		"env",
		"venv",
		"target/dependency",
		"build/dependencies",
		"dist",
		"out",
		"bundle",
		"vendor",
		"tmp",
		"temp",
		"deps",
		"pkg",
		"Pods",
		// '!**/.*' would exclude hidden directories themselves, whereas '!**/.*/**' excludes only
		// what is inside them, so the existence of hidden directories is still reported.
		".*",
	]
	const ignoreGlobs = ignoredDirNames.map((name) => "**/" + name + "/**")

	const globOptions = {
		cwd: dirPath,
		// include hidden files/directories in the results
		dot: true,
		absolute: true,
		// directories get a trailing / (also on windows, so path.sep must not be used to detect them)
		markDirectories: true,
		// gitignored files are only filtered out for recursive listings
		gitignore: recursive,
		// sensible defaults in case the project has no gitignore
		ignore: recursive ? ignoreGlobs : undefined,
		// onlyFiles defaults to true; false makes directories show up as entries of their own
		onlyFiles: false,
	}

	// "*" matches the entries of a single directory; the recursive case walks deeper level by level.
	let entries: string[]
	if (recursive) {
		entries = await globbyLevelByLevel(limit, globOptions)
	} else {
		const topLevelEntries = await globby("*", globOptions)
		entries = topLevelEntries.slice(0, limit)
	}

	const reachedLimit = entries.length >= limit
	return [entries, reachedLimit]
}
/*
Breadth-first traversal of directory structure level by level up to a limit:
- Queue-based approach ensures proper breadth-first traversal
- Processes directory patterns level by level
- Captures a representative sample of the directory structure up to the limit
- Minimizes risk of missing deeply nested files

Parameters:
- limit: maximum number of entries to collect
- options: globby options forwarded unchanged to every globby call

Returns the collected entries (at most `limit`). If the traversal times out or a globby
call fails, whatever was collected so far is returned instead of throwing.

- Notes:
- Relies on globby to mark directories with /
- Directory paths are re-queued as glob patterns, so glob metacharacters in directory names
  (e.g. "(auth)" or "[id]") are backslash-escaped first; otherwise they would be interpreted
  as pattern syntax and the directory's contents would be skipped
- Potential for loops if symbolic links reference back to parent (we could use followSymlinks: false but that may not be ideal for some projects and it's pointless if they're not using symlinks wrong)
- Timeout mechanism prevents infinite loops; the timer is cleared once the traversal settles
*/
async function globbyLevelByLevel(limit: number, options?: Options): Promise<string[]> {
	const results: Set<string> = new Set()
	const queue: string[] = ["*"]
	let timedOut = false
	let timer: ReturnType<typeof setTimeout> | undefined

	// Backslash-escape characters that glob syntax treats specially so that a path taken from
	// disk is matched literally. Covers ( ) [ ] { } * ? | and a ! + @ that directly precedes "(".
	const escapeGlobCharacters = (filePath: string): string => filePath.replace(/[()[\]{}*?|]|[!+@](?=\()/g, "\\$&")

	const globbingProcess = async () => {
		while (queue.length > 0 && results.size < limit && !timedOut) {
			const pattern = queue.shift()
			if (pattern === undefined) {
				break
			}
			const filesAtLevel = await globby(pattern, options)
			if (timedOut) {
				// The caller has already been handed partial results; stop doing work.
				break
			}
			for (const file of filesAtLevel) {
				if (results.size >= limit) {
					break
				}
				results.add(file)
				if (file.endsWith("/")) {
					queue.push(`${escapeGlobCharacters(file)}*`)
				}
			}
		}
		return Array.from(results).slice(0, limit)
	}

	// Timeout after 10 seconds and return partial results
	const timeoutPromise = new Promise<string[]>((_, reject) => {
		timer = setTimeout(() => {
			timedOut = true
			reject(new Error("Globbing timeout"))
		}, 10_000)
	})

	try {
		return await Promise.race([globbingProcess(), timeoutPromise])
	} catch (error) {
		if (timedOut) {
			console.warn("Globbing timed out, returning partial results")
		} else {
			// Keep the best-effort behavior, but do not mislabel a real failure as a timeout.
			console.warn("Globbing failed, returning partial results", error)
		}
		return Array.from(results)
	} finally {
		// Do not leave the 10 second timer pending once the race has settled.
		clearTimeout(timer)
	}
}

View File

@@ -1,9 +1,7 @@
import * as fs from "fs/promises"
import { globby, Options } from "globby"
import os from "os"
import * as path from "path"
import { listFiles } from "../glob/list-files"
import { LanguageParser, loadRequiredLanguageParsers } from "./languageParser"
import { arePathsEqual } from "../../utils/path-helpers"
// TODO: implement caching behavior to avoid having to keep analyzing project for new tasks.
export async function parseSourceCodeForDefinitionsTopLevel(dirPath: string): Promise<string> {
@@ -54,99 +52,6 @@ export async function parseSourceCodeForDefinitionsTopLevel(dirPath: string): Pr
return result ? result : "No source code definitions found."
}
/**
 * Lists the entries found in a directory, optionally walking into nested directories.
 *
 * @param dirPath - Directory to list. It is used as the glob working directory and is
 *   resolved to an absolute path for the root/home-directory checks.
 * @param recursive - When true, nested directories are traversed level by level, gitignored
 *   files are skipped and a default set of dependency/build directories is ignored. When
 *   false, only the direct children of `dirPath` are listed.
 * @param limit - Maximum number of entries to return.
 * @returns A tuple `[entries, reachedLimit]`: `entries` are absolute paths (directories are
 *   marked with a trailing `/`), and `reachedLimit` is true when `entries.length >= limit`.
 */
export async function listFiles(dirPath: string, recursive: boolean, limit: number): Promise<[string[], boolean]> {
	const resolvedDir = path.resolve(dirPath)

	// Listing the filesystem root or the home directory is refused: Claude tends to ask for
	// these when the user's prompt is vague. Only the directory itself is reported back.
	const fsRoot = process.platform === "win32" ? path.parse(resolvedDir).root : "/"
	if (arePathsEqual(resolvedDir, fsRoot)) {
		return [[fsRoot], false]
	}
	const userHome = os.homedir()
	if (arePathsEqual(resolvedDir, userHome)) {
		return [[userHome], false]
	}

	const ignoredDirNames = [
		"node_modules",
		"__pycache__",
		"env",
		"venv",
		"target/dependency",
		"build/dependencies",
		"dist",
		"out",
		"bundle",
		"vendor",
		"tmp",
		"temp",
		"deps",
		"pkg",
		"Pods",
		// '!**/.*' would exclude hidden directories themselves, whereas '!**/.*/**' excludes only
		// what is inside them, so the existence of hidden directories is still reported.
		".*",
	]
	const ignoreGlobs = ignoredDirNames.map((name) => "**/" + name + "/**")

	const globOptions = {
		cwd: dirPath,
		// include hidden files/directories in the results
		dot: true,
		absolute: true,
		// directories get a trailing / (also on windows, so path.sep must not be used to detect them)
		markDirectories: true,
		// gitignored files are only filtered out for recursive listings
		gitignore: recursive,
		// sensible defaults in case the project has no gitignore
		ignore: recursive ? ignoreGlobs : undefined,
		// onlyFiles defaults to true; false makes directories show up as entries of their own
		onlyFiles: false,
	}

	// "*" matches the entries of a single directory; the recursive case walks deeper level by level.
	let entries: string[]
	if (recursive) {
		entries = await globbyLevelByLevel(limit, globOptions)
	} else {
		const topLevelEntries = await globby("*", globOptions)
		entries = topLevelEntries.slice(0, limit)
	}

	const reachedLimit = entries.length >= limit
	return [entries, reachedLimit]
}
/*
Breadth-first traversal of directory structure level by level up to a limit:
- Queue-based approach ensures proper breadth-first traversal
- Processes directory patterns level by level
- Captures a representative sample of the directory structure up to the limit
- Minimizes risk of missing deeply nested files

Parameters:
- limit: maximum number of entries to collect
- options: globby options forwarded unchanged to every globby call

Returns the collected entries (at most `limit`). If the traversal times out or a globby
call fails, whatever was collected so far is returned instead of throwing.

- Notes:
- Relies on globby to mark directories with /
- Directory paths are re-queued as glob patterns, so glob metacharacters in directory names
  (e.g. "(auth)" or "[id]") are backslash-escaped first; otherwise they would be interpreted
  as pattern syntax and the directory's contents would be skipped
- Potential for loops if symbolic links reference back to parent (we could use followSymlinks: false but that may not be ideal for some projects and it's pointless if they're not using symlinks wrong)
- Timeout mechanism prevents infinite loops; the timer is cleared once the traversal settles
*/
async function globbyLevelByLevel(limit: number, options?: Options): Promise<string[]> {
	const results: Set<string> = new Set()
	const queue: string[] = ["*"]
	let timedOut = false
	let timer: ReturnType<typeof setTimeout> | undefined

	// Backslash-escape characters that glob syntax treats specially so that a path taken from
	// disk is matched literally. Covers ( ) [ ] { } * ? | and a ! + @ that directly precedes "(".
	const escapeGlobCharacters = (filePath: string): string => filePath.replace(/[()[\]{}*?|]|[!+@](?=\()/g, "\\$&")

	const globbingProcess = async () => {
		while (queue.length > 0 && results.size < limit && !timedOut) {
			const pattern = queue.shift()
			if (pattern === undefined) {
				break
			}
			const filesAtLevel = await globby(pattern, options)
			if (timedOut) {
				// The caller has already been handed partial results; stop doing work.
				break
			}
			for (const file of filesAtLevel) {
				if (results.size >= limit) {
					break
				}
				results.add(file)
				if (file.endsWith("/")) {
					queue.push(`${escapeGlobCharacters(file)}*`)
				}
			}
		}
		return Array.from(results).slice(0, limit)
	}

	// Timeout after 10 seconds and return partial results
	const timeoutPromise = new Promise<string[]>((_, reject) => {
		timer = setTimeout(() => {
			timedOut = true
			reject(new Error("Globbing timeout"))
		}, 10_000)
	})

	try {
		return await Promise.race([globbingProcess(), timeoutPromise])
	} catch (error) {
		if (timedOut) {
			console.warn("Globbing timed out, returning partial results")
		} else {
			// Keep the best-effort behavior, but do not mislabel a real failure as a timeout.
			console.warn("Globbing failed, returning partial results", error)
		}
		return Array.from(results)
	} finally {
		// Do not leave the 10 second timer pending once the race has settled.
		clearTimeout(timer)
	}
}
function separateFiles(allFiles: string[]): { filesToParse: string[]; remainingFiles: string[] } {
const extensions = [
"js",