Use isbinaryfile to more safely read the contents of folders and files

This commit is contained in:
Saoud Rizwan
2024-09-18 22:32:38 -04:00
parent 554da736d4
commit b7617e5f2a
4 changed files with 43 additions and 23 deletions

13
package-lock.json generated
View File

@@ -25,6 +25,7 @@
"diff": "^5.2.0",
"fast-deep-equal": "^3.1.3",
"globby": "^14.0.2",
"isbinaryfile": "^5.0.2",
"mammoth": "^1.8.0",
"monaco-vscode-textmate-theme-converter": "^0.1.7",
"openai": "^4.61.0",
@@ -7878,6 +7879,18 @@
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==",
"license": "MIT"
},
"node_modules/isbinaryfile": {
"version": "5.0.2",
"resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.2.tgz",
"integrity": "sha512-GvcjojwonMjWbTkfMpnVHVqXW/wKMYDfEpY94/8zy8HFMOqb/VL6oeONq9v87q4ttVlaTLnGXnJD4B5B1OTGIg==",
"license": "MIT",
"engines": {
"node": ">= 18.0.0"
},
"funding": {
"url": "https://github.com/sponsors/gjtorikian/"
}
},
"node_modules/isexe": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",

View File

@@ -161,6 +161,7 @@
"diff": "^5.2.0",
"fast-deep-equal": "^3.1.3",
"globby": "^14.0.2",
"isbinaryfile": "^5.0.2",
"mammoth": "^1.8.0",
"monaco-vscode-textmate-theme-converter": "^0.1.7",
"openai": "^4.61.0",

View File

@@ -5,6 +5,7 @@ import { UrlScraper } from "./UrlScraper"
import { mentionRegexGlobal } from "../shared/context-mentions"
import fs from "fs/promises"
import { extractTextFromFile } from "./extract-text"
import { isBinaryFile } from "isbinaryfile"
export function openMention(mention?: string): void {
if (!mention) {
@@ -92,12 +93,16 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
const stats = await fs.stat(absPath)
if (stats.isFile()) {
const isBinary = await isBinaryFile(absPath).catch(() => false)
if (isBinary) {
return "(Binary file)"
}
const content = await extractTextFromFile(absPath)
return content
} else if (stats.isDirectory()) {
const entries = await fs.readdir(absPath, { withFileTypes: true })
let directoryContent = ""
const fileContentPromises: Promise<string>[] = []
const fileContentPromises: Promise<string | undefined>[] = []
entries.forEach((entry) => {
if (entry.isFile()) {
directoryContent += `- File: ${entry.name}\n`
@@ -105,12 +110,18 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
const absoluteFilePath = path.resolve(absPath, entry.name)
// const relativeFilePath = path.relative(cwd, absoluteFilePath);
fileContentPromises.push(
extractTextFromFile(absoluteFilePath)
.then((content) => `<file_content path="${filePath}">\n${content}\n</file_content>`)
.catch(
(error) =>
`<file_content path="${filePath}">\nError fetching content: ${error.message}\n</file_content>`
)
(async () => {
try {
const isBinary = await isBinaryFile(absoluteFilePath).catch(() => false)
if (isBinary) {
return undefined
}
const content = await extractTextFromFile(absoluteFilePath)
return `<file_content path="${filePath}">\n${content}\n</file_content>`
} catch (error) {
return undefined
}
})()
)
} else if (entry.isDirectory()) {
directoryContent += `- Directory: ${entry.name}/\n`
@@ -119,10 +130,10 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
directoryContent += `- Other: ${entry.name}\n`
}
})
const fileContents = await Promise.all(fileContentPromises)
return `${directoryContent}\n${fileContents.join("\n")}`
const fileContents = (await Promise.all(fileContentPromises)).filter((content) => content)
return `${directoryContent}\n${fileContents.join("\n")}`.trim()
} else {
return "Unsupported file type."
return `(Failed to read contents of ${mentionPath})`
}
} catch (error) {
throw new Error(`Failed to access path "${mentionPath}": ${error.message}`)
@@ -149,7 +160,7 @@ async function getWorkspaceDiagnostics(cwd: string): Promise<string> {
}
if (!diagnosticsDetails) {
return "No problems detected."
return "No errors or warnings detected."
}
return diagnosticsDetails.trim()

View File

@@ -3,6 +3,7 @@ import * as path from "path"
import pdf from "pdf-parse/lib/pdf-parse"
import mammoth from "mammoth"
import fs from "fs/promises"
import { isBinaryFile } from "isbinaryfile"
export async function extractTextFromFile(filePath: string): Promise<string> {
try {
@@ -18,19 +19,13 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
return extractTextFromDOCX(filePath)
case ".ipynb":
return extractTextFromIPYNB(filePath)
case ".jpg":
case ".jpeg":
case ".png":
case ".gif":
case ".webp":
case ".mp4":
case ".mp3":
case ".wav":
case ".avi":
case ".mov":
return "Cannot read media file."
default:
return await fs.readFile(filePath, "utf8")
const isBinary = await isBinaryFile(filePath).catch(() => false)
if (!isBinary) {
return await fs.readFile(filePath, "utf8")
} else {
throw new Error(`Cannot read text for file type: ${fileExtension}`)
}
}
}