From b7617e5f2ae6f2cc688f8bc4c86e0cd4fadbf330 Mon Sep 17 00:00:00 2001 From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com> Date: Wed, 18 Sep 2024 22:32:38 -0400 Subject: [PATCH] use isbinaryfile to more safely read contents of folders and files --- package-lock.json | 13 +++++++++++++ package.json | 1 + src/utils/context-mentions.ts | 33 ++++++++++++++++++++++----------- src/utils/extract-text.ts | 19 +++++++------------ 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/package-lock.json b/package-lock.json index 33cf47f..94366dd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -25,6 +25,7 @@ "diff": "^5.2.0", "fast-deep-equal": "^3.1.3", "globby": "^14.0.2", + "isbinaryfile": "^5.0.2", "mammoth": "^1.8.0", "monaco-vscode-textmate-theme-converter": "^0.1.7", "openai": "^4.61.0", @@ -7878,6 +7879,18 @@ "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "license": "MIT" }, + "node_modules/isbinaryfile": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.2.tgz", + "integrity": "sha512-GvcjojwonMjWbTkfMpnVHVqXW/wKMYDfEpY94/8zy8HFMOqb/VL6oeONq9v87q4ttVlaTLnGXnJD4B5B1OTGIg==", + "license": "MIT", + "engines": { + "node": ">= 18.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/gjtorikian/" + } + }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", diff --git a/package.json b/package.json index 73246fd..9b60619 100644 --- a/package.json +++ b/package.json @@ -161,6 +161,7 @@ "diff": "^5.2.0", "fast-deep-equal": "^3.1.3", "globby": "^14.0.2", + "isbinaryfile": "^5.0.2", "mammoth": "^1.8.0", "monaco-vscode-textmate-theme-converter": "^0.1.7", "openai": "^4.61.0", diff --git a/src/utils/context-mentions.ts b/src/utils/context-mentions.ts index 64b289b..482141c 100644 --- a/src/utils/context-mentions.ts +++ b/src/utils/context-mentions.ts @@ -5,6 +5,7 @@ import { UrlScraper } from "./UrlScraper" import { mentionRegexGlobal } from "../shared/context-mentions" import fs from "fs/promises" import { extractTextFromFile } from "./extract-text" +import { isBinaryFile } from "isbinaryfile" export function openMention(mention?: string): void { if (!mention) { @@ -92,12 +93,16 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise const stats = await fs.stat(absPath) if (stats.isFile()) { + const isBinary = await isBinaryFile(absPath).catch(() => false) + if (isBinary) { + return "(Binary file)" + } const content = await extractTextFromFile(absPath) return content } else if (stats.isDirectory()) { const entries = await fs.readdir(absPath, { withFileTypes: true }) let directoryContent = "" - const fileContentPromises: Promise[] = [] + const fileContentPromises: Promise[] = [] entries.forEach((entry) => { if (entry.isFile()) { directoryContent += `- File: ${entry.name}\n` @@ -105,12 +110,18 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise const absoluteFilePath = path.resolve(absPath, entry.name) // const relativeFilePath = path.relative(cwd, absoluteFilePath); fileContentPromises.push( - extractTextFromFile(absoluteFilePath) - .then((content) => `\n${content}\n`) - .catch( - (error) => - `\nError fetching content: ${error.message}\n` - ) + (async () => { + try { + const isBinary = await isBinaryFile(absoluteFilePath).catch(() => false) + if (isBinary) { + return undefined + } + const content = await extractTextFromFile(absoluteFilePath) + return `\n${content}\n` + } catch (error) { + return undefined + } + })() ) } else if (entry.isDirectory()) { directoryContent += `- Directory: ${entry.name}/\n` @@ -119,10 +130,10 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise directoryContent += `- Other: ${entry.name}\n` } }) - const fileContents = await Promise.all(fileContentPromises) - return `${directoryContent}\n${fileContents.join("\n")}` + const fileContents = (await Promise.all(fileContentPromises)).filter((content) => content) + return `${directoryContent}\n${fileContents.join("\n")}`.trim() } else { - return "Unsupported file type." + return `(Failed to read contents of ${mentionPath})` } } catch (error) { throw new Error(`Failed to access path "${mentionPath}": ${error.message}`) @@ -149,7 +160,7 @@ async function getWorkspaceDiagnostics(cwd: string): Promise { } if (!diagnosticsDetails) { - return "No problems detected." + return "No errors or warnings detected." } return diagnosticsDetails.trim() diff --git a/src/utils/extract-text.ts b/src/utils/extract-text.ts index 414f58f..67a580a 100644 --- a/src/utils/extract-text.ts +++ b/src/utils/extract-text.ts @@ -3,6 +3,7 @@ import * as path from "path" import pdf from "pdf-parse/lib/pdf-parse" import mammoth from "mammoth" import fs from "fs/promises" +import { isBinaryFile } from "isbinaryfile" export async function extractTextFromFile(filePath: string): Promise { try { @@ -18,19 +19,13 @@ export async function extractTextFromFile(filePath: string): Promise { return extractTextFromDOCX(filePath) case ".ipynb": return extractTextFromIPYNB(filePath) - case ".jpg": - case ".jpeg": - case ".png": - case ".gif": - case ".webp": - case ".mp4": - case ".mp3": - case ".wav": - case ".avi": - case ".mov": - return "Cannot read media file." default: - return await fs.readFile(filePath, "utf8") + const isBinary = await isBinaryFile(filePath).catch(() => false) + if (!isBinary) { + return await fs.readFile(filePath, "utf8") + } else { + throw new Error(`Cannot read text for file type: ${fileExtension}`) + } } }