Add support for reading PDF and DOCX files

This commit is contained in:
Saoud Rizwan
2024-08-31 02:21:23 -04:00
parent 5a05279a4d
commit 1d87bcf767
4 changed files with 195 additions and 14 deletions

View File

@@ -27,6 +27,7 @@ import { ClaudeAskResponse } from "./shared/WebviewMessage"
import { findLast, findLastIndex } from "./utils"
import { truncateHalfConversation } from "./utils/context-management"
import { regexSearchFiles } from "./utils/ripgrep"
import { extractTextFromFile } from "./utils/extract-text"
const SYSTEM_PROMPT =
() => `You are Claude Dev, a highly skilled software developer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
@@ -924,7 +925,7 @@ export class ClaudeDev {
}
try {
const absolutePath = path.resolve(cwd, relPath)
const content = await fs.readFile(absolutePath, "utf-8")
const content = await extractTextFromFile(absolutePath)
const message = JSON.stringify({
tool: "readFile",

38
src/utils/extract-text.ts Normal file
View File

@@ -0,0 +1,38 @@
import * as path from "path"
import pdf from "pdf-parse"
import mammoth from "mammoth"
import { isBinaryFile } from "isbinaryfile"
import fs from "fs/promises"
/**
 * Returns the textual content of the file at `filePath`.
 *
 * The file is first probed with `fs.access`; any failure of that probe is
 * reported as "File not found". The handler is then chosen from the
 * lower-cased file extension:
 *
 * - `.pdf`  is delegated to `extractTextFromPDF`
 * - `.docx` is delegated to `extractTextFromDOCX`
 * - anything else is read as UTF-8 text, provided `isBinaryFile` does not
 *   classify the file as binary
 *
 * @param filePath - Path of the file to read.
 * @returns The extracted text.
 * @throws Error with message `File not found: <filePath>` when the
 *   `fs.access` probe rejects.
 * @throws Error with message `Unsupported file type: <extension>` when the
 *   extension is neither `.pdf` nor `.docx` and the file is detected as binary.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
	try {
		await fs.access(filePath)
	} catch {
		// The underlying reason is intentionally not surfaced; callers only see "File not found".
		throw new Error(`File not found: ${filePath}`)
	}

	const fileExtension = path.extname(filePath).toLowerCase()
	switch (fileExtension) {
		case ".pdf":
			return extractTextFromPDF(filePath)
		case ".docx":
			return extractTextFromDOCX(filePath)
		default: {
			// Braces give this clause its own scope so nothing declared here leaks
			// into the other cases of the switch.
			if (await isBinaryFile(filePath)) {
				throw new Error(`Unsupported file type: ${fileExtension}`)
			}
			return await fs.readFile(filePath, "utf8")
		}
	}
}
/**
 * Loads the PDF at `filePath` into memory and returns the text that
 * `pdf-parse` extracts from it.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The `text` field of the `pdf-parse` result.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
	// pdf-parse is handed the raw bytes, so the file is read without an encoding.
	const rawBytes = await fs.readFile(filePath)
	const { text } = await pdf(rawBytes)
	return text
}
/**
 * Returns the raw text that `mammoth` extracts from the DOCX file at `filePath`.
 *
 * @param filePath - Path of the DOCX file; passed to mammoth as its `path` option.
 * @returns The `value` field of mammoth's `extractRawText` result.
 */
async function extractTextFromDOCX(filePath: string): Promise<string> {
	const { value } = await mammoth.extractRawText({ path: filePath })
	return value
}