mirror of
https://github.com/pacnpal/Roo-Code.git
synced 2025-12-22 21:31:08 -05:00
56 lines
1.6 KiB
TypeScript
56 lines
1.6 KiB
TypeScript
import * as path from "path"
|
|
// @ts-ignore-next-line
|
|
import pdf from "pdf-parse/lib/pdf-parse"
|
|
import mammoth from "mammoth"
|
|
import fs from "fs/promises"
|
|
import { isBinaryFile } from "isbinaryfile"
|
|
|
|
/**
 * Returns the textual content of the file at `filePath`.
 *
 * The lower-cased file extension selects the extractor: `.pdf`, `.docx` and
 * `.ipynb` are delegated to their dedicated helpers; any other file is read
 * as UTF-8 unless `isBinaryFile` reports it as binary.
 *
 * @param filePath - Path of the file to read.
 * @returns The extracted text.
 * @throws Error `File not found: <path>` when `fs.access` rejects for the path.
 * @throws Error `Cannot read text for file type: <ext>` when the file has no
 *   dedicated extractor and is detected as binary.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
	// Probe the path up front; any access failure is reported as "File not found".
	try {
		await fs.access(filePath)
	} catch {
		throw new Error(`File not found: ${filePath}`)
	}

	const fileExtension = path.extname(filePath).toLowerCase()

	if (fileExtension === ".pdf") {
		return extractTextFromPDF(filePath)
	}
	if (fileExtension === ".docx") {
		return extractTextFromDOCX(filePath)
	}
	if (fileExtension === ".ipynb") {
		return extractTextFromIPYNB(filePath)
	}

	// If the binary probe itself fails, fall back to treating the file as text.
	const looksBinary = await isBinaryFile(filePath).catch(() => false)
	if (looksBinary) {
		throw new Error(`Cannot read text for file type: ${fileExtension}`)
	}
	return await fs.readFile(filePath, "utf8")
}
|
|
|
|
/**
 * Reads the file at `filePath` into a buffer, passes it to `pdf`, and returns
 * the `text` property of the parsed result.
 *
 * @param filePath - Path of the PDF file.
 * @returns The `text` produced by the PDF parser.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
	const fileContents = await fs.readFile(filePath)
	const { text } = await pdf(fileContents)
	return text
}
|
|
|
|
/**
 * Asks `mammoth.extractRawText` to process the file at `filePath` and returns
 * the `value` property of its result.
 *
 * @param filePath - Path of the DOCX file.
 * @returns The raw text reported by mammoth.
 */
async function extractTextFromDOCX(filePath: string): Promise<string> {
	const { value: rawText } = await mammoth.extractRawText({ path: filePath })
	return rawText
}
|
|
|
|
/**
 * Extracts the source text of all markdown and code cells from a Jupyter
 * notebook file.
 *
 * The file is read as UTF-8 and parsed as JSON. For every entry of the
 * top-level `cells` array whose `cell_type` is `"markdown"` or `"code"`, the
 * cell's `source` is appended to the result followed by a newline:
 * - a list-form `source` has its entries joined with `"\n"`;
 * - a non-empty string-form `source` is appended as-is (the notebook format
 *   permits `source` to be either a string or a list of strings).
 *
 * Cells of other types, malformed cell entries, and cells without a usable
 * `source` are skipped. A document without a `cells` array yields `""`.
 *
 * @param filePath - Path of the `.ipynb` file.
 * @returns The concatenated cell sources.
 * @throws SyntaxError if the file content is not valid JSON (from `JSON.parse`).
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
	const raw = await fs.readFile(filePath, "utf8")
	const notebook: unknown = JSON.parse(raw)

	// Narrow the untyped JSON before touching it; anything without a cell list has no text.
	const cells: unknown =
		typeof notebook === "object" && notebook !== null ? (notebook as { cells?: unknown }).cells : undefined
	if (!Array.isArray(cells)) {
		return ""
	}

	let extractedText = ""

	for (const cell of cells) {
		if (typeof cell !== "object" || cell === null) {
			continue
		}

		const { cell_type: cellType, source } = cell as { cell_type?: unknown; source?: unknown }
		if (cellType !== "markdown" && cellType !== "code") {
			continue
		}

		if (Array.isArray(source)) {
			extractedText += source.join("\n") + "\n"
		} else if (typeof source === "string" && source !== "") {
			// String-form source: calling `.join` on it would throw, so append it directly.
			extractedText += source + "\n"
		}
	}

	return extractedText
}
|