Refactor out of utils

This commit is contained in:
Saoud Rizwan
2024-09-24 11:36:37 -04:00
parent dedf8e9e48
commit 7c21a4c833
14 changed files with 18 additions and 18 deletions

View File

@@ -0,0 +1,55 @@
import * as path from "path"
// @ts-ignore-next-line
import pdf from "pdf-parse/lib/pdf-parse"
import mammoth from "mammoth"
import fs from "fs/promises"
import { isBinaryFile } from "isbinaryfile"
/**
 * Reads a file and returns its textual content, choosing an extractor from
 * the (case-insensitive) file extension.
 *
 * `.pdf`, `.docx` and `.ipynb` are delegated to their dedicated extractors.
 * Any other extension is read as UTF-8, unless `isBinaryFile` reports the
 * file as binary.
 *
 * @param filePath - Path of the file to read.
 * @returns The extracted text.
 * @throws Error `File not found: …` when `fs.access` rejects for the path.
 * @throws Error `Cannot read text for file type: …` when the file has no
 *   dedicated extractor and is detected as binary.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
	// Any access failure is reported to the caller as a missing file.
	try {
		await fs.access(filePath)
	} catch {
		throw new Error(`File not found: ${filePath}`)
	}

	const ext = path.extname(filePath).toLowerCase()
	if (ext === ".pdf") {
		return extractTextFromPDF(filePath)
	}
	if (ext === ".docx") {
		return extractTextFromDOCX(filePath)
	}
	if (ext === ".ipynb") {
		return extractTextFromIPYNB(filePath)
	}

	// No dedicated extractor: fall back to plain text. If the binary probe
	// itself fails, the file is treated as text (best effort).
	const looksBinary = await isBinaryFile(filePath).catch(() => false)
	if (looksBinary) {
		throw new Error(`Cannot read text for file type: ${ext}`)
	}
	return await fs.readFile(filePath, "utf8")
}
/**
 * Extracts the text of a PDF document.
 *
 * The file is read into a buffer, handed to `pdf`, and the `text` property
 * of the parse result is returned.
 *
 * @param filePath - Path of the PDF file.
 * @returns The `text` field of the parsed document.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
	const rawBytes = await fs.readFile(filePath)
	const { text } = await pdf(rawBytes)
	return text
}
/**
 * Extracts the raw text of a DOCX document.
 *
 * Calls `mammoth.extractRawText` with the file path and returns the `value`
 * property of its result.
 *
 * @param filePath - Path of the DOCX file.
 * @returns The `value` field of the extraction result.
 */
async function extractTextFromDOCX(filePath: string): Promise<string> {
	const { value: rawText } = await mammoth.extractRawText({ path: filePath })
	return rawText
}
/**
 * Extracts the text of every markdown and code cell of a Jupyter notebook.
 *
 * The file is parsed as JSON and its top-level `cells` array is walked in
 * order. For each cell whose `cell_type` is `"markdown"` or `"code"`, the
 * `source` is appended to the result followed by a newline:
 * - an array `source` has its entries joined with `"\n"`;
 * - a non-empty string `source` is appended as-is.
 *
 * Cells of other types, cells without a usable `source`, and entries that
 * are not objects are skipped. A document with no `cells` array yields an
 * empty string.
 *
 * @param filePath - Path of the `.ipynb` file.
 * @returns The concatenated cell sources.
 * @throws SyntaxError when the file content is not valid JSON.
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
	const data = await fs.readFile(filePath, "utf8")
	const notebook: unknown = JSON.parse(data)

	// JSON.parse can return any JSON value; only an object with a `cells`
	// array has anything to extract.
	const cells: unknown =
		notebook !== null && typeof notebook === "object" ? (notebook as { cells?: unknown }).cells : undefined
	if (!Array.isArray(cells)) {
		return ""
	}

	let extractedText = ""
	for (const cell of cells as unknown[]) {
		if (cell === null || typeof cell !== "object") {
			continue
		}
		const { cell_type: cellType, source } = cell as { cell_type?: unknown; source?: unknown }
		if (cellType !== "markdown" && cellType !== "code") {
			continue
		}
		if (Array.isArray(source)) {
			extractedText += source.join("\n") + "\n"
		} else if (typeof source === "string" && source !== "") {
			// `source` may be a single string rather than a list of lines;
			// calling `.join` on it would throw.
			extractedText += source + "\n"
		}
	}
	return extractedText
}