Files
Roo-Code/src/integrations/misc/extract-text.ts
2024-12-17 09:28:37 -05:00

90 lines
3.1 KiB
TypeScript

import * as path from "path"
// @ts-ignore-next-line
import pdf from "pdf-parse/lib/pdf-parse"
import mammoth from "mammoth"
import fs from "fs/promises"
import { isBinaryFile } from "isbinaryfile"
export async function extractTextFromFile(filePath: string): Promise<string> {
try {
await fs.access(filePath)
} catch (error) {
throw new Error(`File not found: ${filePath}`)
}
const fileExtension = path.extname(filePath).toLowerCase()
switch (fileExtension) {
case ".pdf":
return extractTextFromPDF(filePath)
case ".docx":
return extractTextFromDOCX(filePath)
case ".ipynb":
return extractTextFromIPYNB(filePath)
default:
const isBinary = await isBinaryFile(filePath).catch(() => false)
if (!isBinary) {
return addLineNumbers(await fs.readFile(filePath, "utf8"))
} else {
throw new Error(`Cannot read text for file type: ${fileExtension}`)
}
}
}
async function extractTextFromPDF(filePath: string): Promise<string> {
const dataBuffer = await fs.readFile(filePath)
const data = await pdf(dataBuffer)
return addLineNumbers(data.text)
}
async function extractTextFromDOCX(filePath: string): Promise<string> {
const result = await mammoth.extractRawText({ path: filePath })
return addLineNumbers(result.value)
}
async function extractTextFromIPYNB(filePath: string): Promise<string> {
const data = await fs.readFile(filePath, "utf8")
const notebook = JSON.parse(data)
let extractedText = ""
for (const cell of notebook.cells) {
if ((cell.cell_type === "markdown" || cell.cell_type === "code") && cell.source) {
extractedText += cell.source.join("\n") + "\n"
}
}
return addLineNumbers(extractedText)
}
export function addLineNumbers(content: string, startLine: number = 1): string {
const lines = content.split('\n')
const maxLineNumberWidth = String(startLine + lines.length - 1).length
return lines
.map((line, index) => {
const lineNumber = String(startLine + index).padStart(maxLineNumberWidth, ' ')
return `${lineNumber} | ${line}`
}).join('\n')
}
// Checks if every line in the content has line numbers prefixed (e.g., "1 | content" or "123 | content")
// Line numbers must be followed by a single pipe character (not double pipes)
export function everyLineHasLineNumbers(content: string): boolean {
const lines = content.split(/\r?\n/)
return lines.length > 0 && lines.every(line => /^\s*\d+\s+\|(?!\|)/.test(line))
}
// Strips line numbers from content while preserving the actual content
// Handles formats like "1 | content", " 12 | content", "123 | content"
// Preserves content that naturally starts with pipe characters
export function stripLineNumbers(content: string): string {
// Split into lines to handle each line individually
const lines = content.split(/\r?\n/)
// Process each line
const processedLines = lines.map(line => {
// Match line number pattern and capture everything after the pipe
const match = line.match(/^\s*\d+\s+\|(?!\|)\s?(.*)$/)
return match ? match[1] : line
})
// Join back with original line endings
const lineEnding = content.includes('\r\n') ? '\r\n' : '\n'
return processedLines.join(lineEnding)
}