mirror of
https://github.com/pacnpal/Roo-Code.git
synced 2025-12-23 13:51:11 -05:00
90 lines
3.1 KiB
TypeScript
90 lines
3.1 KiB
TypeScript
import * as path from "path"
|
|
// @ts-ignore-next-line
|
|
import pdf from "pdf-parse/lib/pdf-parse"
|
|
import mammoth from "mammoth"
|
|
import fs from "fs/promises"
|
|
import { isBinaryFile } from "isbinaryfile"
|
|
|
|
/**
 * Reads the file at `filePath` and returns its text with line numbers prepended.
 *
 * Files ending in `.pdf`, `.docx` or `.ipynb` (extension compared in lower case)
 * are handed to their dedicated extractors. Any other file is read as UTF-8,
 * unless the binary probe flags it as binary.
 *
 * @param filePath - Path of the file to read.
 * @returns The extracted text, passed through `addLineNumbers`.
 * @throws Error `File not found: …` when `fs.access` rejects for the path.
 * @throws Error `Cannot read text for file type: …` when the file is detected as binary.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
	try {
		await fs.access(filePath)
	} catch (accessError) {
		// Any access failure is reported to the caller as a missing file.
		throw new Error(`File not found: ${filePath}`)
	}

	const fileExtension = path.extname(filePath).toLowerCase()

	if (fileExtension === ".pdf") {
		return extractTextFromPDF(filePath)
	}
	if (fileExtension === ".docx") {
		return extractTextFromDOCX(filePath)
	}
	if (fileExtension === ".ipynb") {
		return extractTextFromIPYNB(filePath)
	}

	// A failing binary probe counts as "not binary", so the plain-text read is still attempted.
	const looksBinary = await isBinaryFile(filePath).catch(() => false)
	if (looksBinary) {
		throw new Error(`Cannot read text for file type: ${fileExtension}`)
	}

	const plainText = await fs.readFile(filePath, "utf8")
	return addLineNumbers(plainText)
}
|
|
|
|
/**
 * Extracts the text of a PDF file and prefixes each line with its line number.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The `text` field of the `pdf-parse` result, passed through `addLineNumbers`.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
	// The file is read without an encoding, so the parser receives the raw buffer.
	const pdfBytes = await fs.readFile(filePath)
	const { text } = await pdf(pdfBytes)
	return addLineNumbers(text)
}
|
|
|
|
/**
 * Extracts the raw text of a DOCX file and prefixes each line with its line number.
 *
 * @param filePath - Path of the DOCX file; passed to mammoth as `{ path }`.
 * @returns The `value` field of mammoth's raw-text result, passed through `addLineNumbers`.
 */
async function extractTextFromDOCX(filePath: string): Promise<string> {
	const { value: rawText } = await mammoth.extractRawText({ path: filePath })
	return addLineNumbers(rawText)
}
|
|
|
|
/**
 * Extracts the source of every markdown and code cell of a Jupyter notebook and
 * prefixes each resulting line with its line number.
 *
 * The file is read as UTF-8 and parsed as JSON. Cells are visited in the order
 * of the top-level `cells` array; cells of any other type, and cells whose
 * `source` is empty or missing, are skipped.
 *
 * @param filePath - Path of the `.ipynb` file to read.
 * @returns The concatenated cell sources, passed through `addLineNumbers`.
 * @throws SyntaxError when the file content is not valid JSON.
 * @throws TypeError when the parsed document has no iterable `cells` property.
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
	const rawNotebook = await fs.readFile(filePath, "utf8")
	const notebook = JSON.parse(rawNotebook)
	let extractedText = ""

	for (const cell of notebook.cells) {
		const isTextCell = cell.cell_type === "markdown" || cell.cell_type === "code"
		if (!isTextCell || !cell.source) {
			continue
		}

		let cellText: string
		if (Array.isArray(cell.source)) {
			// NOTE(review): list-form sources usually keep each line's own trailing newline,
			// so joining with "\n" may double line breaks. Kept as-is for backward
			// compatibility — confirm against the notebook format spec before changing.
			cellText = cell.source.join("\n")
		} else if (typeof cell.source === "string") {
			// `source` may also be stored as one plain string; calling `.join` on it would throw.
			cellText = cell.source
		} else {
			// Neither a list nor a string: nothing readable to extract from this cell.
			continue
		}

		extractedText += cellText + "\n"
	}

	return addLineNumbers(extractedText)
}
|
|
|
|
/**
 * Prefixes every line of `content` with its line number followed by ` | `.
 *
 * Lines are split on `\n` only. Numbers are left-padded with spaces to the width
 * of the largest number in the output, so all prefixes line up.
 *
 * @param content - Text to number; an empty string still yields one numbered line.
 * @param startLine - Number given to the first line (defaults to 1).
 * @returns The numbered lines joined with `\n`.
 */
export function addLineNumbers(content: string, startLine: number = 1): string {
	const rows = content.split('\n')
	// The last line carries the largest number and therefore fixes the column width.
	const lastNumber = startLine + rows.length - 1
	const columnWidth = String(lastNumber).length

	const numberedRows: string[] = []
	for (const [offset, row] of rows.entries()) {
		const label = String(startLine + offset).padStart(columnWidth, ' ')
		numberedRows.push(`${label} | ${row}`)
	}
	return numberedRows.join('\n')
}
|
|
/**
 * Tells whether every line of `content` starts with a line-number prefix such as
 * `1 | content` or `123 | content`.
 *
 * A prefix is optional whitespace, digits, at least one whitespace character and
 * a single pipe; a double pipe (`||`) does not count. Lines are split on `\n` or
 * `\r\n`.
 *
 * @param content - Text to inspect.
 * @returns `true` only when all lines carry a prefix; an empty string gives `false`.
 */
export function everyLineHasLineNumbers(content: string): boolean {
	const numberedLinePattern = /^\s*\d+\s+\|(?!\|)/
	// Splitting always yields at least one entry, so empty content is rejected by the pattern itself.
	for (const line of content.split(/\r?\n/)) {
		if (!numberedLinePattern.test(line)) {
			return false
		}
	}
	return true
}
|
|
|
|
/**
 * Removes line-number prefixes from `content` while keeping the text itself.
 *
 * Handles prefixes such as `1 | content`, ` 12 | content` and `123 | content`;
 * at most one whitespace character after the pipe is dropped. Lines without a
 * prefix, and lines whose number is followed by a double pipe, are returned
 * unchanged, so content that naturally starts with pipes survives.
 *
 * @param content - Text whose lines may carry line-number prefixes.
 * @returns The stripped lines, joined with `\r\n` when the input contains any
 *   `\r\n`, otherwise with `\n`.
 */
export function stripLineNumbers(content: string): string {
	// The pattern spans the whole line; group 1 is everything after the prefix.
	const prefixedLinePattern = /^\s*\d+\s+\|(?!\|)\s?(.*)$/
	const separator = content.includes('\r\n') ? '\r\n' : '\n'

	return content
		.split(/\r?\n/)
		// Non-matching lines are left untouched by replace().
		.map(line => line.replace(prefixedLinePattern, "$1"))
		.join(separator)
}