import * as path from "path"
// @ts-ignore-next-line
import pdf from "pdf-parse/lib/pdf-parse"
import mammoth from "mammoth"
import fs from "fs/promises"
import { isBinaryFile } from "isbinaryfile"

/**
 * Reads a file and returns its textual content with a line-number prefix on every line.
 *
 * The extraction strategy is chosen from the file extension (case-insensitive):
 * `.pdf`, `.docx` and `.ipynb` have dedicated extractors; any other file is read
 * as UTF-8 text unless it is detected as binary.
 *
 * @param filePath - Path of the file to read.
 * @returns The extracted text, formatted by {@link addLineNumbers}.
 * @throws Error `File not found: <path>` when the path cannot be accessed.
 * @throws Error `Cannot read text for file type: <ext>` when the file is binary.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
	try {
		await fs.access(filePath)
	} catch (error) {
		throw new Error(`File not found: ${filePath}`)
	}

	const fileExtension = path.extname(filePath).toLowerCase()
	switch (fileExtension) {
		case ".pdf":
			return extractTextFromPDF(filePath)
		case ".docx":
			return extractTextFromDOCX(filePath)
		case ".ipynb":
			return extractTextFromIPYNB(filePath)
		default: {
			// Braces give the `const` below its own scope instead of leaking into the switch.
			// If the binary probe itself fails, fall back to treating the file as text.
			const isBinary = await isBinaryFile(filePath).catch(() => false)
			if (isBinary) {
				throw new Error(`Cannot read text for file type: ${fileExtension}`)
			}
			return addLineNumbers(await fs.readFile(filePath, "utf8"))
		}
	}
}

/**
 * Extracts the text of a PDF: the file is read into a buffer, handed to `pdf`,
 * and the `text` property of the result is line-numbered.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
	const dataBuffer = await fs.readFile(filePath)
	const data = await pdf(dataBuffer)
	return addLineNumbers(data.text)
}

/**
 * Extracts the raw text of a DOCX file via `mammoth.extractRawText` and
 * line-numbers the returned `value`.
 */
async function extractTextFromDOCX(filePath: string): Promise<string> {
	const result = await mammoth.extractRawText({ path: filePath })
	return addLineNumbers(result.value)
}

/**
 * Extracts the source of every markdown and code cell of a Jupyter notebook.
 *
 * Each cell's text is followed by a newline; cells of other types and cells
 * with an empty `source` are skipped.
 *
 * @throws SyntaxError when the file content is not valid JSON.
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
	const data = await fs.readFile(filePath, "utf8")
	const notebook = JSON.parse(data)

	// Tolerate a document without a `cells` array rather than failing with a TypeError.
	const cells = notebook && Array.isArray(notebook.cells) ? notebook.cells : []

	let extractedText = ""
	for (const cell of cells) {
		if (cell && (cell.cell_type === "markdown" || cell.cell_type === "code") && cell.source) {
			// `source` may be a list of strings or a single string; only a list has `.join`.
			const cellText = Array.isArray(cell.source) ? cell.source.join("\n") : String(cell.source)
			extractedText += cellText + "\n"
		}
	}
	return addLineNumbers(extractedText)
}

/**
 * Prefixes every line of `content` with its line number in the form `N | line`.
 *
 * Numbers are right-aligned with spaces to the width of the largest line number,
 * so the pipe column lines up across the whole output. Lines are split on `\n`
 * only; an empty string yields a single numbered empty line.
 *
 * @param content - Text to number.
 * @param startLine - Number assigned to the first line (defaults to 1).
 * @returns The numbered text, joined with `\n`.
 */
export function addLineNumbers(content: string, startLine: number = 1): string {
	const lines = content.split('\n')
	const maxLineNumberWidth = String(startLine + lines.length - 1).length
	return lines
		.map((line, index) => {
			const lineNumber = String(startLine + index).padStart(maxLineNumberWidth, ' ')
			return `${lineNumber} | ${line}`
		})
		.join('\n')
}
// Line-number prefix: optional indent, digits, whitespace, then a single pipe (not "||").
const LINE_NUMBER_PREFIX = /^\s*\d+\s+\|(?!\|)/
// Same prefix, plus at most one separator space, capturing the rest of the line.
const LINE_NUMBER_WITH_CONTENT = /^\s*\d+\s+\|(?!\|)\s?(.*)$/

/**
 * Checks if every line in the content has line numbers prefixed
 * (e.g., "1 | content" or "123 | content").
 *
 * Line numbers must be followed by a single pipe character (not double pipes).
 * Lines are split on LF or CRLF. An empty string is a single empty line, which
 * has no prefix, so the result is `false`.
 *
 * @param content - Text to inspect.
 * @returns `true` only when every line starts with a line-number prefix.
 */
export function everyLineHasLineNumbers(content: string): boolean {
	// split() always yields at least one element, so no separate emptiness check is needed.
	return content.split(/\r?\n/).every((line) => LINE_NUMBER_PREFIX.test(line))
}

/**
 * Strips line numbers from content while preserving the actual content.
 *
 * Handles formats like "1 | content", " 12 | content", "123 | content".
 * At most one whitespace character after the pipe is removed, so indentation and
 * content that naturally starts with a pipe character are preserved. Lines
 * without a line-number prefix are returned unchanged.
 *
 * @param content - Text whose lines may carry line-number prefixes.
 * @returns The text without prefixes. Lines are re-joined with CRLF if the input
 *   contains any CRLF, otherwise with LF.
 */
export function stripLineNumbers(content: string): string {
	const processedLines = content.split(/\r?\n/).map((line) => {
		const match = line.match(LINE_NUMBER_WITH_CONTENT)
		return match ? match[1] : line
	})

	// A single line ending is used for the whole output, chosen from the input.
	const lineEnding = content.includes('\r\n') ? '\r\n' : '\n'
	return processedLines.join(lineEnding)
}