import * as path from "path"
// @ts-ignore-next-line
import pdf from "pdf-parse/lib/pdf-parse"
import mammoth from "mammoth"
import fs from "fs/promises"
import { isBinaryFile } from "isbinaryfile"

/**
 * Reads a file and returns its text with every line prefixed by its line number.
 *
 * The extraction strategy is chosen from the lower-cased file extension:
 * `.pdf`, `.docx` and `.ipynb` go through dedicated extractors; anything else is
 * read as UTF-8 text unless it is detected as binary.
 *
 * @param filePath Path of the file to read.
 * @returns The extracted text, formatted by `addLineNumbers`.
 * @throws Error with message `File not found: <path>` when `fs.access` rejects.
 * @throws Error with message `Cannot read text for file type: <ext>` when the file
 *   has no dedicated extractor and is detected as binary.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
  try {
    await fs.access(filePath)
  } catch {
    throw new Error(`File not found: ${filePath}`)
  }

  const fileExtension = path.extname(filePath).toLowerCase()
  switch (fileExtension) {
    case ".pdf":
      return extractTextFromPDF(filePath)
    case ".docx":
      return extractTextFromDOCX(filePath)
    case ".ipynb":
      return extractTextFromIPYNB(filePath)
    default: {
      // Best effort: if the binary probe itself fails, treat the file as text.
      const isBinary = await isBinaryFile(filePath).catch(() => false)
      if (isBinary) {
        throw new Error(`Cannot read text for file type: ${fileExtension}`)
      }
      return addLineNumbers(await fs.readFile(filePath, "utf8"))
    }
  }
}

/**
 * Extracts the text of a PDF file and numbers its lines.
 *
 * @param filePath Path of the PDF file.
 * @returns The `text` field produced by the PDF parser, with line numbers added.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
  const dataBuffer = await fs.readFile(filePath)
  const data = await pdf(dataBuffer)
  return addLineNumbers(data.text)
}

/**
 * Extracts the raw text of a DOCX file and numbers its lines.
 *
 * @param filePath Path of the DOCX file.
 * @returns The `value` field of mammoth's raw-text result, with line numbers added.
 */
async function extractTextFromDOCX(filePath: string): Promise<string> {
  const result = await mammoth.extractRawText({ path: filePath })
  return addLineNumbers(result.value)
}

/**
 * Extracts the source of all markdown and code cells of a Jupyter notebook and
 * numbers the resulting lines. Cells of other types, and cells without a
 * `source`, are skipped.
 *
 * @param filePath Path of the `.ipynb` file.
 * @returns The concatenated cell sources, with line numbers added.
 * @throws SyntaxError when the file is not valid JSON.
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
  const data = await fs.readFile(filePath, "utf8")
  const notebook = JSON.parse(data)
  let extractedText = ""

  for (const cell of notebook.cells) {
    if ((cell.cell_type === "markdown" || cell.cell_type === "code") && cell.source) {
      // `source` may be a list of strings or a single string; calling `.join` on a
      // plain string would throw, so only join when it really is an array.
      // NOTE(review): list entries are joined with "\n" as before — confirm whether
      // entries already carry their own trailing newline in the notebooks we read.
      const source = Array.isArray(cell.source) ? cell.source.join("\n") : String(cell.source)
      extractedText += source + "\n"
    }
  }

  return addLineNumbers(extractedText)
}

/**
 * Prefixes every line of `content` with its line number, in the form `"<n> | <line>"`.
 * Line numbers are left-padded with spaces so that all prefixes share the width
 * of the largest line number.
 *
 * @param content Text to number; lines are separated by "\n".
 * @param startLine Number assigned to the first line (defaults to 1).
 * @returns The numbered text, joined with "\n".
 */
export function addLineNumbers(content: string, startLine: number = 1): string {
  const lines = content.split("\n")
  const maxLineNumberWidth = String(startLine + lines.length - 1).length
  return lines
    .map((line, index) => {
      const lineNumber = String(startLine + index).padStart(maxLineNumberWidth, " ")
      return `${lineNumber} | ${line}`
    })
    .join("\n")
}
/**
 * Checks whether every line in the content is prefixed with a line number
 * (e.g., "1 | content" or "123 | content").
 *
 * The line number must be followed by a single pipe character; a double pipe
 * ("||") does not count as a line-number prefix. Both LF and CRLF line endings
 * are recognised.
 *
 * @param content Text to inspect.
 * @returns `true` only when every line, including empty ones, carries a prefix.
 */
export function everyLineHasLineNumbers(content: string): boolean {
  const numberedLine = /^\s*\d+\s+\|(?!\|)/
  // split() always yields at least one entry, so "" is checked as one empty line (=> false).
  return content.split(/\r?\n/).every((line) => numberedLine.test(line))
}

/**
 * Strips line-number prefixes from content while preserving the actual content.
 *
 * Handles formats like "1 | content", " 12 | content", "123 | content". At most
 * one whitespace character after the pipe is removed. Lines without a prefix,
 * and content that naturally starts with pipe characters, are left unchanged.
 * Each line keeps its own original line ending (LF or CRLF), even when the
 * content mixes both.
 *
 * @param content Text whose line-number prefixes should be removed.
 * @returns The content without line-number prefixes.
 */
export function stripLineNumbers(content: string): string {
  const numberedLine = /^\s*\d+\s+\|(?!\|)\s?(.*)$/

  // Splitting on a capturing group keeps the separators in the result:
  // [line, ending, line, ending, ..., line]. Odd indices are line endings and
  // are passed through untouched so every line keeps its original terminator.
  return content
    .split(/(\r?\n)/)
    .map((part, index) => {
      if (index % 2 === 1) {
        return part
      }
      const match = numberedLine.exec(part)
      return match?.[1] ?? part
    })
    .join("")
}

/**
 * Truncates multi-line output while preserving context from both the beginning and end.
 * When truncation is needed, it keeps 20% of the lines from the start and 80% from the end,
 * with a clear indicator of how many lines were omitted in between.
 *
 * @param content The multi-line string to truncate
 * @param lineLimit Optional maximum number of lines to keep. If not provided, 0, negative or
 *   NaN, returns the original content. Fractional values are rounded down.
 * @returns The truncated string with an indicator of omitted lines, or the original content
 *   if no truncation is needed
 *
 * @example
 * // With 10 line limit on 25 lines of content:
 * // - Keeps first 2 lines (20% of 10)
 * // - Keeps last 8 lines (80% of 10)
 * // - Adds "[...15 lines omitted...]" in between
 */
export function truncateOutput(content: string, lineLimit?: number): string {
  const limit = Math.floor(lineLimit ?? 0)
  // `!(limit > 0)` also covers NaN; a non-positive limit means "no limit".
  if (!(limit > 0)) {
    return content
  }

  const lines = content.split("\n")
  if (lines.length <= limit) {
    return content
  }

  const beforeLimit = Math.floor(limit * 0.2) // 20% of lines before
  const afterLimit = limit - beforeLimit // remaining 80% after; always >= 1 here
  return [
    ...lines.slice(0, beforeLimit),
    `\n[...${lines.length - limit} lines omitted...]\n`,
    ...lines.slice(-afterLimit),
  ].join("\n")
}