/**
 * Extracts the text of a Jupyter notebook (`.ipynb`) file.
 *
 * The file is read as UTF-8 JSON. The `source` of every `markdown` and `code`
 * cell is appended to the result, each followed by a single newline. Other
 * cell types and cell outputs are ignored.
 *
 * A cell's `source` may be either one string or a list of line strings. List
 * entries may or may not carry their own trailing `"\n"`; both forms produce
 * the same output, with exactly one line break between lines.
 *
 * @param filePath - Path of the notebook file to read.
 * @returns The concatenated cell sources, or an empty string when the parsed
 *   JSON has no `cells` array.
 * @throws SyntaxError when the file content is not valid JSON; any error
 *   raised by `fs.readFile` is propagated unchanged.
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
  const data = await fs.readFile(filePath, "utf8")
  const notebook: unknown = JSON.parse(data)

  // Narrow the untyped JSON before touching it: anything without a `cells`
  // array (e.g. a notebook that stores cells elsewhere) yields no text
  // instead of throwing while iterating `undefined`.
  const cells =
    typeof notebook === "object" && notebook !== null
      ? (notebook as { cells?: unknown }).cells
      : undefined
  if (!Array.isArray(cells)) {
    return ""
  }

  // Drops one trailing "\n" so that re-joining with "\n" never doubles breaks.
  const stripTrailingNewline = (line: string): string =>
    line.endsWith("\n") ? line.slice(0, -1) : line

  const parts: string[] = []
  for (const cell of cells) {
    if (typeof cell !== "object" || cell === null) {
      continue
    }
    const { cell_type: cellType, source } = cell as {
      cell_type?: unknown
      source?: unknown
    }
    if (cellType !== "markdown" && cellType !== "code") {
      continue
    }

    if (typeof source === "string") {
      // Single-string form; an empty string carries no text and is skipped.
      if (source !== "") {
        parts.push(stripTrailingNewline(source) + "\n")
      }
    } else if (Array.isArray(source)) {
      const lines = source
        .filter((line): line is string => typeof line === "string")
        .map(stripTrailingNewline)
      parts.push(lines.join("\n") + "\n")
    }
  }

  return parts.join("")
}