/**
 * Extracts the text of a Jupyter notebook (.ipynb) file.
 *
 * The file is read as UTF-8 and parsed as JSON. The `source` of every
 * "markdown" and "code" cell is appended to the result in notebook order,
 * each cell followed by a single newline. Cells of any other type are skipped.
 *
 * @param filePath Path to the notebook file.
 * @returns The concatenated source text of the markdown and code cells.
 * @throws If the file cannot be read, is not valid JSON, or has no top-level
 *         `cells` array.
 */
async function extractTextFromIPYNB(filePath: string): Promise<string> {
	const data = await fs.readFile(filePath, "utf8")
	// JSON.parse yields untrusted data; narrow it before touching any field.
	const notebook: unknown = JSON.parse(data)
	const cells = (notebook as { cells?: unknown } | null)?.cells
	if (!Array.isArray(cells)) {
		throw new Error(`Invalid notebook: no "cells" array found in ${filePath}`)
	}

	let extractedText = ""
	for (const cell of cells as unknown[]) {
		if (typeof cell !== "object" || cell === null) {
			continue
		}
		const { cell_type: cellType, source } = cell as { cell_type?: unknown; source?: unknown }
		if (cellType !== "markdown" && cellType !== "code") {
			continue
		}

		if (typeof source === "string") {
			// The notebook format allows `source` to be one plain string.
			if (source) {
				extractedText += source + "\n"
			}
		} else if (Array.isArray(source)) {
			// Array-form sources normally keep each line's own trailing "\n", so
			// only insert a separator where a line lacks one; joining with "\n"
			// unconditionally would double every line break.
			const lines = source.map(String)
			const lastIndex = lines.length - 1
			const cellText = lines
				.map((line, index) => (index < lastIndex && !line.endsWith("\n") ? line + "\n" : line))
				.join("")
			extractedText += cellText + "\n"
		}
	}

	return extractedText
}