Refactor

2025-12-20 20:31:37 -05:00 · 2024-09-19 16:49:31 -04:00
parent cfc2ee830e
commit 5fbb335bb6
5 changed files with 14 additions and 17 deletions
--- a/src/utils/UrlContentFetcher.ts
+++ b/src/utils/UrlContentFetcher.ts
@@ -0,0 +1,92 @@
+import * as vscode from "vscode"
+import * as fs from "fs/promises"
+import * as path from "path"
+import { Browser, Page, launch } from "puppeteer-core"
+import * as cheerio from "cheerio"
+import TurndownService from "turndown"
+// @ts-ignore
+import PCR from "puppeteer-chromium-resolver"
+
+/**
+ * The subset of the value resolved by `puppeteer-chromium-resolver` that this
+ * file reads. The package is imported under `@ts-ignore`, so this shape is
+ * declared locally rather than taken from the package.
+ */
+interface PCRStats {
+	/** Path to the Chromium binary; passed to `launch` as `executablePath`. */
+	executablePath: string
+	/** Puppeteer handle whose `launch` is used to start the browser. */
+	puppeteer: { launch: typeof launch }
+}
+
+/**
+ * Loads a URL in a Chromium instance driven by puppeteer and converts the
+ * rendered page to markdown.
+ *
+ * Usage: call `launchBrowser()` once, then `urlToMarkdown()` any number of
+ * times, then `closeBrowser()` to release the browser.
+ */
+export class UrlContentFetcher {
+	private readonly context: vscode.ExtensionContext
+	private browser?: Browser
+	private page?: Page
+
+	/**
+	 * @param context Extension context; its `globalStorageUri` is where the
+	 * `puppeteer` download directory is created.
+	 */
+	constructor(context: vscode.ExtensionContext) {
+		this.context = context
+	}
+
+	/**
+	 * Makes sure the `puppeteer` directory exists under the extension's global
+	 * storage and asks puppeteer-chromium-resolver for a Chromium install there.
+	 *
+	 * @returns The resolver result (puppeteer handle and Chromium executable path).
+	 * @throws Error if the context exposes no global storage path, or if
+	 * creating the directory or resolving Chromium fails.
+	 */
+	private async ensureChromiumExists(): Promise<PCRStats> {
+		const globalStoragePath = this.context?.globalStorageUri?.fsPath
+		if (!globalStoragePath) {
+			throw new Error("Global storage uri is invalid")
+		}
+
+		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+		// With `recursive: true`, mkdir succeeds when the directory already exists,
+		// so no separate existence check (and its check-then-create race) is needed.
+		await fs.mkdir(puppeteerDir, { recursive: true })
+
+		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+		// if it does exist it will return the path to existing chromium
+		const stats: PCRStats = await PCR({
+			downloadPath: puppeteerDir,
+		})
+
+		return stats
+	}
+
+	/**
+	 * Starts Chromium and opens the page used by `urlToMarkdown`.
+	 * Does nothing if a browser is already held by this instance.
+	 *
+	 * @throws Whatever Chromium resolution, launch, or page creation throws. If
+	 * page creation fails, the freshly launched browser is closed and no state
+	 * is kept, so a later call can retry.
+	 */
+	async launchBrowser(): Promise<void> {
+		if (this.browser) {
+			return
+		}
+		const stats = await this.ensureChromiumExists()
+		const browser = await stats.puppeteer.launch({
+			args: [
+				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+			],
+			executablePath: stats.executablePath,
+		})
+		// (latest version of puppeteer does not add headless to user agent)
+		try {
+			this.page = await browser.newPage()
+			// Only publish the browser once the page exists, so the instance is never
+			// left with a browser but no page.
+			this.browser = browser
+		} catch (error) {
+			// Best-effort cleanup; the page-creation error is the one worth reporting.
+			await browser.close().catch(() => undefined)
+			throw error
+		}
+	}
+
+	/**
+	 * Closes the browser, if any, and forgets the browser and page.
+	 *
+	 * State is cleared before closing so that a failing `close()` cannot leave
+	 * this instance pointing at a dead browser.
+	 *
+	 * @throws Whatever the browser's `close()` throws.
+	 */
+	async closeBrowser(): Promise<void> {
+		const browser = this.browser
+		this.browser = undefined
+		this.page = undefined
+		await browser?.close()
+	}
+
+	/**
+	 * Navigates the shared page to `url` and returns its content as markdown.
+	 * Must be called after `launchBrowser()` and before `closeBrowser()`.
+	 *
+	 * @param url Address to load.
+	 * @returns Markdown produced from the page HTML after `script`, `style`,
+	 * `nav`, `footer` and `header` elements have been removed.
+	 * @throws Error("Browser not initialized") if the browser or page is missing;
+	 * also propagates navigation errors (navigation uses a 10 second timeout).
+	 */
+	async urlToMarkdown(url: string): Promise<string> {
+		if (!this.browser || !this.page) {
+			throw new Error("Browser not initialized")
+		}
+		/*
+		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
+		- domcontentloaded is when the basic DOM is loaded
+		this should be sufficient for most doc sites
+		*/
+		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+		const content = await this.page.content()
+
+		// use cheerio to parse and clean up the HTML
+		const $ = cheerio.load(content)
+		$("script, style, nav, footer, header").remove()
+
+		// convert cleaned HTML to markdown
+		const turndownService = new TurndownService()
+		return turndownService.turndown($.html())
+	}
+}