Mirror of https://github.com/pacnpal/Roo-Code.git (synced 2025-12-22 13:21:07 -05:00)
Use a single browser instance to scrape multiple sites
@@ -1,129 +1,93 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"
 
-const PUPPETEER_DIR = "puppeteer"
+interface PCRStats {
+	puppeteer: { launch: typeof launch }
+	executablePath: string
+}
 
 export class UrlScraper {
 	private context: vscode.ExtensionContext
+	private browser?: Browser
+	private page?: Page
 
 	constructor(context: vscode.ExtensionContext) {
 		this.context = context
 	}
 
-	private async ensureChromiumExists(): Promise<void> {
+	private async ensureChromiumExists(): Promise<PCRStats> {
 		const globalStoragePath = this.context?.globalStorageUri?.fsPath
 		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
 		}
 
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		if (!(await fileExists(puppeteerDir))) {
+		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+		const dirExists = await fs
+			.access(puppeteerDir)
+			.then(() => true)
+			.catch(() => false)
+		if (!dirExists) {
 			await fs.mkdir(puppeteerDir, { recursive: true })
 		}
 
-		const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
-
-		if (!(await fileExists(chromiumPath))) {
-			// If Chromium doesn't exist, download it
-			await PCR({
-				downloadPath: puppeteerDir,
-			})
-		}
-	}
-
-	async urlToMarkdown(url: string): Promise<string> {
-		await this.ensureChromiumExists()
-
-		const globalStoragePath = this.context?.globalStorageUri?.fsPath
-		if (!globalStoragePath) {
-			throw new Error("Global storage uri is invalid")
-		}
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		const stats = await PCR({
+		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+		// if it does exist it will return the path to existing chromium
+		const stats: PCRStats = await PCR({
 			downloadPath: puppeteerDir,
 		})
-		const browser: Browser = await stats.puppeteer.launch({
+
+		return stats
+	}
+
+	async launchBrowser(): Promise<void> {
+		if (this.browser) {
+			return
+		}
+		const stats = await this.ensureChromiumExists()
+		this.browser = await stats.puppeteer.launch({
 			args: [
-				"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
 			],
 			executablePath: stats.executablePath,
 		})
-
-		try {
-			const page = await browser.newPage()
-
-			/*
-			- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
-			- domcontentloaded is when the basic DOM is loaded
-			this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
-			*/
-			await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-			// await this.waitTillHTMLRendered(page)
-			const content = await page.content()
-
-			// Use Cheerio to parse and clean up the HTML
-			const $ = cheerio.load(content)
-			$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
-
-			// Convert cleaned HTML to Markdown
-			const turndownService = new TurndownService()
-			const markdown = turndownService.turndown($.html())
-
-			return markdown
-		} finally {
-			await browser.close()
-		}
+		// (latest version of puppeteer does not add headless to user agent)
+		this.page = await this.browser?.newPage()
 	}
 
-	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-	/*
-	private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
-		const checkDurationMsecs = 500 // 1000
-		const maxChecks = timeout / checkDurationMsecs
-		let lastHTMLSize = 0
-		let checkCounts = 1
-		let countStableSizeIterations = 0
-		const minStableSizeIterations = 3
-
-		while (checkCounts++ <= maxChecks) {
-			let html = await page.content()
-			let currentHTMLSize = html.length
-
-			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-				countStableSizeIterations++
-			} else {
-				countStableSizeIterations = 0 //reset the counter
-			}
-
-			if (countStableSizeIterations >= minStableSizeIterations) {
-				console.log("Page rendered fully...")
-				break
-			}
-
-			lastHTMLSize = currentHTMLSize
-			await delay(checkDurationMsecs)
-		}
+	async closeBrowser(): Promise<void> {
+		await this.browser?.close()
+		this.browser = undefined
+		this.page = undefined
 	}
-	*/
-}
 
-async function fileExists(path: string): Promise<boolean> {
-	try {
-		await fs.access(path)
-		return true
-	} catch {
-		return false
+	// must make sure to call launchBrowser before and closeBrowser after using this
+	async urlToMarkdown(url: string): Promise<string> {
+		if (!this.browser || !this.page) {
+			throw new Error("Browser not initialized")
+		}
+		/*
+		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
+		- domcontentloaded is when the basic DOM is loaded
+		this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
+		https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+		*/
+		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+		const content = await this.page.content()
+
+		// use cheerio to parse and clean up the HTML
+		const $ = cheerio.load(content)
+		$("script, style, nav, footer, header").remove()
+
+		// convert cleaned HTML to markdown
+		const turndownService = new TurndownService()
+		const markdown = turndownService.turndown($.html())
+
+		return markdown
 	}
 }
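The reworked class moves browser ownership to the caller: launch once, convert any number of URLs against the same page, then close. A minimal usage sketch under that assumption (the `context` variable and the example URLs are placeholders, not part of this commit):

// Inside some async function of the extension, where `context` is the vscode.ExtensionContext.
const scraper = new UrlScraper(context)
await scraper.launchBrowser()
try {
	// The same browser instance and page are reused for every URL.
	const docA = await scraper.urlToMarkdown("https://example.com/docs/a")
	const docB = await scraper.urlToMarkdown("https://example.com/docs/b")
	console.log(docA.length, docB.length)
} finally {
	// closeBrowser clears this.browser and this.page so the scraper can be relaunched later.
	await scraper.closeBrowser()
}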
@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
 	}
 }
 
-export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
+export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
 	const mentions: Set<string> = new Set()
 	let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
 		mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		return match
 	})
 
+	const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
+	let launchBrowserError: Error | undefined
+	if (urlMention) {
+		try {
+			await urlScraper.launchBrowser()
+		} catch (error) {
+			launchBrowserError = error
+			vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
+		}
+	}
+
 	for (const mention of mentions) {
-		if (mention.startsWith("http") && urlScraper) {
-			try {
-				const markdown = await urlScraper.urlToMarkdown(mention)
-				parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
-			} catch (error) {
-				vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
-				parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
-			}
+		if (mention.startsWith("http")) {
+			let result: string
+			if (launchBrowserError) {
+				result = `Error fetching content: ${launchBrowserError.message}`
+			} else {
+				try {
+					const markdown = await urlScraper.urlToMarkdown(mention)
+					result = markdown
+				} catch (error) {
+					vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
+					result = `Error fetching content: ${error.message}`
+				}
+			}
+			parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
 		} else if (mention.startsWith("/")) {
 			const mentionPath = mention.slice(1) // Remove the leading '/'
 			try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		}
 	}
 
+	if (urlMention) {
+		try {
+			await urlScraper.closeBrowser()
+		} catch (error) {
+			console.error(`Error closing browser: ${error.message}`)
+		}
+	}
+
 	return parsedText
 }
 
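With urlScraper now a required parameter, parseMentions owns the launch/close cycle around the whole batch of mentions, so callers only construct the scraper and pass it in. A hypothetical call site (variable names are illustrative only):

// `context`, `rawUserInput`, and `workspaceCwd` are placeholders for whatever the extension already has in scope.
const urlScraper = new UrlScraper(context)
const expandedText = await parseMentions(rawUserInput, workspaceCwd, urlScraper)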
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 	if (stats.isFile()) {
 		const isBinary = await isBinaryFile(absPath).catch(() => false)
 		if (isBinary) {
-			return "(Binary file)"
+			return "(Binary file, unable to display content)"
 		}
 		const content = await extractTextFromFile(absPath)
 		return content