import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Browser, Page, ScreenshotOptions, TimeoutError, launch } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
import pWaitFor from "p-wait-for"
import delay from "delay"

/** Subset of the object resolved by `puppeteer-chromium-resolver` that this file relies on. */
interface PCRStats {
  puppeteer: { launch: typeof launch }
  executablePath: string
}

/**
 * Drives a Chromium instance (resolved via puppeteer-chromium-resolver) to turn a URL
 * into markdown, or into a screenshot plus the console output captured while loading.
 *
 * Usage contract: call `launchBrowser()` first, then any number of `urlTo*` calls,
 * then `closeBrowser()`.
 */
export class UrlContentFetcher {
  private context: vscode.ExtensionContext
  private browser?: Browser
  private page?: Page

  constructor(context: vscode.ExtensionContext) {
    this.context = context
  }

  /**
   * Makes sure a Chromium build is available under `<globalStorage>/puppeteer`.
   *
   * @returns the resolver stats (a `launch` function and the Chromium executable path).
   * @throws Error if the extension's global storage path is unavailable.
   */
  private async ensureChromiumExists(): Promise<PCRStats> {
    const globalStoragePath = this.context?.globalStorageUri?.fsPath
    if (!globalStoragePath) {
      throw new Error("Global storage uri is invalid")
    }

    const puppeteerDir = path.join(globalStoragePath, "puppeteer")
    // recursive mkdir is a no-op when the directory already exists, so no prior access() check is needed
    await fs.mkdir(puppeteerDir, { recursive: true })

    // if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
    // if it does exist it will return the path to existing chromium
    const stats: PCRStats = await PCR({
      downloadPath: puppeteerDir,
    })
    return stats
  }

  /**
   * Launches Chromium and opens the single page reused by the `urlTo*` methods.
   * Does nothing if a browser is already running.
   */
  async launchBrowser(): Promise<void> {
    if (this.browser) {
      return
    }

    const stats = await this.ensureChromiumExists()
    const browser = await stats.puppeteer.launch({
      args: [
        "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
      ],
      executablePath: stats.executablePath,
    })
    // (latest version of puppeteer does not add headless to user agent)

    try {
      const page = await browser.newPage()
      // Publish state only once both steps succeeded, so a failed newPage() cannot leave
      // the instance with a browser but no page (which would make later launches no-ops).
      this.browser = browser
      this.page = page
    } catch (err) {
      // best-effort cleanup so the Chromium process is not leaked; the original error is what matters
      await browser.close().catch(() => {})
      throw err
    }
  }

  /** Closes the browser (if any) and always clears the cached browser/page handles. */
  async closeBrowser(): Promise<void> {
    try {
      await this.browser?.close()
    } finally {
      this.browser = undefined
      this.page = undefined
    }
  }

  /**
   * Loads `url` and returns its content converted to markdown.
   * Must be called after `launchBrowser()` and before `closeBrowser()`.
   *
   * @throws Error if the browser has not been launched; navigation errors
   *   (including timeouts) from `page.goto` propagate to the caller.
   */
  async urlToMarkdown(url: string): Promise<string> {
    const page = this.page
    if (!this.browser || !page) {
      throw new Error("Browser not initialized")
    }

    /*
    - networkidle2 is equivalent to playwright's networkidle where it waits until there are
      no more than 2 network connections for at least 500 ms.
    - domcontentloaded is when the basic DOM is loaded
    this should be sufficient for most doc sites
    */
    await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
    const content = await page.content()

    // use cheerio to parse and clean up the HTML
    const $ = cheerio.load(content)
    $("script, style, nav, footer, header").remove()

    // convert cleaned HTML to markdown
    const turndownService = new TurndownService()
    return turndownService.turndown($.html())
  }

  /**
   * Loads `url`, waits for the page to settle, and returns a screenshot (as a data URL)
   * together with the console messages and page errors captured while loading.
   * Must be called after `launchBrowser()` and before `closeBrowser()`.
   *
   * Navigation timeouts are tolerated (whatever has rendered is captured); other
   * navigation errors are recorded in `logs` instead of being thrown.
   *
   * @throws Error if the browser has not been launched or no screenshot could be taken.
   */
  async urlToScreenshotAndLogs(url: string): Promise<{ screenshot: string; logs: string }> {
    const page = this.page
    if (!this.browser || !page) {
      throw new Error("Browser not initialized")
    }

    const logs: string[] = []
    let lastLogTs = Date.now()

    // Named handlers so that exactly these listeners can be detached afterwards.
    const onConsole = (msg: { type(): string; text(): string }): void => {
      if (msg.type() === "log") {
        logs.push(msg.text())
      } else {
        logs.push(`[${msg.type()}] ${msg.text()}`)
      }
      lastLogTs = Date.now()
    }
    const onPageError = (err: unknown): void => {
      logs.push(`[Page Error] ${String(err)}`)
      lastLogTs = Date.now()
    }

    page.on("console", onConsole)
    page.on("pageerror", onPageError)

    try {
      try {
        // networkidle2 alone isn't good enough since the page may still be rendering after
        // the network quiets down, so the HTML size is additionally polled until it stabilizes.
        await page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
        // await page.goto(url, { timeout: 10_000, waitUntil: "load" })
        await this.waitTillHTMLStable(page) // in case the page is loading more resources
      } catch (err) {
        if (!(err instanceof TimeoutError)) {
          logs.push(`[Navigation Error] ${String(err)}`)
        }
      }

      // Wait for console inactivity, with a timeout
      await pWaitFor(() => Date.now() - lastLogTs >= 500, {
        timeout: 3_000,
        interval: 100,
      }).catch(() => {})

      // image cannot exceed 8_000 pixels
      const { pageHeight, pageWidth } = await page.evaluate(() => {
        const html: HTMLElement | null = document.documentElement
        const body: HTMLElement | null = document.body
        return {
          pageHeight: html?.scrollHeight || body?.scrollHeight,
          pageWidth: html?.clientWidth || body?.clientWidth,
        }
      })

      // A zero-sized clip cannot be captured; fall back to the viewport
      // (width 800 height 600 by default) when the page reports no size.
      const viewport = page.viewport()
      const clipWidth = pageWidth || viewport?.width || 800
      const clipHeight = Math.min(pageHeight || viewport?.height || 600, 8_000)

      const options: ScreenshotOptions = {
        // fullPage: true, // clip and fullPage are mutually exclusive
        encoding: "base64",
        // quality: 80,
        clip: {
          x: 0,
          y: 0,
          width: clipWidth,
          height: clipHeight,
        },
      }

      let screenshotBase64 = await page.screenshot({
        ...options,
        type: "webp",
      })
      let screenshot = `data:image/webp;base64,${screenshotBase64}`

      if (!screenshotBase64) {
        console.log("webp screenshot failed, trying png")
        screenshotBase64 = await page.screenshot({
          ...options,
          type: "png",
        })
        screenshot = `data:image/png;base64,${screenshotBase64}`
      }

      if (!screenshotBase64) {
        throw new Error("Failed to take screenshot.")
      }

      return {
        screenshot,
        logs: logs.join("\n"),
      }
    } finally {
      // Always detach, even when the screenshot throws, so listeners do not pile up
      // on the reused page across calls.
      page.off("console", onConsole)
      page.off("pageerror", onPageError)
    }
  }

  // page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
  // https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
  /**
   * Polls the page's serialized HTML every 500 ms and returns once its length has been
   * unchanged for 3 consecutive polls, or after `timeout` ms worth of polls.
   *
   * @param page the page to poll
   * @param timeout upper bound for the polling, in milliseconds
   */
  private async waitTillHTMLStable(page: Page, timeout = 5_000): Promise<void> {
    const checkDurationMsecs = 500 // 1000
    const maxChecks = timeout / checkDurationMsecs
    const minStableSizeIterations = 3

    let lastHTMLSize = 0
    let countStableSizeIterations = 0

    for (let check = 1; check <= maxChecks; check++) {
      const html = await page.content()
      const currentHTMLSize = html.length

      // let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)

      console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)

      if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
        countStableSizeIterations++
      } else {
        countStableSizeIterations = 0 //reset the counter
      }

      if (countStableSizeIterations >= minStableSizeIterations) {
        console.log("Page rendered fully...")
        break
      }

      lastHTMLSize = currentHTMLSize
      await delay(checkDurationMsecs)
    }
  }
}