// File: Roo-Code/src/utils/UrlContentFetcher.ts (224 lines, 6.7 KiB, TypeScript)

import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Browser, Page, ScreenshotOptions, TimeoutError, launch } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
import pWaitFor from "p-wait-for"
import delay from "delay"
/**
 * The subset of the value resolved by `PCR(...)` (puppeteer-chromium-resolver)
 * that this file actually reads.
 */
interface PCRStats {
	/** Path of the Chromium executable; passed as `executablePath` when launching. */
	executablePath: string
	/** Object exposing puppeteer's `launch` function, used to start the browser. */
	puppeteer: { launch: typeof launch }
}
/**
 * Loads URLs in a Chromium instance driven by puppeteer and turns them into
 * either markdown text or a screenshot plus the page's console output.
 *
 * Lifecycle: call {@link launchBrowser} first, then any number of
 * `urlTo*` calls, then {@link closeBrowser}. The `urlTo*` methods throw
 * "Browser not initialized" when no browser/page is available.
 */
export class UrlContentFetcher {
	private context: vscode.ExtensionContext
	private browser?: Browser
	private page?: Page

	/**
	 * @param context Extension context; its `globalStorageUri` decides where
	 * Chromium is stored.
	 */
	constructor(context: vscode.ExtensionContext) {
		this.context = context
	}

	/**
	 * Makes sure a `puppeteer` directory exists under the extension's global
	 * storage and asks puppeteer-chromium-resolver for a Chromium located there.
	 *
	 * @returns The resolver result (puppeteer entry point + executable path).
	 * @throws Error("Global storage uri is invalid") when the context exposes no
	 * global storage path.
	 */
	private async ensureChromiumExists(): Promise<PCRStats> {
		const globalStoragePath = this.context?.globalStorageUri?.fsPath
		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
		}
		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
		// A recursive mkdir succeeds when the directory already exists, so no
		// separate (and racy) existence check is needed.
		await fs.mkdir(puppeteerDir, { recursive: true })
		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
		// if it does exist it will return the path to existing chromium
		const stats: PCRStats = await PCR({
			downloadPath: puppeteerDir,
		})
		return stats
	}

	/**
	 * Starts Chromium and opens the single page reused by the `urlTo*` methods.
	 * Does nothing when a browser is already running.
	 *
	 * The browser and page are only stored once both exist; if opening the page
	 * fails, the freshly launched browser is closed so no process is left behind
	 * and a later call can retry from scratch.
	 */
	async launchBrowser(): Promise<void> {
		if (this.browser) {
			return
		}
		const stats = await this.ensureChromiumExists()
		const browser = await stats.puppeteer.launch({
			args: [
				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
			],
			executablePath: stats.executablePath,
		})
		// (latest version of puppeteer does not add headless to user agent)
		let page: Page
		try {
			page = await browser.newPage()
		} catch (err: unknown) {
			// Best-effort cleanup; the original failure is the one worth reporting.
			await browser.close().catch(() => {})
			throw err
		}
		this.browser = browser
		this.page = page
	}

	/**
	 * Closes the browser (if any) and forgets the browser/page handles.
	 * The handles are cleared even when `close()` rejects, so the instance can
	 * be relaunched afterwards; the rejection itself is still propagated.
	 */
	async closeBrowser(): Promise<void> {
		try {
			await this.browser?.close()
		} finally {
			this.browser = undefined
			this.page = undefined
		}
	}

	/**
	 * Navigates to `url` and returns the page converted to markdown.
	 * Must be called between {@link launchBrowser} and {@link closeBrowser}.
	 *
	 * @param url Address to load.
	 * @returns Markdown produced from the page HTML after `script`, `style`,
	 * `nav`, `footer` and `header` elements have been removed.
	 * @throws Error("Browser not initialized") when no browser/page exists;
	 * navigation errors (including the 10 s timeout) propagate from puppeteer.
	 */
	async urlToMarkdown(url: string): Promise<string> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		const page = this.page
		/*
		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
		- domcontentloaded is when the basic DOM is loaded
		this should be sufficient for most doc sites
		*/
		await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
		const content = await page.content()
		// use cheerio to parse and clean up the HTML
		const $ = cheerio.load(content)
		$("script, style, nav, footer, header").remove()
		// convert cleaned HTML to markdown
		const turndownService = new TurndownService()
		return turndownService.turndown($.html())
	}

	/**
	 * Navigates to `url`, waits for the page and its console to settle, and
	 * returns a screenshot together with everything the page logged.
	 *
	 * @param url Address to load.
	 * @returns `screenshot` as a base64 data URL (webp, or png as fallback) and
	 * `logs` as newline-joined console messages, page errors and any
	 * non-timeout navigation error.
	 * @throws Error("Browser not initialized") when no browser/page exists, and
	 * Error("Failed to take screenshot.") when both encodings yield nothing.
	 */
	async urlToScreenshotAndLogs(url: string): Promise<{ screenshot: string; logs: string }> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		const page = this.page
		const logs: string[] = []
		let lastLogTs = Date.now()

		// Named handlers so exactly these two can be detached again, without
		// touching listeners registered by anyone else on the same page.
		const onConsole = (msg: { type(): string; text(): string }): void => {
			if (msg.type() === "log") {
				logs.push(msg.text())
			} else {
				logs.push(`[${msg.type()}] ${msg.text()}`)
			}
			lastLogTs = Date.now()
		}
		const onPageError = (err: unknown): void => {
			logs.push(`[Page Error] ${String(err)}`)
			lastLogTs = Date.now()
		}

		page.on("console", onConsole)
		page.on("pageerror", onPageError)
		try {
			try {
				// networkidle2 can be reached before client-side rendering has finished,
				// so after navigating we additionally poll until the HTML stops changing.
				await page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
				await this.waitTillHTMLStable(page) // in case the page is loading more resources
			} catch (err: unknown) {
				// A timeout is tolerated silently: whatever rendered so far is still captured.
				if (!(err instanceof TimeoutError)) {
					logs.push(`[Navigation Error] ${String(err)}`)
				}
			}

			// Wait for console inactivity, with a timeout
			await pWaitFor(() => Date.now() - lastLogTs >= 500, {
				timeout: 3_000,
				interval: 100,
			}).catch(() => {})

			const screenshot = await this.captureScreenshot(page)
			return {
				screenshot,
				logs: logs.join("\n"),
			}
		} finally {
			// Runs on success and on failure, so repeated calls never stack listeners.
			page.off("console", onConsole)
			page.off("pageerror", onPageError)
		}
	}

	/**
	 * Takes a base64 screenshot of `page`, preferring webp and retrying as png
	 * when the webp result is empty.
	 *
	 * @returns A `data:image/...;base64,` URL.
	 * @throws Error("Failed to take screenshot.") when both attempts are empty.
	 */
	private async captureScreenshot(page: Page): Promise<string> {
		// image cannot exceed 8_000 pixels
		const { pageHeight, pageWidth } = await page.evaluate(() => {
			const html: HTMLElement | null = document.documentElement
			const body: HTMLElement | null = document.body
			return {
				// `||` on purpose: a 0 measurement should fall through to the body value
				pageHeight: html?.scrollHeight || body?.scrollHeight,
				pageWidth: html?.clientWidth || body?.clientWidth,
			}
		})
		// const defaultViewport = page.viewport(); // width 800 height 600 by default
		let options: ScreenshotOptions
		if (pageHeight && pageWidth) {
			options = {
				// fullPage: true, // clip and fullPage are mutually exclusive
				encoding: "base64",
				// quality: 80,
				clip: {
					x: 0,
					y: 0,
					width: pageWidth,
					height: Math.min(pageHeight, 8_000),
				},
			}
		} else {
			// if we can't get the page dimensions, fallback to full page screenshot
			options = {
				encoding: "base64",
				fullPage: true,
			}
		}
		let screenshotBase64 = await page.screenshot({
			...options,
			type: "webp",
		})
		let screenshot = `data:image/webp;base64,${screenshotBase64}`
		if (!screenshotBase64) {
			console.log("webp screenshot failed, trying png")
			screenshotBase64 = await page.screenshot({
				...options,
				type: "png",
			})
			screenshot = `data:image/png;base64,${screenshotBase64}`
		}
		if (!screenshotBase64) {
			throw new Error("Failed to take screenshot.")
		}
		return screenshot
	}

	/**
	 * Polls the page HTML every 500 ms until its length has been unchanged (and
	 * non-zero) for three consecutive checks, or until `timeout` has been used up.
	 *
	 * page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
	 * https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
	 *
	 * @param page Page to observe.
	 * @param timeout Upper bound in milliseconds; converted to a number of checks.
	 */
	private async waitTillHTMLStable(page: Page, timeout = 5_000): Promise<void> {
		const checkDurationMsecs = 500 // 1000
		const maxChecks = timeout / checkDurationMsecs
		const minStableSizeIterations = 3
		let lastHTMLSize = 0
		let countStableSizeIterations = 0

		for (let check = 1; check <= maxChecks; check++) {
			const html = await page.content()
			const currentHTMLSize = html.length
			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
				countStableSizeIterations++
			} else {
				countStableSizeIterations = 0 //reset the counter
			}
			if (countStableSizeIterations >= minStableSizeIterations) {
				console.log("Page rendered fully...")
				break
			}
			lastHTMLSize = currentHTMLSize
			await delay(checkDurationMsecs)
		}
	}
}