Mirror of https://github.com/pacnpal/Roo-Code.git (synced 2025-12-22 13:21:07 -05:00)
Use a single browser instance to scrape multiple sites
@@ -1,129 +1,93 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"
 
-const PUPPETEER_DIR = "puppeteer"
+interface PCRStats {
+	puppeteer: { launch: typeof launch }
+	executablePath: string
+}
 
 export class UrlScraper {
 	private context: vscode.ExtensionContext
+	private browser?: Browser
+	private page?: Page
 
 	constructor(context: vscode.ExtensionContext) {
 		this.context = context
 	}
 
-	private async ensureChromiumExists(): Promise<void> {
+	private async ensureChromiumExists(): Promise<PCRStats> {
 		const globalStoragePath = this.context?.globalStorageUri?.fsPath
 		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
 		}
 
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		if (!(await fileExists(puppeteerDir))) {
+		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+		const dirExists = await fs
+			.access(puppeteerDir)
+			.then(() => true)
+			.catch(() => false)
+		if (!dirExists) {
 			await fs.mkdir(puppeteerDir, { recursive: true })
 		}
 
-		const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
-
-		if (!(await fileExists(chromiumPath))) {
-			// If Chromium doesn't exist, download it
-			await PCR({
-				downloadPath: puppeteerDir,
-			})
-		}
-	}
-
-	async urlToMarkdown(url: string): Promise<string> {
-		await this.ensureChromiumExists()
-
-		const globalStoragePath = this.context?.globalStorageUri?.fsPath
-		if (!globalStoragePath) {
-			throw new Error("Global storage uri is invalid")
-		}
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		const stats = await PCR({
+		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+		// if it does exist it will return the path to existing chromium
+		const stats: PCRStats = await PCR({
 			downloadPath: puppeteerDir,
 		})
-		const browser: Browser = await stats.puppeteer.launch({
+
+		return stats
+	}
+
+	async launchBrowser(): Promise<void> {
+		if (this.browser) {
+			return
+		}
+		const stats = await this.ensureChromiumExists()
+		this.browser = await stats.puppeteer.launch({
 			args: [
-				"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
 			],
 			executablePath: stats.executablePath,
 		})
-
-		try {
-			const page = await browser.newPage()
-
-			/*
-			- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
-			- domcontentloaded is when the basic DOM is loaded
-			this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
-			*/
-			await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-			// await this.waitTillHTMLRendered(page)
-			const content = await page.content()
-
-			// Use Cheerio to parse and clean up the HTML
-			const $ = cheerio.load(content)
-			$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
-
-			// Convert cleaned HTML to Markdown
-			const turndownService = new TurndownService()
-			const markdown = turndownService.turndown($.html())
-
-			return markdown
-		} finally {
-			await browser.close()
-		}
+		// (latest version of puppeteer does not add headless to user agent)
+		this.page = await this.browser?.newPage()
 	}
 
-	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-	/*
-	private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
-		const checkDurationMsecs = 500 // 1000
-		const maxChecks = timeout / checkDurationMsecs
-		let lastHTMLSize = 0
-		let checkCounts = 1
-		let countStableSizeIterations = 0
-		const minStableSizeIterations = 3
-
-		while (checkCounts++ <= maxChecks) {
-			let html = await page.content()
-			let currentHTMLSize = html.length
-
-			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-				countStableSizeIterations++
-			} else {
-				countStableSizeIterations = 0 //reset the counter
-			}
-
-			if (countStableSizeIterations >= minStableSizeIterations) {
-				console.log("Page rendered fully...")
-				break
-			}
-
-			lastHTMLSize = currentHTMLSize
-			await delay(checkDurationMsecs)
-		}
+	async closeBrowser(): Promise<void> {
+		await this.browser?.close()
+		this.browser = undefined
+		this.page = undefined
 	}
-	*/
-}
 
-async function fileExists(path: string): Promise<boolean> {
-	try {
-		await fs.access(path)
-		return true
-	} catch {
-		return false
+	// must make sure to call launchBrowser before and closeBrowser after using this
+	async urlToMarkdown(url: string): Promise<string> {
+		if (!this.browser || !this.page) {
+			throw new Error("Browser not initialized")
+		}
+		/*
+		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
+		- domcontentloaded is when the basic DOM is loaded
+		this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
+		https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+		*/
+		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+		const content = await this.page.content()
+
+		// use cheerio to parse and clean up the HTML
+		const $ = cheerio.load(content)
+		$("script, style, nav, footer, header").remove()
+
+		// convert cleaned HTML to markdown
+		const turndownService = new TurndownService()
+		const markdown = turndownService.turndown($.html())
+
+		return markdown
 	}
 }
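The reworked class moves browser ownership to the caller: launch once, convert any number of URLs against the same page, then close. A minimal usage sketch under that assumption (the `context` variable and the example URLs are placeholders, not part of this commit):

// Inside some async function of the extension, where `context` is the vscode.ExtensionContext.
const scraper = new UrlScraper(context)
await scraper.launchBrowser()
try {
	// The same browser instance and page are reused for every URL.
	const docA = await scraper.urlToMarkdown("https://example.com/docs/a")
	const docB = await scraper.urlToMarkdown("https://example.com/docs/b")
	console.log(docA.length, docB.length)
} finally {
	// closeBrowser clears this.browser and this.page so the scraper can be relaunched later.
	await scraper.closeBrowser()
}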
@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
 	}
 }
 
-export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
+export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
 	const mentions: Set<string> = new Set()
 	let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
 		mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		return match
 	})
 
+	const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
+	let launchBrowserError: Error | undefined
+	if (urlMention) {
+		try {
+			await urlScraper.launchBrowser()
+		} catch (error) {
+			launchBrowserError = error
+			vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
+		}
+	}
+
 	for (const mention of mentions) {
-		if (mention.startsWith("http") && urlScraper) {
-			try {
-				const markdown = await urlScraper.urlToMarkdown(mention)
-				parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
-			} catch (error) {
-				vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
-				parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
-			}
+		if (mention.startsWith("http")) {
+			let result: string
+			if (launchBrowserError) {
+				result = `Error fetching content: ${launchBrowserError.message}`
+			} else {
+				try {
+					const markdown = await urlScraper.urlToMarkdown(mention)
+					result = markdown
+				} catch (error) {
+					vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
+					result = `Error fetching content: ${error.message}`
+				}
+			}
+			parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
 		} else if (mention.startsWith("/")) {
 			const mentionPath = mention.slice(1) // Remove the leading '/'
 			try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		}
 	}
 
+	if (urlMention) {
+		try {
+			await urlScraper.closeBrowser()
+		} catch (error) {
+			console.error(`Error closing browser: ${error.message}`)
+		}
+	}
+
 	return parsedText
 }
 
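With urlScraper now a required parameter, parseMentions owns the launch/close cycle around the whole batch of mentions, so callers only construct the scraper and pass it in. A hypothetical call site (variable names are illustrative only):

// `context`, `rawUserInput`, and `workspaceCwd` are placeholders for whatever the extension already has in scope.
const urlScraper = new UrlScraper(context)
const expandedText = await parseMentions(rawUserInput, workspaceCwd, urlScraper)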
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 	if (stats.isFile()) {
 		const isBinary = await isBinaryFile(absPath).catch(() => false)
 		if (isBinary) {
-			return "(Binary file)"
+			return "(Binary file, unable to display content)"
 		}
 		const content = await extractTextFromFile(absPath)
 		return content