Use a single browser instance to scrape multiple sites
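Before this change, every `urlToMarkdown` call resolved Chromium, launched a fresh browser, scraped one page, and closed the browser again. `UrlScraper` now owns a long-lived `browser`/`page` pair, bracketed by explicit `launchBrowser()` and `closeBrowser()` calls, so multiple URL mentions in one message are scraped through a single browser instance. A minimal sketch of the new lifecycle (assumptions: `context` stands in for the extension's `vscode.ExtensionContext`, and the URLs are placeholders):

    const scraper = new UrlScraper(context)
    await scraper.launchBrowser() // resolves (or downloads) Chromium via PCR, launches once
    try {
        const docsA = await scraper.urlToMarkdown("https://example.com/docs/a")
        const docsB = await scraper.urlToMarkdown("https://example.com/docs/b") // reuses the same browser and page
    } finally {
        await scraper.closeBrowser() // clears this.browser/this.page so launchBrowser can run again
    }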
@@ -27,6 +27,7 @@ import { truncateHalfConversation } from "./utils/context-management"
 import { extractTextFromFile } from "./utils/extract-text"
 import { regexSearchFiles } from "./utils/ripgrep"
 import { parseMentions } from "./utils/context-mentions"
+import { UrlScraper } from "./utils/UrlScraper"
 
 const SYSTEM_PROMPT =
     async () => `You are Claude Dev, a highly skilled software developer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
@@ -250,6 +251,7 @@ export class ClaudeDev {
     readonly taskId: string
     private api: ApiHandler
     private terminalManager: TerminalManager
+    private urlScraper: UrlScraper
     private didEditFile: boolean = false
     private customInstructions?: string
     private alwaysAllowReadOnly: boolean
@@ -275,6 +277,7 @@ export class ClaudeDev {
         this.providerRef = new WeakRef(provider)
         this.api = buildApiHandler(apiConfiguration)
         this.terminalManager = new TerminalManager()
+        this.urlScraper = new UrlScraper(provider.context)
         this.customInstructions = customInstructions
         this.alwaysAllowReadOnly = alwaysAllowReadOnly ?? false
 
@@ -675,6 +678,7 @@ export class ClaudeDev {
     abortTask() {
         this.abort = true // will stop any autonomously running promises
         this.terminalManager.disposeAll()
+        this.urlScraper.closeBrowser()
     }
 
     async executeTool(toolName: ToolName, toolInput: any): Promise<[boolean, ToolResponse]> {
@@ -1643,14 +1647,14 @@ ${this.customInstructions.trim()}
                 if (block.type === "text") {
                     return {
                         ...block,
-                        text: await parseMentions(block.text, cwd, this.providerRef.deref()?.urlScraper),
+                        text: await parseMentions(block.text, cwd, this.urlScraper),
                     }
                 } else if (block.type === "tool_result") {
                     const isUserMessage = (text: string) => text.includes("<feedback>") || text.includes("<answer>")
                     if (typeof block.content === "string" && isUserMessage(block.content)) {
                         return {
                             ...block,
-                            content: await parseMentions(block.content, cwd, this.providerRef.deref()?.urlScraper),
+                            content: await parseMentions(block.content, cwd, this.urlScraper),
                         }
                     } else if (Array.isArray(block.content)) {
                         const parsedContent = await Promise.all(
@@ -1658,11 +1662,7 @@ ${this.customInstructions.trim()}
                                 if (contentBlock.type === "text" && isUserMessage(contentBlock.text)) {
                                     return {
                                         ...contentBlock,
-                                        text: await parseMentions(
-                                            contentBlock.text,
-                                            cwd,
-                                            this.providerRef.deref()?.urlScraper
-                                        ),
+                                        text: await parseMentions(contentBlock.text, cwd, this.urlScraper),
                                     }
                                 }
                                 return contentBlock
@@ -54,14 +54,12 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
     private view?: vscode.WebviewView | vscode.WebviewPanel
     private claudeDev?: ClaudeDev
     private workspaceTracker?: WorkspaceTracker
-    urlScraper?: UrlScraper
     private latestAnnouncementId = "sep-14-2024" // update to some unique identifier when we add a new announcement
 
     constructor(readonly context: vscode.ExtensionContext, private readonly outputChannel: vscode.OutputChannel) {
         this.outputChannel.appendLine("ClaudeDevProvider instantiated")
         ClaudeDevProvider.activeInstances.add(this)
         this.workspaceTracker = new WorkspaceTracker(this)
-        this.urlScraper = new UrlScraper(this.context)
         this.revertKodu()
     }
 
@@ -107,7 +105,6 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
         }
         this.workspaceTracker?.dispose()
         this.workspaceTracker = undefined
-        this.urlScraper = undefined
         this.outputChannel.appendLine("Disposed all disposables")
         ClaudeDevProvider.activeInstances.delete(this)
     }
@@ -1,129 +1,93 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"
 
-const PUPPETEER_DIR = "puppeteer"
+interface PCRStats {
+    puppeteer: { launch: typeof launch }
+    executablePath: string
+}
 
 export class UrlScraper {
     private context: vscode.ExtensionContext
+    private browser?: Browser
+    private page?: Page
 
     constructor(context: vscode.ExtensionContext) {
         this.context = context
     }
 
-    private async ensureChromiumExists(): Promise<void> {
+    private async ensureChromiumExists(): Promise<PCRStats> {
         const globalStoragePath = this.context?.globalStorageUri?.fsPath
         if (!globalStoragePath) {
             throw new Error("Global storage uri is invalid")
         }
 
-        const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-        if (!(await fileExists(puppeteerDir))) {
+        const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+        const dirExists = await fs
+            .access(puppeteerDir)
+            .then(() => true)
+            .catch(() => false)
+        if (!dirExists) {
             await fs.mkdir(puppeteerDir, { recursive: true })
         }
 
-        const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
-
-        if (!(await fileExists(chromiumPath))) {
-            // If Chromium doesn't exist, download it
-            await PCR({
-                downloadPath: puppeteerDir,
-            })
-        }
-    }
-
-    async urlToMarkdown(url: string): Promise<string> {
-        await this.ensureChromiumExists()
-
-        const globalStoragePath = this.context?.globalStorageUri?.fsPath
-        if (!globalStoragePath) {
-            throw new Error("Global storage uri is invalid")
-        }
-        const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-        const stats = await PCR({
+        // if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+        // if it does exist it will return the path to existing chromium
+        const stats: PCRStats = await PCR({
             downloadPath: puppeteerDir,
         })
-        const browser: Browser = await stats.puppeteer.launch({
+
+        return stats
+    }
+
+    async launchBrowser(): Promise<void> {
+        if (this.browser) {
+            return
+        }
+        const stats = await this.ensureChromiumExists()
+        this.browser = await stats.puppeteer.launch({
             args: [
-                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+                "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
             ],
             executablePath: stats.executablePath,
         })
-
-        try {
-            const page = await browser.newPage()
-
-            /*
-            - networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
-            - domcontentloaded is when the basic DOM is loaded
-            this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
-            */
-            await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-            // await this.waitTillHTMLRendered(page)
-            const content = await page.content()
-
-            // Use Cheerio to parse and clean up the HTML
-            const $ = cheerio.load(content)
-            $("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
-
-            // Convert cleaned HTML to Markdown
-            const turndownService = new TurndownService()
-            const markdown = turndownService.turndown($.html())
-
-            return markdown
-        } finally {
-            await browser.close()
-        }
+        // (latest version of puppeteer does not add headless to user agent)
+        this.page = await this.browser?.newPage()
     }
 
-    // page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-    // https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-    /*
-    private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
-        const checkDurationMsecs = 500 // 1000
-        const maxChecks = timeout / checkDurationMsecs
-        let lastHTMLSize = 0
-        let checkCounts = 1
-        let countStableSizeIterations = 0
-        const minStableSizeIterations = 3
-
-        while (checkCounts++ <= maxChecks) {
-            let html = await page.content()
-            let currentHTMLSize = html.length
-
-            // let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-            console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-            if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-                countStableSizeIterations++
-            } else {
-                countStableSizeIterations = 0 //reset the counter
-            }
-
-            if (countStableSizeIterations >= minStableSizeIterations) {
-                console.log("Page rendered fully...")
-                break
-            }
-
-            lastHTMLSize = currentHTMLSize
-            await delay(checkDurationMsecs)
-        }
-    }
-    */
-}
-
-async function fileExists(path: string): Promise<boolean> {
-    try {
-        await fs.access(path)
-        return true
-    } catch {
-        return false
-    }
-}
+    async closeBrowser(): Promise<void> {
+        await this.browser?.close()
+        this.browser = undefined
+        this.page = undefined
+    }
+
+    // must make sure to call launchBrowser before and closeBrowser after using this
+    async urlToMarkdown(url: string): Promise<string> {
+        if (!this.browser || !this.page) {
+            throw new Error("Browser not initialized")
+        }
+        /*
+        - networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
+        - domcontentloaded is when the basic DOM is loaded
+        this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
+        https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+        */
+        await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+        const content = await this.page.content()
+
+        // use cheerio to parse and clean up the HTML
+        const $ = cheerio.load(content)
+        $("script, style, nav, footer, header").remove()
+
+        // convert cleaned HTML to markdown
+        const turndownService = new TurndownService()
+        const markdown = turndownService.turndown($.html())
+
+        return markdown
+    }
+}
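Note that `urlToMarkdown` now assumes the lifecycle above: calling it without a prior `launchBrowser()` throws "Browser not initialized" rather than implicitly launching. A hedged sketch of a defensive wrapper, mirroring how `parseMentions` below folds failures into the returned text (this `safeScrape` helper is illustrative only, not part of the commit):

    async function safeScrape(scraper: UrlScraper, url: string): Promise<string> {
        try {
            return await scraper.urlToMarkdown(url)
        } catch (error) {
            // covers navigation failures as well as the "Browser not initialized" guard
            return `Error fetching content: ${(error as Error).message}`
        }
    }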
@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
     }
 }
 
-export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
+export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
     const mentions: Set<string> = new Set()
     let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
         mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
         return match
     })
 
+    const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
+    let launchBrowserError: Error | undefined
+    if (urlMention) {
+        try {
+            await urlScraper.launchBrowser()
+        } catch (error) {
+            launchBrowserError = error
+            vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
+        }
+    }
+
     for (const mention of mentions) {
-        if (mention.startsWith("http") && urlScraper) {
-            try {
-                const markdown = await urlScraper.urlToMarkdown(mention)
-                parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
-            } catch (error) {
-                vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
-                parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
+        if (mention.startsWith("http")) {
+            let result: string
+            if (launchBrowserError) {
+                result = `Error fetching content: ${launchBrowserError.message}`
+            } else {
+                try {
+                    const markdown = await urlScraper.urlToMarkdown(mention)
+                    result = markdown
+                } catch (error) {
+                    vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
+                    result = `Error fetching content: ${error.message}`
+                }
             }
+            parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
         } else if (mention.startsWith("/")) {
            const mentionPath = mention.slice(1) // Remove the leading '/'
            try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
         }
     }
 
+    if (urlMention) {
+        try {
+            await urlScraper.closeBrowser()
+        } catch (error) {
+            console.error(`Error closing browser: ${error.message}`)
+        }
+    }
+
     return parsedText
 }
 
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise<string> {
     if (stats.isFile()) {
         const isBinary = await isBinaryFile(absPath).catch(() => false)
         if (isBinary) {
-            return "(Binary file)"
+            return "(Binary file, unable to display content)"
         }
         const content = await extractTextFromFile(absPath)
         return content