Use a single browser instance to scrape multiple sites
@@ -27,6 +27,7 @@ import { truncateHalfConversation } from "./utils/context-management"
 import { extractTextFromFile } from "./utils/extract-text"
 import { regexSearchFiles } from "./utils/ripgrep"
 import { parseMentions } from "./utils/context-mentions"
+import { UrlScraper } from "./utils/UrlScraper"

 const SYSTEM_PROMPT =
     async () => `You are Claude Dev, a highly skilled software developer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
@@ -250,6 +251,7 @@ export class ClaudeDev {
     readonly taskId: string
     private api: ApiHandler
     private terminalManager: TerminalManager
+    private urlScraper: UrlScraper
     private didEditFile: boolean = false
     private customInstructions?: string
     private alwaysAllowReadOnly: boolean
@@ -275,6 +277,7 @@ export class ClaudeDev {
         this.providerRef = new WeakRef(provider)
         this.api = buildApiHandler(apiConfiguration)
         this.terminalManager = new TerminalManager()
+        this.urlScraper = new UrlScraper(provider.context)
         this.customInstructions = customInstructions
         this.alwaysAllowReadOnly = alwaysAllowReadOnly ?? false

@@ -675,6 +678,7 @@ export class ClaudeDev {
     abortTask() {
         this.abort = true // will stop any autonomously running promises
         this.terminalManager.disposeAll()
+        this.urlScraper.closeBrowser()
     }

     async executeTool(toolName: ToolName, toolInput: any): Promise<[boolean, ToolResponse]> {
@@ -1643,14 +1647,14 @@ ${this.customInstructions.trim()}
             if (block.type === "text") {
                 return {
                     ...block,
-                    text: await parseMentions(block.text, cwd, this.providerRef.deref()?.urlScraper),
+                    text: await parseMentions(block.text, cwd, this.urlScraper),
                 }
             } else if (block.type === "tool_result") {
                 const isUserMessage = (text: string) => text.includes("<feedback>") || text.includes("<answer>")
                 if (typeof block.content === "string" && isUserMessage(block.content)) {
                     return {
                         ...block,
-                        content: await parseMentions(block.content, cwd, this.providerRef.deref()?.urlScraper),
+                        content: await parseMentions(block.content, cwd, this.urlScraper),
                     }
                 } else if (Array.isArray(block.content)) {
                     const parsedContent = await Promise.all(
@@ -1658,11 +1662,7 @@ ${this.customInstructions.trim()}
                         if (contentBlock.type === "text" && isUserMessage(contentBlock.text)) {
                             return {
                                 ...contentBlock,
-                                text: await parseMentions(
-                                    contentBlock.text,
-                                    cwd,
-                                    this.providerRef.deref()?.urlScraper
-                                ),
+                                text: await parseMentions(contentBlock.text, cwd, this.urlScraper),
                             }
                         }
                         return contentBlock
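Note on the ClaudeDev hunks above: parseMentions previously received the scraper through this.providerRef.deref()?.urlScraper, and a WeakRef deref can return undefined once its target is collected, which is why the urlScraper parameter had to stay optional downstream. Owning the instance on the task (this.urlScraper) removes that optionality and gives abortTask() a deterministic place to close the browser. A minimal sketch of the ownership pattern, with hypothetical Task/Scraper names standing in for ClaudeDev/UrlScraper:

class Scraper {
    async launchBrowser(): Promise<void> {}
    async closeBrowser(): Promise<void> {}
}

class Task {
    // owned directly: same lifetime as the task, never undefined
    private scraper = new Scraper()

    // old style: deref() may yield undefined, forcing optional params downstream
    private providerRef?: WeakRef<{ urlScraper?: Scraper }>
    maybeScraper(): Scraper | undefined {
        return this.providerRef?.deref()?.urlScraper
    }

    abort(): void {
        // mirrors abortTask(): the browser dies with the task
        void this.scraper.closeBrowser()
    }
}
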
@@ -54,14 +54,12 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
     private view?: vscode.WebviewView | vscode.WebviewPanel
     private claudeDev?: ClaudeDev
     private workspaceTracker?: WorkspaceTracker
-    urlScraper?: UrlScraper
     private latestAnnouncementId = "sep-14-2024" // update to some unique identifier when we add a new announcement

     constructor(readonly context: vscode.ExtensionContext, private readonly outputChannel: vscode.OutputChannel) {
         this.outputChannel.appendLine("ClaudeDevProvider instantiated")
         ClaudeDevProvider.activeInstances.add(this)
         this.workspaceTracker = new WorkspaceTracker(this)
-        this.urlScraper = new UrlScraper(this.context)
         this.revertKodu()
     }

@@ -107,7 +105,6 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
         }
         this.workspaceTracker?.dispose()
         this.workspaceTracker = undefined
-        this.urlScraper = undefined
         this.outputChannel.appendLine("Disposed all disposables")
         ClaudeDevProvider.activeInstances.delete(this)
     }
@@ -1,129 +1,93 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"

-const PUPPETEER_DIR = "puppeteer"
+interface PCRStats {
+    puppeteer: { launch: typeof launch }
+    executablePath: string
+}

 export class UrlScraper {
     private context: vscode.ExtensionContext
+    private browser?: Browser
+    private page?: Page

     constructor(context: vscode.ExtensionContext) {
         this.context = context
     }

-    private async ensureChromiumExists(): Promise<void> {
+    private async ensureChromiumExists(): Promise<PCRStats> {
         const globalStoragePath = this.context?.globalStorageUri?.fsPath
         if (!globalStoragePath) {
             throw new Error("Global storage uri is invalid")
         }

-        const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-        if (!(await fileExists(puppeteerDir))) {
+        const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+        const dirExists = await fs
+            .access(puppeteerDir)
+            .then(() => true)
+            .catch(() => false)
+        if (!dirExists) {
             await fs.mkdir(puppeteerDir, { recursive: true })
         }

-        const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
-        if (!(await fileExists(chromiumPath))) {
-            // If Chromium doesn't exist, download it
-            await PCR({
-                downloadPath: puppeteerDir,
-            })
-        }
+        // if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+        // if it does exist it will return the path to existing chromium
+        const stats: PCRStats = await PCR({
+            downloadPath: puppeteerDir,
+        })
+        return stats
     }

-    async urlToMarkdown(url: string): Promise<string> {
-        await this.ensureChromiumExists()
-
-        const globalStoragePath = this.context?.globalStorageUri?.fsPath
-        if (!globalStoragePath) {
-            throw new Error("Global storage uri is invalid")
-        }
-        const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-        const stats = await PCR({
-            downloadPath: puppeteerDir,
-        })
-        const browser: Browser = await stats.puppeteer.launch({
+    async launchBrowser(): Promise<void> {
+        if (this.browser) {
+            return
+        }
+        const stats = await this.ensureChromiumExists()
+        this.browser = await stats.puppeteer.launch({
             args: [
-                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+                "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
             ],
             executablePath: stats.executablePath,
         })
+        // (latest version of puppeteer does not add headless to user agent)
+        this.page = await this.browser?.newPage()
+    }

-        try {
-            const page = await browser.newPage()
+    async closeBrowser(): Promise<void> {
+        await this.browser?.close()
+        this.browser = undefined
+        this.page = undefined
+    }

+    // must make sure to call launchBrowser before and closeBrowser after using this
+    async urlToMarkdown(url: string): Promise<string> {
+        if (!this.browser || !this.page) {
+            throw new Error("Browser not initialized")
+        }
         /*
         - networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
         - domcontentloaded is when the basic DOM is loaded
         this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
+        https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
         */
-        await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-        // await this.waitTillHTMLRendered(page)
-        const content = await page.content()
+        await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+        const content = await this.page.content()

-        // Use Cheerio to parse and clean up the HTML
+        // use cheerio to parse and clean up the HTML
         const $ = cheerio.load(content)
-        $("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
+        $("script, style, nav, footer, header").remove()

-        // Convert cleaned HTML to Markdown
+        // convert cleaned HTML to markdown
         const turndownService = new TurndownService()
         const markdown = turndownService.turndown($.html())

         return markdown
-    } finally {
-        await browser.close()
-    }
-    }
-
-    // page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-    // https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-    /*
-    private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
-        const checkDurationMsecs = 500 // 1000
-        const maxChecks = timeout / checkDurationMsecs
-        let lastHTMLSize = 0
-        let checkCounts = 1
-        let countStableSizeIterations = 0
-        const minStableSizeIterations = 3
-
-        while (checkCounts++ <= maxChecks) {
-            let html = await page.content()
-            let currentHTMLSize = html.length
-
-            // let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-            console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-            if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-                countStableSizeIterations++
-            } else {
-                countStableSizeIterations = 0 //reset the counter
-            }
-
-            if (countStableSizeIterations >= minStableSizeIterations) {
-                console.log("Page rendered fully...")
-                break
-            }
-
-            lastHTMLSize = currentHTMLSize
-            await delay(checkDurationMsecs)
-        }
-    }
-    */
-}
-
-async function fileExists(path: string): Promise<boolean> {
-    try {
-        await fs.access(path)
-        return true
-    } catch {
-        return false
     }
 }
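The rewritten UrlScraper above replaces the old one-shot urlToMarkdown, which launched and closed a fresh Chromium per call, with an explicit lifecycle: launchBrowser() resolves Chromium once and opens a single browser and page, urlToMarkdown() can then be called repeatedly, and closeBrowser() releases both. A hedged usage sketch relying only on the methods shown in the diff (the scrapeDocs helper and urls batch are illustrative, not part of the commit):

import * as vscode from "vscode"
import { UrlScraper } from "./utils/UrlScraper"

async function scrapeDocs(context: vscode.ExtensionContext, urls: string[]): Promise<string[]> {
    const scraper = new UrlScraper(context)
    await scraper.launchBrowser() // one Chromium launch for the whole batch
    try {
        const pages: string[] = []
        for (const url of urls) {
            // each call reuses the scraper's single browser/page
            pages.push(await scraper.urlToMarkdown(url))
        }
        return pages
    } finally {
        // urlToMarkdown throws "Browser not initialized" if this ordering is violated
        await scraper.closeBrowser()
    }
}
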
@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
     }
 }

-export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
+export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
     const mentions: Set<string> = new Set()
     let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
         mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
         return match
     })

+    const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
+    let launchBrowserError: Error | undefined
+    if (urlMention) {
+        try {
+            await urlScraper.launchBrowser()
+        } catch (error) {
+            launchBrowserError = error
+            vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
+        }
+    }

     for (const mention of mentions) {
-        if (mention.startsWith("http") && urlScraper) {
+        if (mention.startsWith("http")) {
+            let result: string
+            if (launchBrowserError) {
+                result = `Error fetching content: ${launchBrowserError.message}`
+            } else {
             try {
                 const markdown = await urlScraper.urlToMarkdown(mention)
-                parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
+                result = markdown
             } catch (error) {
-                vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
-                parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
+                vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
+                result = `Error fetching content: ${error.message}`
             }
+            }
+            parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
         } else if (mention.startsWith("/")) {
             const mentionPath = mention.slice(1) // Remove the leading '/'
             try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
         }
     }

+    if (urlMention) {
+        try {
+            await urlScraper.closeBrowser()
+        } catch (error) {
+            console.error(`Error closing browser: ${error.message}`)
+        }
+    }
+
     return parsedText
 }
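The two parseMentions hunks above bracket the mention loop with a single launchBrowser()/closeBrowser() pair and stash a failed launch in launchBrowserError, so every URL mention still produces a well-formed <url_content> tag even when Chromium never starts. A minimal sketch of that setup-once, report-per-item pattern; setup/fetchOne/teardown are hypothetical stand-ins for launchBrowser/urlToMarkdown/closeBrowser:

async function setup(): Promise<void> {}
async function fetchOne(item: string): Promise<string> {
    return `content of ${item}`
}
async function teardown(): Promise<void> {}

async function withSharedBrowser(items: string[]): Promise<string[]> {
    let setupError: Error | undefined
    try {
        await setup()
    } catch (error) {
        setupError = error as Error
    }
    const results: string[] = []
    for (const item of items) {
        // a remembered setup failure is substituted per item, mirroring launchBrowserError
        results.push(setupError ? `Error fetching content: ${setupError.message}` : await fetchOne(item))
    }
    // close even if individual fetches failed, mirroring the closeBrowser block
    await teardown().catch((err) => console.error(err))
    return results
}
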
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise<string> {
     if (stats.isFile()) {
         const isBinary = await isBinaryFile(absPath).catch(() => false)
         if (isBinary) {
-            return "(Binary file)"
+            return "(Binary file, unable to display content)"
         }
         const content = await extractTextFromFile(absPath)
         return content