Use a single browser instance to scrape multiple sites

Saoud Rizwan
2024-09-19 12:44:11 -04:00
parent e75cab491a
commit 73f082bf98
4 changed files with 98 additions and 112 deletions

View File

@@ -27,6 +27,7 @@ import { truncateHalfConversation } from "./utils/context-management"
 import { extractTextFromFile } from "./utils/extract-text"
 import { regexSearchFiles } from "./utils/ripgrep"
 import { parseMentions } from "./utils/context-mentions"
+import { UrlScraper } from "./utils/UrlScraper"
 
 const SYSTEM_PROMPT =
 	async () => `You are Claude Dev, a highly skilled software developer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
@@ -250,6 +251,7 @@ export class ClaudeDev {
 	readonly taskId: string
 	private api: ApiHandler
 	private terminalManager: TerminalManager
+	private urlScraper: UrlScraper
 	private didEditFile: boolean = false
 	private customInstructions?: string
 	private alwaysAllowReadOnly: boolean
@@ -275,6 +277,7 @@ export class ClaudeDev {
 		this.providerRef = new WeakRef(provider)
 		this.api = buildApiHandler(apiConfiguration)
 		this.terminalManager = new TerminalManager()
+		this.urlScraper = new UrlScraper(provider.context)
 		this.customInstructions = customInstructions
 		this.alwaysAllowReadOnly = alwaysAllowReadOnly ?? false
@@ -675,6 +678,7 @@ export class ClaudeDev {
 	abortTask() {
 		this.abort = true // will stop any autonomously running promises
 		this.terminalManager.disposeAll()
+		this.urlScraper.closeBrowser()
 	}
 
 	async executeTool(toolName: ToolName, toolInput: any): Promise<[boolean, ToolResponse]> {
@@ -1643,14 +1647,14 @@ ${this.customInstructions.trim()}
 				if (block.type === "text") {
 					return {
 						...block,
-						text: await parseMentions(block.text, cwd, this.providerRef.deref()?.urlScraper),
+						text: await parseMentions(block.text, cwd, this.urlScraper),
 					}
 				} else if (block.type === "tool_result") {
 					const isUserMessage = (text: string) => text.includes("<feedback>") || text.includes("<answer>")
 					if (typeof block.content === "string" && isUserMessage(block.content)) {
 						return {
 							...block,
-							content: await parseMentions(block.content, cwd, this.providerRef.deref()?.urlScraper),
+							content: await parseMentions(block.content, cwd, this.urlScraper),
 						}
 					} else if (Array.isArray(block.content)) {
 						const parsedContent = await Promise.all(
@@ -1658,11 +1662,7 @@ ${this.customInstructions.trim()}
 							if (contentBlock.type === "text" && isUserMessage(contentBlock.text)) {
 								return {
 									...contentBlock,
-									text: await parseMentions(
-										contentBlock.text,
-										cwd,
-										this.providerRef.deref()?.urlScraper
-									),
+									text: await parseMentions(contentBlock.text, cwd, this.urlScraper),
 								}
 							}
 							return contentBlock

View File

@@ -54,14 +54,12 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
 	private view?: vscode.WebviewView | vscode.WebviewPanel
 	private claudeDev?: ClaudeDev
 	private workspaceTracker?: WorkspaceTracker
-	urlScraper?: UrlScraper
 	private latestAnnouncementId = "sep-14-2024" // update to some unique identifier when we add a new announcement
 
 	constructor(readonly context: vscode.ExtensionContext, private readonly outputChannel: vscode.OutputChannel) {
 		this.outputChannel.appendLine("ClaudeDevProvider instantiated")
 		ClaudeDevProvider.activeInstances.add(this)
 		this.workspaceTracker = new WorkspaceTracker(this)
-		this.urlScraper = new UrlScraper(this.context)
 		this.revertKodu()
 	}
@@ -107,7 +105,6 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
 		}
 		this.workspaceTracker?.dispose()
 		this.workspaceTracker = undefined
-		this.urlScraper = undefined
 		this.outputChannel.appendLine("Disposed all disposables")
 		ClaudeDevProvider.activeInstances.delete(this)
 	}

View File

@@ -1,129 +1,93 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"
 
-const PUPPETEER_DIR = "puppeteer"
+interface PCRStats {
+	puppeteer: { launch: typeof launch }
+	executablePath: string
+}
 
 export class UrlScraper {
 	private context: vscode.ExtensionContext
+	private browser?: Browser
+	private page?: Page
 
 	constructor(context: vscode.ExtensionContext) {
 		this.context = context
 	}
 
-	private async ensureChromiumExists(): Promise<void> {
+	private async ensureChromiumExists(): Promise<PCRStats> {
 		const globalStoragePath = this.context?.globalStorageUri?.fsPath
 		if (!globalStoragePath) {
 			throw new Error("Global storage uri is invalid")
 		}
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-		if (!(await fileExists(puppeteerDir))) {
+		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+		const dirExists = await fs
+			.access(puppeteerDir)
+			.then(() => true)
+			.catch(() => false)
+		if (!dirExists) {
 			await fs.mkdir(puppeteerDir, { recursive: true })
 		}
-		const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
-		if (!(await fileExists(chromiumPath))) {
-			// If Chromium doesn't exist, download it
-			await PCR({
-				downloadPath: puppeteerDir,
-			})
-		}
+		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+		// if it does exist it will return the path to existing chromium
+		const stats: PCRStats = await PCR({
+			downloadPath: puppeteerDir,
+		})
+		return stats
 	}
 
-	async urlToMarkdown(url: string): Promise<string> {
-		await this.ensureChromiumExists()
-
-		const globalStoragePath = this.context?.globalStorageUri?.fsPath
-		if (!globalStoragePath) {
-			throw new Error("Global storage uri is invalid")
+	async launchBrowser(): Promise<void> {
+		if (this.browser) {
+			return
 		}
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		const stats = await PCR({
-			downloadPath: puppeteerDir,
-		})
-		const browser: Browser = await stats.puppeteer.launch({
+		const stats = await this.ensureChromiumExists()
+		this.browser = await stats.puppeteer.launch({
 			args: [
-				"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
 			],
 			executablePath: stats.executablePath,
 		})
+		// (latest version of puppeteer does not add headless to user agent)
+		this.page = await this.browser?.newPage()
+	}
 
-		try {
-			const page = await browser.newPage()
+	async closeBrowser(): Promise<void> {
+		await this.browser?.close()
+		this.browser = undefined
+		this.page = undefined
+	}
 
+	// must make sure to call launchBrowser before and closeBrowser after using this
+	async urlToMarkdown(url: string): Promise<string> {
+		if (!this.browser || !this.page) {
+			throw new Error("Browser not initialized")
+		}
 		/*
 		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
 		- domcontentloaded is when the basic DOM is loaded
 		this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
+		https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
 		*/
-		await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-		// await this.waitTillHTMLRendered(page)
-		const content = await page.content()
-		// Use Cheerio to parse and clean up the HTML
+		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+		const content = await this.page.content()
+		// use cheerio to parse and clean up the HTML
 		const $ = cheerio.load(content)
-		$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
-		// Convert cleaned HTML to Markdown
+		$("script, style, nav, footer, header").remove()
+		// convert cleaned HTML to markdown
 		const turndownService = new TurndownService()
 		const markdown = turndownService.turndown($.html())
 		return markdown
-		} finally {
-			await browser.close()
-		}
-	}
-
-	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-	/*
-	private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
-		const checkDurationMsecs = 500 // 1000
-		const maxChecks = timeout / checkDurationMsecs
-		let lastHTMLSize = 0
-		let checkCounts = 1
-		let countStableSizeIterations = 0
-		const minStableSizeIterations = 3
-
-		while (checkCounts++ <= maxChecks) {
-			let html = await page.content()
-			let currentHTMLSize = html.length
-
-			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-				countStableSizeIterations++
-			} else {
-				countStableSizeIterations = 0 //reset the counter
-			}
-
-			if (countStableSizeIterations >= minStableSizeIterations) {
-				console.log("Page rendered fully...")
-				break
-			}
-
-			lastHTMLSize = currentHTMLSize
-			await delay(checkDurationMsecs)
-		}
-	}
-	*/
-}
-
-async function fileExists(path: string): Promise<boolean> {
-	try {
-		await fs.access(path)
-		return true
-	} catch {
-		return false
 	}
 }
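The rewritten class above makes the scraper stateful: launchBrowser() resolves Chromium once (downloading it into the extension's global storage on first use via puppeteer-chromium-resolver) and opens a single page, every subsequent urlToMarkdown() call reuses that page, and closeBrowser() tears it down. A minimal sketch of the intended call pattern, where the URLs are placeholders and context is assumed to be the extension's vscode.ExtensionContext:

	const scraper = new UrlScraper(context)
	await scraper.launchBrowser() // downloads Chromium on first run, reuses the cached copy afterwards
	try {
		// both pages are fetched through the same browser instance
		const docs = await scraper.urlToMarkdown("https://example.com/docs")
		const api = await scraper.urlToMarkdown("https://example.com/api")
	} finally {
		await scraper.closeBrowser() // urlToMarkdown throws "Browser not initialized" after this
	}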

View File

@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
 	}
 }
 
-export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
+export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
 	const mentions: Set<string> = new Set()
 	let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
 		mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		return match
 	})
 
+	const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
+
+	let launchBrowserError: Error | undefined
+	if (urlMention) {
+		try {
+			await urlScraper.launchBrowser()
+		} catch (error) {
+			launchBrowserError = error
+			vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
+		}
+	}
+
 	for (const mention of mentions) {
-		if (mention.startsWith("http") && urlScraper) {
+		if (mention.startsWith("http")) {
+			let result: string
+			if (launchBrowserError) {
+				result = `Error fetching content: ${launchBrowserError.message}`
+			} else {
 				try {
 					const markdown = await urlScraper.urlToMarkdown(mention)
-					parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
+					result = markdown
 				} catch (error) {
-					vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
-					parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
+					vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
+					result = `Error fetching content: ${error.message}`
 				}
+			}
+			parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
 		} else if (mention.startsWith("/")) {
 			const mentionPath = mention.slice(1) // Remove the leading '/'
 			try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		}
 	}
 
+	if (urlMention) {
+		try {
+			await urlScraper.closeBrowser()
+		} catch (error) {
+			console.error(`Error closing browser: ${error.message}`)
+		}
+	}
+
 	return parsedText
 }
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 	if (stats.isFile()) {
 		const isBinary = await isBinaryFile(absPath).catch(() => false)
 		if (isBinary) {
-			return "(Binary file)"
+			return "(Binary file, unable to display content)"
 		}
 		const content = await extractTextFromFile(absPath)
 		return content
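With these changes, urlScraper becomes a required parameter and parseMentions manages the browser lifecycle itself: one launchBrowser() before the mention loop covers every URL mention, and one closeBrowser() runs after the loop, rather than launching and closing a browser per URL. A rough sketch of the resulting call site, where names other than UrlScraper and parseMentions are illustrative:

	const scraper = new UrlScraper(context) // context: vscode.ExtensionContext
	// a single browser launch serves every http(s) mention in userText
	const expanded = await parseMentions(userText, cwd, scraper)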