From 73f082bf980112380ac4344a832c16b0405b054a Mon Sep 17 00:00:00 2001
From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com>
Date: Thu, 19 Sep 2024 12:44:11 -0400
Subject: [PATCH] Use a single browser instance to scrape multiple sites

---
 src/ClaudeDev.ts                   |  14 +--
 src/providers/ClaudeDevProvider.ts |   3 -
 src/utils/UrlScraper.ts            | 150 +++++++++++------------------
 src/utils/context-mentions.ts      |  43 +++++++--
 4 files changed, 98 insertions(+), 112 deletions(-)

diff --git a/src/ClaudeDev.ts b/src/ClaudeDev.ts
index 7bdc952..d9e7473 100644
--- a/src/ClaudeDev.ts
+++ b/src/ClaudeDev.ts
@@ -27,6 +27,7 @@
 import { truncateHalfConversation } from "./utils/context-management"
 import { extractTextFromFile } from "./utils/extract-text"
 import { regexSearchFiles } from "./utils/ripgrep"
 import { parseMentions } from "./utils/context-mentions"
+import { UrlScraper } from "./utils/UrlScraper"
 
 const SYSTEM_PROMPT = async () => `You are Claude Dev, a highly skilled software developer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
@@ -250,6 +251,7 @@ export class ClaudeDev {
 	readonly taskId: string
 	private api: ApiHandler
 	private terminalManager: TerminalManager
+	private urlScraper: UrlScraper
 	private didEditFile: boolean = false
 	private customInstructions?: string
 	private alwaysAllowReadOnly: boolean
@@ -275,6 +277,7 @@ export class ClaudeDev {
 		this.providerRef = new WeakRef(provider)
 		this.api = buildApiHandler(apiConfiguration)
 		this.terminalManager = new TerminalManager()
+		this.urlScraper = new UrlScraper(provider.context)
 		this.customInstructions = customInstructions
 		this.alwaysAllowReadOnly = alwaysAllowReadOnly ?? false
 
@@ -675,6 +678,7 @@ export class ClaudeDev {
 	abortTask() {
 		this.abort = true // will stop any autonomously running promises
 		this.terminalManager.disposeAll()
+		this.urlScraper.closeBrowser()
 	}
 
 	async executeTool(toolName: ToolName, toolInput: any): Promise<[boolean, ToolResponse]> {
@@ -1643,14 +1647,14 @@ ${this.customInstructions.trim()}
 				if (block.type === "text") {
 					return {
 						...block,
-						text: await parseMentions(block.text, cwd, this.providerRef.deref()?.urlScraper),
+						text: await parseMentions(block.text, cwd, this.urlScraper),
 					}
 				} else if (block.type === "tool_result") {
 					const isUserMessage = (text: string) => text.includes("<feedback>") || text.includes("<answer>")
 					if (typeof block.content === "string" && isUserMessage(block.content)) {
 						return {
 							...block,
-							content: await parseMentions(block.content, cwd, this.providerRef.deref()?.urlScraper),
+							content: await parseMentions(block.content, cwd, this.urlScraper),
 						}
 					} else if (Array.isArray(block.content)) {
 						const parsedContent = await Promise.all(
@@ -1658,11 +1662,7 @@ ${this.customInstructions.trim()}
 								if (contentBlock.type === "text" && isUserMessage(contentBlock.text)) {
 									return {
 										...contentBlock,
-										text: await parseMentions(
-											contentBlock.text,
-											cwd,
-											this.providerRef.deref()?.urlScraper
-										),
+										text: await parseMentions(contentBlock.text, cwd, this.urlScraper),
 									}
 								}
 								return contentBlock
diff --git a/src/providers/ClaudeDevProvider.ts b/src/providers/ClaudeDevProvider.ts
index 7cc4ab4..3f8e1a0 100644
--- a/src/providers/ClaudeDevProvider.ts
+++ b/src/providers/ClaudeDevProvider.ts
@@ -54,14 +54,12 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
 	private view?: vscode.WebviewView | vscode.WebviewPanel
 	private claudeDev?: ClaudeDev
 	private workspaceTracker?: WorkspaceTracker
-	urlScraper?: UrlScraper
 	private latestAnnouncementId = "sep-14-2024" // update to some unique identifier when we add a new announcement
 
 	constructor(readonly context: vscode.ExtensionContext, private readonly outputChannel: vscode.OutputChannel) {
 		this.outputChannel.appendLine("ClaudeDevProvider instantiated")
 		ClaudeDevProvider.activeInstances.add(this)
 		this.workspaceTracker = new WorkspaceTracker(this)
-		this.urlScraper = new UrlScraper(this.context)
 		this.revertKodu()
 	}
 
@@ -107,7 +105,6 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
 		}
 		this.workspaceTracker?.dispose()
 		this.workspaceTracker = undefined
-		this.urlScraper = undefined
 		this.outputChannel.appendLine("Disposed all disposables")
 		ClaudeDevProvider.activeInstances.delete(this)
 	}
diff --git a/src/utils/UrlScraper.ts b/src/utils/UrlScraper.ts
index 51ae6be..50aa8b5 100644
--- a/src/utils/UrlScraper.ts
+++ b/src/utils/UrlScraper.ts
@@ -1,129 +1,93 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"
 
-const PUPPETEER_DIR = "puppeteer"
+interface PCRStats {
+	puppeteer: { launch: typeof launch }
+	executablePath: string
+}
 
 export class UrlScraper {
 	private context: vscode.ExtensionContext
+	private browser?: Browser
+	private page?: Page
 
 	constructor(context: vscode.ExtensionContext) {
 		this.context = context
 	}
 
-	private async ensureChromiumExists(): Promise<void> {
+	private async ensureChromiumExists(): Promise<PCRStats> {
 		const globalStoragePath = this.context?.globalStorageUri?.fsPath
 		if (!globalStoragePath) {
 			throw new Error("Global storage uri is invalid")
 		}
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		if (!(await fileExists(puppeteerDir))) {
+		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+		const dirExists = await fs
+			.access(puppeteerDir)
+			.then(() => true)
+			.catch(() => false)
+		if (!dirExists) {
 			await fs.mkdir(puppeteerDir, { recursive: true })
 		}
-		const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
-
-		if (!(await fileExists(chromiumPath))) {
-			// If Chromium doesn't exist, download it
-			await PCR({
-				downloadPath: puppeteerDir,
-			})
-		}
-	}
-
-	async urlToMarkdown(url: string): Promise<string> {
-		await this.ensureChromiumExists()
-
-		const globalStoragePath = this.context?.globalStorageUri?.fsPath
-		if (!globalStoragePath) {
-			throw new Error("Global storage uri is invalid")
-		}
-		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
-
-		const stats = await PCR({
+		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+		// if it does exist it will return the path to existing chromium
+		const stats: PCRStats = await PCR({
 			downloadPath: puppeteerDir,
 		})
-		const browser: Browser = await stats.puppeteer.launch({
+
+		return stats
+	}
+
+	async launchBrowser(): Promise<void> {
+		if (this.browser) {
+			return
+		}
+		const stats = await this.ensureChromiumExists()
+		this.browser = await stats.puppeteer.launch({
 			args: [
-				"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
 			],
 			executablePath: stats.executablePath,
 		})
-
-		try {
-			const page = await browser.newPage()
-
-			/*
-			- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
-			- domcontentloaded is when the basic DOM is loaded
-			this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
-			*/
-			await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-			// await this.waitTillHTMLRendered(page)
-			const content = await page.content()
-
-			// Use Cheerio to parse and clean up the HTML
-			const $ = cheerio.load(content)
-			$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
-
-			// Convert cleaned HTML to Markdown
-			const turndownService = new TurndownService()
-			const markdown = turndownService.turndown($.html())
-
-			return markdown
-		} finally {
-			await browser.close()
-		}
+		// (latest version of puppeteer does not add headless to user agent)
+		this.page = await this.browser?.newPage()
 	}
 
-	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-	/*
-	private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
-		const checkDurationMsecs = 500 // 1000
-		const maxChecks = timeout / checkDurationMsecs
-		let lastHTMLSize = 0
-		let checkCounts = 1
-		let countStableSizeIterations = 0
-		const minStableSizeIterations = 3
-
-		while (checkCounts++ <= maxChecks) {
-			let html = await page.content()
-			let currentHTMLSize = html.length
-
-			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-				countStableSizeIterations++
-			} else {
-				countStableSizeIterations = 0 //reset the counter
-			}
-
-			if (countStableSizeIterations >= minStableSizeIterations) {
-				console.log("Page rendered fully...")
-				break
-			}
-
-			lastHTMLSize = currentHTMLSize
-			await delay(checkDurationMsecs)
-		}
+	async closeBrowser(): Promise<void> {
+		await this.browser?.close()
+		this.browser = undefined
+		this.page = undefined
 	}
-	*/
-}
 
-async function fileExists(path: string): Promise<boolean> {
-	try {
-		await fs.access(path)
-		return true
-	} catch {
-		return false
+	// must make sure to call launchBrowser before and closeBrowser after using this
+	async urlToMarkdown(url: string): Promise<string> {
+		if (!this.browser || !this.page) {
+			throw new Error("Browser not initialized")
+		}
+		/*
+		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
+		- domcontentloaded is when the basic DOM is loaded
+		this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
+		https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+		*/
+		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+		const content = await this.page.content()
+
+		// use cheerio to parse and clean up the HTML
+		const $ = cheerio.load(content)
+		$("script, style, nav, footer, header").remove()
+
+		// convert cleaned HTML to markdown
+		const turndownService = new TurndownService()
+		const markdown = turndownService.turndown($.html())
+
+		return markdown
 	}
 }
diff --git a/src/utils/context-mentions.ts b/src/utils/context-mentions.ts
index bd9049a..a8f9af7 100644
--- a/src/utils/context-mentions.ts
+++ b/src/utils/context-mentions.ts
@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
 	}
 }
 
-export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
+export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
 	const mentions: Set<string> = new Set()
 	let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
 		mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		return match
 	})
 
+	const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
+	let launchBrowserError: Error | undefined
+	if (urlMention) {
+		try {
+			await urlScraper.launchBrowser()
+		} catch (error) {
+			launchBrowserError = error
+			vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
+		}
+	}
+
 	for (const mention of mentions) {
-		if (mention.startsWith("http") && urlScraper) {
-			try {
-				const markdown = await urlScraper.urlToMarkdown(mention)
-				parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
-			} catch (error) {
-				vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
-				parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
+		if (mention.startsWith("http")) {
+			let result: string
+			if (launchBrowserError) {
+				result = `Error fetching content: ${launchBrowserError.message}`
+			} else {
+				try {
+					const markdown = await urlScraper.urlToMarkdown(mention)
+					result = markdown
+				} catch (error) {
+					vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
+					result = `Error fetching content: ${error.message}`
+				}
 			}
+			parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
 		} else if (mention.startsWith("/")) {
 			const mentionPath = mention.slice(1) // Remove the leading '/'
 			try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
 		}
 	}
 
+	if (urlMention) {
+		try {
+			await urlScraper.closeBrowser()
+		} catch (error) {
+			console.error(`Error closing browser: ${error.message}`)
+		}
+	}
+
 	return parsedText
 }
 
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 	if (stats.isFile()) {
 		const isBinary = await isBinaryFile(absPath).catch(() => false)
 		if (isBinary) {
-			return "(Binary file)"
+			return "(Binary file, unable to display content)"
 		}
 		const content = await extractTextFromFile(absPath)
 		return content
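
A minimal usage sketch of how the new single-browser flow is intended to be driven: launch the shared Chromium once, convert several URLs, then close it. Only the UrlScraper API itself comes from the diff above; the scrapeAll helper, the relative import path, and the assumption that a vscode.ExtensionContext is at hand are illustrative, not part of this patch.

// illustrative helper, not part of this patch
import * as vscode from "vscode"
import { UrlScraper } from "./utils/UrlScraper"

async function scrapeAll(context: vscode.ExtensionContext, urls: string[]): Promise<string[]> {
	const scraper = new UrlScraper(context)
	// one launch per batch: downloads Chromium into globalStorage on first run, reuses it afterwards
	await scraper.launchBrowser()
	try {
		const results: string[] = []
		for (const url of urls) {
			// every call reuses the single Browser/Page created by launchBrowser()
			results.push(await scraper.urlToMarkdown(url))
		}
		return results
	} finally {
		// launchBrowser/closeBrowser must bracket urlToMarkdown, per the comment on the method
		await scraper.closeBrowser()
	}
}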