Use a single browser instance to scrape multiple sites

Saoud Rizwan
2024-09-19 12:44:11 -04:00
parent e75cab491a
commit 73f082bf98
4 changed files with 98 additions and 112 deletions
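In short: UrlScraper now owns a long-lived Puppeteer browser (launchBrowser/closeBrowser) instead of launching and tearing down a browser inside every urlToMarkdown call, ClaudeDev holds its own UrlScraper rather than reaching through the provider, and parseMentions launches the browser once before processing mentions and closes it once afterwards. A minimal sketch of the resulting lifecycle, assuming the UrlScraper API shown in the diffs below (scrapeAll, context, and urls are placeholder names, and the try/finally is a simplification of the error handling parseMentions actually does):

import * as vscode from "vscode"
import { UrlScraper } from "./utils/UrlScraper"

// hypothetical helper illustrating the new call pattern; not part of the commit
async function scrapeAll(context: vscode.ExtensionContext, urls: string[]): Promise<string[]> {
	const scraper = new UrlScraper(context)
	await scraper.launchBrowser() // resolves/downloads Chromium and opens a single browser + page
	try {
		const results: string[] = []
		for (const url of urls) {
			// every URL reuses the same browser instance instead of paying a fresh launch per call
			results.push(await scraper.urlToMarkdown(url))
		}
		return results
	} finally {
		await scraper.closeBrowser() // tear the one browser down once all URLs are scraped
	}
}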

View File

@@ -27,6 +27,7 @@ import { truncateHalfConversation } from "./utils/context-management"
import { extractTextFromFile } from "./utils/extract-text"
import { regexSearchFiles } from "./utils/ripgrep"
import { parseMentions } from "./utils/context-mentions"
import { UrlScraper } from "./utils/UrlScraper"
const SYSTEM_PROMPT =
async () => `You are Claude Dev, a highly skilled software developer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
@@ -250,6 +251,7 @@ export class ClaudeDev {
readonly taskId: string
private api: ApiHandler
private terminalManager: TerminalManager
private urlScraper: UrlScraper
private didEditFile: boolean = false
private customInstructions?: string
private alwaysAllowReadOnly: boolean
@@ -275,6 +277,7 @@ export class ClaudeDev {
this.providerRef = new WeakRef(provider)
this.api = buildApiHandler(apiConfiguration)
this.terminalManager = new TerminalManager()
this.urlScraper = new UrlScraper(provider.context)
this.customInstructions = customInstructions
this.alwaysAllowReadOnly = alwaysAllowReadOnly ?? false
@@ -675,6 +678,7 @@ export class ClaudeDev {
abortTask() {
this.abort = true // will stop any autonomously running promises
this.terminalManager.disposeAll()
this.urlScraper.closeBrowser()
}
async executeTool(toolName: ToolName, toolInput: any): Promise<[boolean, ToolResponse]> {
@@ -1643,14 +1647,14 @@ ${this.customInstructions.trim()}
if (block.type === "text") {
return {
...block,
text: await parseMentions(block.text, cwd, this.providerRef.deref()?.urlScraper),
text: await parseMentions(block.text, cwd, this.urlScraper),
}
} else if (block.type === "tool_result") {
const isUserMessage = (text: string) => text.includes("<feedback>") || text.includes("<answer>")
if (typeof block.content === "string" && isUserMessage(block.content)) {
return {
...block,
content: await parseMentions(block.content, cwd, this.providerRef.deref()?.urlScraper),
content: await parseMentions(block.content, cwd, this.urlScraper),
}
} else if (Array.isArray(block.content)) {
const parsedContent = await Promise.all(
@@ -1658,11 +1662,7 @@ ${this.customInstructions.trim()}
if (contentBlock.type === "text" && isUserMessage(contentBlock.text)) {
return {
...contentBlock,
text: await parseMentions(
contentBlock.text,
cwd,
this.providerRef.deref()?.urlScraper
),
text: await parseMentions(contentBlock.text, cwd, this.urlScraper),
}
}
return contentBlock

View File

@@ -54,14 +54,12 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
private view?: vscode.WebviewView | vscode.WebviewPanel
private claudeDev?: ClaudeDev
private workspaceTracker?: WorkspaceTracker
urlScraper?: UrlScraper
private latestAnnouncementId = "sep-14-2024" // update to some unique identifier when we add a new announcement
constructor(readonly context: vscode.ExtensionContext, private readonly outputChannel: vscode.OutputChannel) {
this.outputChannel.appendLine("ClaudeDevProvider instantiated")
ClaudeDevProvider.activeInstances.add(this)
this.workspaceTracker = new WorkspaceTracker(this)
this.urlScraper = new UrlScraper(this.context)
this.revertKodu()
}
@@ -107,7 +105,6 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
}
this.workspaceTracker?.dispose()
this.workspaceTracker = undefined
this.urlScraper = undefined
this.outputChannel.appendLine("Disposed all disposables")
ClaudeDevProvider.activeInstances.delete(this)
}

View File

@@ -1,129 +1,93 @@
import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Browser } from "puppeteer-core"
import { Browser, Page, launch } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
const PUPPETEER_DIR = "puppeteer"
interface PCRStats {
puppeteer: { launch: typeof launch }
executablePath: string
}
export class UrlScraper {
private context: vscode.ExtensionContext
private browser?: Browser
private page?: Page
constructor(context: vscode.ExtensionContext) {
this.context = context
}
private async ensureChromiumExists(): Promise<void> {
private async ensureChromiumExists(): Promise<PCRStats> {
const globalStoragePath = this.context?.globalStorageUri?.fsPath
if (!globalStoragePath) {
throw new Error("Global storage uri is invalid")
}
const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
if (!(await fileExists(puppeteerDir))) {
const puppeteerDir = path.join(globalStoragePath, "puppeteer")
const dirExists = await fs
.access(puppeteerDir)
.then(() => true)
.catch(() => false)
if (!dirExists) {
await fs.mkdir(puppeteerDir, { recursive: true })
}
const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
if (!(await fileExists(chromiumPath))) {
// If Chromium doesn't exist, download it
await PCR({
downloadPath: puppeteerDir,
})
}
}
async urlToMarkdown(url: string): Promise<string> {
await this.ensureChromiumExists()
const globalStoragePath = this.context?.globalStorageUri?.fsPath
if (!globalStoragePath) {
throw new Error("Global storage uri is invalid")
}
const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
const stats = await PCR({
// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
// if it does exist it will return the path to existing chromium
const stats: PCRStats = await PCR({
downloadPath: puppeteerDir,
})
const browser: Browser = await stats.puppeteer.launch({
return stats
}
async launchBrowser(): Promise<void> {
if (this.browser) {
return
}
const stats = await this.ensureChromiumExists()
this.browser = await stats.puppeteer.launch({
args: [
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
],
executablePath: stats.executablePath,
})
try {
const page = await browser.newPage()
/*
- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
- domcontentloaded is when the basic DOM is loaded
this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
*/
await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
// await this.waitTillHTMLRendered(page)
const content = await page.content()
// Use Cheerio to parse and clean up the HTML
const $ = cheerio.load(content)
$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
// Convert cleaned HTML to Markdown
const turndownService = new TurndownService()
const markdown = turndownService.turndown($.html())
return markdown
} finally {
await browser.close()
}
// (latest version of puppeteer does not add headless to user agent)
this.page = await this.browser?.newPage()
}
// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
/*
private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
const checkDurationMsecs = 500 // 1000
const maxChecks = timeout / checkDurationMsecs
let lastHTMLSize = 0
let checkCounts = 1
let countStableSizeIterations = 0
const minStableSizeIterations = 3
while (checkCounts++ <= maxChecks) {
let html = await page.content()
let currentHTMLSize = html.length
// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
countStableSizeIterations++
} else {
countStableSizeIterations = 0 //reset the counter
}
if (countStableSizeIterations >= minStableSizeIterations) {
console.log("Page rendered fully...")
break
}
lastHTMLSize = currentHTMLSize
await delay(checkDurationMsecs)
}
async closeBrowser(): Promise<void> {
await this.browser?.close()
this.browser = undefined
this.page = undefined
}
*/
}
async function fileExists(path: string): Promise<boolean> {
try {
await fs.access(path)
return true
} catch {
return false
// must make sure to call launchBrowser before and closeBrowser after using this
async urlToMarkdown(url: string): Promise<string> {
if (!this.browser || !this.page) {
throw new Error("Browser not initialized")
}
/*
- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
- domcontentloaded is when the basic DOM is loaded
this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
*/
await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
const content = await this.page.content()
// use cheerio to parse and clean up the HTML
const $ = cheerio.load(content)
$("script, style, nav, footer, header").remove()
// convert cleaned HTML to markdown
const turndownService = new TurndownService()
const markdown = turndownService.turndown($.html())
return markdown
}
}
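For context, ensureChromiumExists leans on puppeteer-chromium-resolver: a single PCR({ downloadPath }) call downloads Chromium into that directory (under .chromium-browser-snapshots) if it is missing, otherwise it resolves the existing install, and it returns a launch-ready puppeteer along with the executable path. A standalone sketch of that resolution step, assuming the same PCRStats shape as above (resolveAndLaunch is a hypothetical helper name and downloadPath is supplied by the caller):

// @ts-ignore: puppeteer-chromium-resolver ships no type definitions
import PCR from "puppeteer-chromium-resolver"
import type { Browser } from "puppeteer-core"

// hypothetical helper, not part of the commit
async function resolveAndLaunch(downloadPath: string): Promise<Browser> {
	// downloads Chromium on the first run; later runs just resolve the existing install
	const stats = await PCR({ downloadPath })
	return stats.puppeteer.launch({ executablePath: stats.executablePath })
}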

View File

@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
}
}
export async function parseMentions(text: string, cwd: string, urlScraper?: UrlScraper): Promise<string> {
export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
const mentions: Set<string> = new Set()
let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
mentions.add(mention)
@@ -48,15 +48,32 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
return match
})
const urlMention = Array.from(mentions).find((mention) => mention.startsWith("http"))
let launchBrowserError: Error | undefined
if (urlMention) {
try {
await urlScraper.launchBrowser()
} catch (error) {
launchBrowserError = error
vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
}
}
for (const mention of mentions) {
if (mention.startsWith("http") && urlScraper) {
try {
const markdown = await urlScraper.urlToMarkdown(mention)
parsedText += `\n\n<url_content url="${mention}">\n${markdown}\n</url_content>`
} catch (error) {
vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${JSON.stringify(error)}`)
parsedText += `\n\n<url_content url="${mention}">\nError fetching content: ${error.message}\n</url_content>`
if (mention.startsWith("http")) {
let result: string
if (launchBrowserError) {
result = `Error fetching content: ${launchBrowserError.message}`
} else {
try {
const markdown = await urlScraper.urlToMarkdown(mention)
result = markdown
} catch (error) {
vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
result = `Error fetching content: ${error.message}`
}
}
parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
} else if (mention.startsWith("/")) {
const mentionPath = mention.slice(1) // Remove the leading '/'
try {
@@ -83,6 +100,14 @@ export async function parseMentions(text: string, cwd: string, urlScraper?: UrlS
}
}
if (urlMention) {
try {
await urlScraper.closeBrowser()
} catch (error) {
console.error(`Error closing browser: ${error.message}`)
}
}
return parsedText
}
@@ -95,7 +120,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
if (stats.isFile()) {
const isBinary = await isBinaryFile(absPath).catch(() => false)
if (isBinary) {
return "(Binary file)"
return "(Binary file, unable to display content)"
}
const content = await extractTextFromFile(absPath)
return content
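Putting the context-mentions changes together, the URL path through parseMentions now reads roughly as follows (condensed from the diff above; urlScraper, mentions, and parsedText come from the surrounding function, and the .catch form is a simplification of the try/catch plus showErrorMessage in the real code):

const urlMention = Array.from(mentions).find((m) => m.startsWith("http"))
let launchBrowserError: Error | undefined
if (urlMention) {
	// launch one browser up front for all URL mentions; remember the failure instead of aborting
	await urlScraper.launchBrowser().catch((error) => (launchBrowserError = error))
}
for (const mention of mentions) {
	if (mention.startsWith("http")) {
		const result = launchBrowserError
			? `Error fetching content: ${launchBrowserError.message}`
			: await urlScraper.urlToMarkdown(mention).catch((error) => `Error fetching content: ${error.message}`)
		parsedText += `\n\n<url_content url="${mention}">\n${result}\n</url_content>`
	}
}
if (urlMention) {
	await urlScraper.closeBrowser().catch(console.error) // close the shared browser once, after every mention is handled
}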