Mirror of https://github.com/pacnpal/Roo-Code.git, synced 2025-12-20 04:11:10 -05:00
Add URL scraping with puppeteer
1143  package-lock.json  generated
File diff suppressed because it is too large.
package.json
@@ -151,8 +151,10 @@
 		"@google/generative-ai": "^0.18.0",
 		"@types/clone-deep": "^4.0.4",
 		"@types/pdf-parse": "^1.1.4",
+		"@types/turndown": "^5.0.5",
 		"@vscode/codicons": "^0.0.36",
 		"axios": "^1.7.4",
+		"cheerio": "^1.0.0",
 		"clone-deep": "^4.0.1",
 		"default-shell": "^2.2.0",
 		"delay": "^6.0.0",
@@ -165,9 +167,12 @@
 		"os-name": "^6.0.0",
 		"p-wait-for": "^5.0.2",
 		"pdf-parse": "^1.1.1",
+		"puppeteer-chromium-resolver": "^23.0.0",
+		"puppeteer-core": "^23.4.0",
 		"serialize-error": "^11.0.3",
 		"strip-ansi": "^7.1.0",
 		"tree-sitter-wasms": "^0.1.11",
+		"turndown": "^7.2.0",
 		"web-tree-sitter": "^0.22.6"
 	}
 }
src/extension.ts
@@ -3,6 +3,7 @@
 import * as vscode from "vscode"
 import { ClaudeDevProvider } from "./providers/ClaudeDevProvider"
 import delay from "delay"
+import { UrlScraper } from "./utils/UrlScraper"

 /*
 Built using https://github.com/microsoft/vscode-webview-ui-toolkit
@@ -39,6 +40,11 @@ export function activate(context: vscode.ExtensionContext) {

 	const sidebarProvider = new ClaudeDevProvider(context, outputChannel)

+	// Installs chromium for puppeteer url scraping
+	UrlScraper.ensureChromiumExists(context).catch((error) => {
+		outputChannel.appendLine(`Error installing Chromium: ${JSON.stringify(error)}`)
+	})
+
 	context.subscriptions.push(
 		vscode.window.registerWebviewViewProvider(ClaudeDevProvider.sideBarId, sidebarProvider, {
 			webviewOptions: { retainContextWhenHidden: true },
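
Note: this commit only pre-installs Chromium during activation; nothing in extension.ts invokes the scraper yet. A minimal sketch of how a caller could wire it up, assuming a hypothetical "claude-dev.scrapeUrl" command (the command id and wiring below are illustrative, not part of this commit):

context.subscriptions.push(
	vscode.commands.registerCommand("claude-dev.scrapeUrl", async () => {
		// Hypothetical command: ask for a URL and open the scraped Markdown in a new editor.
		const url = await vscode.window.showInputBox({ prompt: "URL to scrape" })
		if (!url) {
			return
		}
		const markdown = await UrlScraper.urlToMarkdown(url)
		const doc = await vscode.workspace.openTextDocument({ content: markdown, language: "markdown" })
		await vscode.window.showTextDocument(doc)
	})
)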
120  src/utils/UrlScraper.ts  Normal file
@@ -0,0 +1,120 @@
+import * as vscode from "vscode"
+import * as fs from "fs/promises"
+import * as path from "path"
+import { Page } from "puppeteer-core"
+import * as cheerio from "cheerio"
+import TurndownService from "turndown"
+import delay from "delay"
+// @ts-ignore
+import PCR from "puppeteer-chromium-resolver"
+
+const PUPPETEER_DIR = "puppeteer"
+
+export class UrlScraper {
+	private static context?: vscode.ExtensionContext
+
+	static async ensureChromiumExists(context?: vscode.ExtensionContext): Promise<void> {
+		this.context = context
+		const globalStoragePath = context?.globalStorageUri?.fsPath
+		if (!globalStoragePath) {
+			throw new Error("Global storage uri is invalid")
+		}
+
+		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
+
+		if (!(await fileExists(puppeteerDir))) {
+			await fs.mkdir(puppeteerDir, { recursive: true })
+		}
+
+		const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
+
+		if (!(await fileExists(chromiumPath))) {
+			// If Chromium doesn't exist, download it
+			await PCR({
+				downloadPath: puppeteerDir,
+			})
+		}
+	}
+
+	static async urlToMarkdown(url: string): Promise<string> {
+		await this.ensureChromiumExists(this.context)
+
+		const globalStoragePath = this.context?.globalStorageUri?.fsPath
+		if (!globalStoragePath) {
+			throw new Error("Global storage uri is invalid")
+		}
+		const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
+
+		const stats = await PCR({
+			downloadPath: puppeteerDir,
+		})
+		const browser = await stats.puppeteer.launch({
+			args: [
+				"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+			],
+			executablePath: stats.executablePath,
+		})
+
+		try {
+			const page = await browser.newPage()
+			await page.goto(url, { timeout: 5_000, waitUntil: "load" })
+			await this.waitTillHTMLRendered(page)
+			const content = await page.content()
+
+			// Use Cheerio to parse and clean up the HTML
+			const $ = cheerio.load(content)
+			$("script, style, nav, footer").remove() // Remove unnecessary elements
+
+			// Convert cleaned HTML to Markdown
+			const turndownService = new TurndownService()
+			const markdown = turndownService.turndown($.html())
+
+			return markdown
+		} finally {
+			await browser.close()
+		}
+	}
+
+	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
+	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+	private static async waitTillHTMLRendered(page: Page, timeout = 10_000) {
+		const checkDurationMsecs = 1000
+		const maxChecks = timeout / checkDurationMsecs
+		let lastHTMLSize = 0
+		let checkCounts = 1
+		let countStableSizeIterations = 0
+		const minStableSizeIterations = 3
+
+		while (checkCounts++ <= maxChecks) {
+			let html = await page.content()
+			let currentHTMLSize = html.length
+
+			let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
+
+			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize, " body html size: ", bodyHTMLSize)
+
+			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
+				countStableSizeIterations++
+			} else {
+				countStableSizeIterations = 0 //reset the counter
+			}
+
+			if (countStableSizeIterations >= minStableSizeIterations) {
+				console.log("Page rendered fully..")
+				break
+			}
+
+			lastHTMLSize = currentHTMLSize
+			await delay(checkDurationMsecs)
+		}
+	}
+}
+
+async function fileExists(path: string): Promise<boolean> {
+	try {
+		await fs.access(path)
+		return true
+	} catch {
+		return false
+	}
+}
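
For reference, the HTML cleanup and Markdown conversion inside urlToMarkdown can be exercised on their own, without launching a browser. A minimal standalone sketch using the same cheerio and turndown calls on a hard-coded HTML string (the sample input is invented for illustration):

import * as cheerio from "cheerio"
import TurndownService from "turndown"

const html = `<html><body><nav>menu</nav><h1>Title</h1><p>Some <b>body</b> text.</p><script>track()</script></body></html>`

// Same cleanup step as UrlScraper: drop elements that only add noise to the Markdown output.
const $ = cheerio.load(html)
$("script, style, nav, footer").remove()

// Convert what remains to Markdown with default turndown options.
const markdown = new TurndownService().turndown($.html())
console.log(markdown) // prints the Markdown rendering of the cleaned-up page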