Add URL scraping with puppeteer

This commit is contained in:
Saoud Rizwan
2024-09-18 17:28:25 -04:00
parent 974222b75e
commit e3144996fb
4 changed files with 1229 additions and 45 deletions

120
src/utils/UrlScraper.ts Normal file
View File

@@ -0,0 +1,120 @@
import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Page } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
import delay from "delay"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
// Name of the subdirectory, under the extension's global storage path, that is used as the Chromium download location.
const PUPPETEER_DIR = "puppeteer"
/**
 * Loads web pages in a Chromium instance driven by puppeteer and converts the
 * rendered HTML to Markdown.
 *
 * Chromium is fetched on demand by puppeteer-chromium-resolver into a directory
 * under the extension's global storage path, so an extension context must have
 * been supplied through `ensureChromiumExists` before `urlToMarkdown` is used.
 */
export class UrlScraper {
	/** Extension context remembered from the most recent call that supplied one. */
	private static context?: vscode.ExtensionContext

	/**
	 * Remembers the extension context and makes sure a Chromium download exists
	 * in the puppeteer directory under global storage.
	 *
	 * @param context Extension context providing `globalStorageUri`. When omitted,
	 *   the previously remembered context (if any) is used instead of being discarded.
	 * @throws Error "Global storage uri is invalid" when no usable storage path is known.
	 */
	static async ensureChromiumExists(context?: vscode.ExtensionContext): Promise<void> {
		// Only replace the remembered context when one is actually given, so a bare
		// call cannot wipe out a context that an earlier call registered.
		if (context) {
			UrlScraper.context = context
		}
		const puppeteerDir = UrlScraper.getPuppeteerDir()
		if (!(await fileExists(puppeteerDir))) {
			await fs.mkdir(puppeteerDir, { recursive: true })
		}
		const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
		if (!(await fileExists(chromiumPath))) {
			// If Chromium doesn't exist, download it
			await PCR({
				downloadPath: puppeteerDir,
			})
		}
	}

	/**
	 * Opens `url` in Chromium, waits for the page to settle, strips
	 * script/style/nav/footer elements and returns the remainder as Markdown.
	 *
	 * @param url Address of the page to load.
	 * @returns Markdown produced from the cleaned page HTML.
	 * @throws Error "Global storage uri is invalid" when no extension context has been registered;
	 *   errors from navigation (5 second timeout) or the browser are propagated.
	 */
	static async urlToMarkdown(url: string): Promise<string> {
		await UrlScraper.ensureChromiumExists(UrlScraper.context)
		const stats = await PCR({
			downloadPath: UrlScraper.getPuppeteerDir(),
		})
		const browser = await stats.puppeteer.launch({
			args: [
				// Present a desktop Chrome user agent (presumably so sites serve their regular content).
				"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
			],
			executablePath: stats.executablePath,
		})
		try {
			const page = await browser.newPage()
			await page.goto(url, { timeout: 5_000, waitUntil: "load" })
			await UrlScraper.waitTillHTMLRendered(page)
			const content = await page.content()
			// Use Cheerio to parse and clean up the HTML
			const $ = cheerio.load(content)
			$("script, style, nav, footer").remove() // Remove unnecessary elements
			// Convert cleaned HTML to Markdown
			const turndownService = new TurndownService()
			return turndownService.turndown($.html())
		} finally {
			// Always shut the browser down, even when navigation or conversion fails.
			await browser.close()
		}
	}

	/**
	 * Resolves the Chromium download directory under the remembered context's
	 * global storage path.
	 *
	 * @throws Error "Global storage uri is invalid" when no storage path is available.
	 */
	private static getPuppeteerDir(): string {
		const globalStoragePath = UrlScraper.context?.globalStorageUri?.fsPath
		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
		}
		return path.join(globalStoragePath, PUPPETEER_DIR)
	}

	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
	/**
	 * Polls the page once per second until the length of its HTML has been
	 * unchanged for three consecutive checks, or until the check budget derived
	 * from `timeout` is used up.
	 *
	 * Only `page.content()` is inspected; no script is evaluated in the page, so
	 * documents without a `<body>` do not cause a failure here.
	 *
	 * @param page Page whose content is being observed.
	 * @param timeout Upper bound in milliseconds; divided by the 1000 ms interval to get the number of checks.
	 */
	private static async waitTillHTMLRendered(page: Page, timeout = 10_000): Promise<void> {
		const checkDurationMsecs = 1000
		const maxChecks = timeout / checkDurationMsecs
		const minStableSizeIterations = 3
		let lastHTMLSize = 0
		let countStableSizeIterations = 0
		for (let check = 1; check <= maxChecks; check++) {
			const currentHTMLSize = (await page.content()).length
			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
				countStableSizeIterations++
			} else {
				countStableSizeIterations = 0 // size changed (or first sample): start counting again
			}
			if (countStableSizeIterations >= minStableSizeIterations) {
				// Size has been stable long enough; treat the page as rendered.
				break
			}
			lastHTMLSize = currentHTMLSize
			await delay(checkDurationMsecs)
		}
	}
}
/**
 * Reports whether the given filesystem path can be accessed.
 *
 * @param path Path to probe with `fs.access`.
 * @returns `true` when `fs.access` succeeds, `false` when it rejects for any reason.
 */
async function fileExists(path: string): Promise<boolean> {
	// Map the outcome of the access probe straight to a boolean: fulfilled -> true, rejected -> false.
	return fs.access(path).then(
		() => true,
		() => false,
	)
}