This commit is contained in:
Saoud Rizwan
2024-09-19 16:49:31 -04:00
parent cfc2ee830e
commit 5fbb335bb6
5 changed files with 14 additions and 17 deletions

View File

@@ -0,0 +1,92 @@
import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Browser, Page, launch } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
/**
 * Shape of the value resolved by `PCR(...)`, limited to the members this file reads.
 */
interface PCRStats {
	/** Path of the Chromium binary; handed to `launch` as `executablePath`. */
	executablePath: string
	/** Puppeteer entry point bundled with the resolver; only `launch` is used here. */
	puppeteer: { launch: typeof launch }
}
/**
 * Loads a URL in a Chromium instance driven by puppeteer and converts the
 * rendered page to markdown.
 *
 * Usage contract: call `launchBrowser()` first, then `urlToMarkdown()` as
 * many times as needed, and finally `closeBrowser()`.
 */
export class UrlContentFetcher {
	private readonly context: vscode.ExtensionContext
	private browser?: Browser
	private page?: Page

	/**
	 * @param context Extension context; its `globalStorageUri` determines where Chromium is stored.
	 */
	constructor(context: vscode.ExtensionContext) {
		this.context = context
	}

	/**
	 * Makes sure a Chromium build is available under `<globalStorage>/puppeteer`.
	 *
	 * @returns The resolver stats (puppeteer entry point and Chromium executable path).
	 * @throws Error if the extension's global storage path is unavailable.
	 */
	private async ensureChromiumExists(): Promise<PCRStats> {
		const globalStoragePath = this.context?.globalStorageUri?.fsPath
		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
		}
		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
		// `recursive: true` makes mkdir a no-op when the directory already exists,
		// so no separate (and racy) existence check is needed.
		await fs.mkdir(puppeteerDir, { recursive: true })
		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
		// if it does exist it will return the path to existing chromium
		const stats: PCRStats = await PCR({
			downloadPath: puppeteerDir,
		})
		return stats
	}

	/**
	 * Launches Chromium and opens a single page for later `urlToMarkdown` calls.
	 * Does nothing if a browser is already held by this instance.
	 *
	 * If the page cannot be created, the freshly launched browser is closed and
	 * the instance is left uninitialized so that a later call can retry.
	 */
	async launchBrowser(): Promise<void> {
		if (this.browser) {
			return
		}
		const stats = await this.ensureChromiumExists()
		const browser = await stats.puppeteer.launch({
			args: [
				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
			],
			executablePath: stats.executablePath,
		})
		// (latest version of puppeteer does not add headless to user agent)
		try {
			const page = await browser.newPage()
			// Publish both handles together so the instance is never half-initialized.
			this.browser = browser
			this.page = page
		} catch (error) {
			// Best-effort cleanup: the newPage failure is the error worth reporting,
			// so a secondary failure while closing is deliberately ignored.
			await browser.close().catch(() => undefined)
			throw error
		}
	}

	/**
	 * Closes the browser (if any) and forgets the browser and page handles.
	 * The handles are cleared even when closing rejects; the rejection still propagates.
	 */
	async closeBrowser(): Promise<void> {
		try {
			await this.browser?.close()
		} finally {
			this.browser = undefined
			this.page = undefined
		}
	}

	/**
	 * Navigates the shared page to `url` and returns the page content as markdown.
	 * Must be called after `launchBrowser()`; call `closeBrowser()` when done.
	 *
	 * @param url Address to load.
	 * @returns Markdown produced from the page HTML after removing
	 *          `script`, `style`, `nav`, `footer` and `header` elements.
	 * @throws Error if the browser has not been launched; navigation errors
	 *         (including the 10 second timeout) propagate from puppeteer.
	 */
	async urlToMarkdown(url: string): Promise<string> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		/*
		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
		- domcontentloaded is when the basic DOM is loaded
		this should be sufficient for most doc sites
		*/
		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
		const content = await this.page.content()

		// use cheerio to parse and clean up the HTML
		const $ = cheerio.load(content)
		$("script, style, nav, footer, header").remove()

		// convert cleaned HTML to markdown
		const turndownService = new TurndownService()
		return turndownService.turndown($.html())
	}
}