This commit is contained in:
Saoud Rizwan
2024-09-19 16:49:31 -04:00
parent cfc2ee830e
commit 5fbb335bb6
5 changed files with 14 additions and 17 deletions

View File

@@ -12,7 +12,7 @@ interface PCRStats {
executablePath: string
}
export class UrlScraper {
export class UrlContentFetcher {
private context: vscode.ExtensionContext
private browser?: Browser
private page?: Page
@@ -74,8 +74,7 @@ export class UrlScraper {
/*
- networkidle2 is comparable to Playwright's networkidle: it waits until there are no more than 2 network connections for at least 500 ms.
- domcontentloaded fires once the basic DOM has been loaded
this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
this should be sufficient for most doc sites
*/
await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
const content = await this.page.content()

View File

@@ -1,7 +1,7 @@
import * as vscode from "vscode"
import * as path from "path"
import { openFile } from "./open-file"
import { UrlScraper } from "./UrlScraper"
import { UrlContentFetcher } from "./UrlContentFetcher"
import { mentionRegexGlobal } from "../shared/context-mentions"
import fs from "fs/promises"
import { extractTextFromFile } from "./extract-text"
@@ -32,7 +32,7 @@ export function openMention(mention?: string): void {
}
}
export async function parseMentions(text: string, cwd: string, urlScraper: UrlScraper): Promise<string> {
export async function parseMentions(text: string, cwd: string, urlContentFetcher: UrlContentFetcher): Promise<string> {
const mentions: Set<string> = new Set()
let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
mentions.add(mention)
@@ -53,7 +53,7 @@ export async function parseMentions(text: string, cwd: string, urlScraper: UrlSc
let launchBrowserError: Error | undefined
if (urlMention) {
try {
await urlScraper.launchBrowser()
await urlContentFetcher.launchBrowser()
} catch (error) {
launchBrowserError = error
vscode.window.showErrorMessage(`Error fetching content for ${urlMention}: ${error.message}`)
@@ -67,7 +67,7 @@ export async function parseMentions(text: string, cwd: string, urlScraper: UrlSc
result = `Error fetching content: ${launchBrowserError.message}`
} else {
try {
const markdown = await urlScraper.urlToMarkdown(mention)
const markdown = await urlContentFetcher.urlToMarkdown(mention)
result = markdown
} catch (error) {
vscode.window.showErrorMessage(`Error fetching content for ${mention}: ${error.message}`)
@@ -103,7 +103,7 @@ export async function parseMentions(text: string, cwd: string, urlScraper: UrlSc
if (urlMention) {
try {
await urlScraper.closeBrowser()
await urlContentFetcher.closeBrowser()
} catch (error) {
console.error(`Error closing browser: ${error.message}`)
}