mirror of
https://github.com/pacnpal/Roo-Code.git
synced 2025-12-22 21:31:08 -05:00
Wait for domcontentloaded and networkidle2 to determine when page is loaded
This commit is contained in:
@@ -60,13 +60,19 @@ export class UrlScraper {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const page = await browser.newPage()
|
const page = await browser.newPage()
|
||||||
await page.goto(url, { timeout: 5_000, waitUntil: "load" })
|
|
||||||
await this.waitTillHTMLRendered(page)
|
/*
|
||||||
|
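- networkidle2 waits until there are no more than 2 in-flight network connections for at least 500 ms (similar to playwright's networkidle, which is stricter and waits for 0 connections, but more tolerant of sites that keep a connection or two open for polling/analytics).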
|
||||||
|
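- domcontentloaded fires once the initial HTML document has been parsed and the basic DOM is built, without waiting for stylesheets, images, or subframes to finish loading.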
|
||||||
|
Together these should be sufficient for most documentation sites; if we find users are scraping more dynamic, complex sites, we can switch back to the more elaborate waitTillHTMLRendered.
|
||||||
|
*/
|
||||||
|
await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
|
||||||
|
// await this.waitTillHTMLRendered(page)
|
||||||
const content = await page.content()
|
const content = await page.content()
|
||||||
|
|
||||||
// Use Cheerio to parse and clean up the HTML
|
// Use Cheerio to parse and clean up the HTML
|
||||||
const $ = cheerio.load(content)
|
const $ = cheerio.load(content)
|
||||||
$("script, style, nav, footer").remove() // Remove unnecessary elements
|
$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
|
||||||
|
|
||||||
// Convert cleaned HTML to Markdown
|
// Convert cleaned HTML to Markdown
|
||||||
const turndownService = new TurndownService()
|
const turndownService = new TurndownService()
|
||||||
@@ -80,6 +86,7 @@ export class UrlScraper {
|
|||||||
|
|
||||||
// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
|
// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
|
||||||
// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
|
// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
|
||||||
|
/*
|
||||||
private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
|
private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
|
||||||
const checkDurationMsecs = 500 // 1000
|
const checkDurationMsecs = 500 // 1000
|
||||||
const maxChecks = timeout / checkDurationMsecs
|
const maxChecks = timeout / checkDurationMsecs
|
||||||
@@ -110,6 +117,7 @@ export class UrlScraper {
|
|||||||
await delay(checkDurationMsecs)
|
await delay(checkDurationMsecs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fileExists(path: string): Promise<boolean> {
|
async function fileExists(path: string): Promise<boolean> {
|
||||||
|
|||||||
Reference in New Issue
Block a user