From eec51ad27058de5dae62e8765b93ffe2098daca8 Mon Sep 17 00:00:00 2001 From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com> Date: Wed, 18 Sep 2024 21:11:28 -0400 Subject: [PATCH] Wait for domcontentloaded and networkidle2 to determine when page is loaded --- src/utils/UrlScraper.ts | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/utils/UrlScraper.ts b/src/utils/UrlScraper.ts index d2208a6..3e7732f 100644 --- a/src/utils/UrlScraper.ts +++ b/src/utils/UrlScraper.ts @@ -60,13 +60,19 @@ export class UrlScraper { try { const page = await browser.newPage() - await page.goto(url, { timeout: 5_000, waitUntil: "load" }) - await this.waitTillHTMLRendered(page) + + /* + - networkidle2 waits until there are no more than 2 network connections for at least 500 ms (similar to playwright's networkidle, which is stricter and waits for zero connections). + - domcontentloaded fires once the basic DOM is loaded. + This should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic, complex sites. + */ + await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] }) + // await this.waitTillHTMLRendered(page) const content = await page.content() // Use Cheerio to parse and clean up the HTML const $ = cheerio.load(content) - $("script, style, nav, footer").remove() // Remove unnecessary elements + $("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust) // Convert cleaned HTML to Markdown const turndownService = new TurndownService() @@ -80,6 +86,7 @@ export class UrlScraper { // page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded // https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202 + /* private async waitTillHTMLRendered(page: Page, timeout = 10_000) { const checkDurationMsecs = 500 // 1000 
const maxChecks = timeout / checkDurationMsecs @@ -110,6 +117,7 @@ export class UrlScraper { await delay(checkDurationMsecs) } } + */ } async function fileExists(path: string): Promise {