Refactor services

This commit is contained in:
Saoud Rizwan
2024-09-24 11:15:24 -04:00
parent e410e317cc
commit 24bad56785
4 changed files with 3 additions and 3 deletions

View File

@@ -1,223 +0,0 @@
import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Browser, Page, ScreenshotOptions, TimeoutError, launch } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
import pWaitFor from "p-wait-for"
import delay from "delay"
/**
 * Shape of the object returned by puppeteer-chromium-resolver (PCR):
 * a puppeteer launcher plus the path to the resolved Chromium executable.
 * Declared locally because PCR ships no type definitions (see the
 * `@ts-ignore` on its import above).
 */
interface PCRStats {
	puppeteer: { launch: typeof launch }
	executablePath: string
}
/**
 * Fetches and converts web page content using a Chromium instance that is
 * downloaded into the extension's global storage on first use (via
 * puppeteer-chromium-resolver), so no system browser install is required.
 *
 * Usage contract: call launchBrowser() before urlToMarkdown() or
 * urlToScreenshotAndLogs(), and closeBrowser() when finished.
 */
export class UrlContentFetcher {
	private context: vscode.ExtensionContext
	// Both are set by launchBrowser() and cleared by closeBrowser().
	private browser?: Browser
	private page?: Page

	constructor(context: vscode.ExtensionContext) {
		this.context = context
	}

	/**
	 * Ensures a Chromium build exists under <globalStorage>/puppeteer and
	 * returns PCR's stats (launcher + executable path).
	 * @throws Error when the extension's global storage URI is unavailable
	 */
	private async ensureChromiumExists(): Promise<PCRStats> {
		const globalStoragePath = this.context?.globalStorageUri?.fsPath
		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
		}
		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
		// fs.access throws when the path is missing; map that to a boolean.
		const dirExists = await fs
			.access(puppeteerDir)
			.then(() => true)
			.catch(() => false)
		if (!dirExists) {
			await fs.mkdir(puppeteerDir, { recursive: true })
		}
		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
		// if it does exist it will return the path to existing chromium
		const stats: PCRStats = await PCR({
			downloadPath: puppeteerDir,
		})
		return stats
	}

	/**
	 * Launches Chromium and opens a single page. No-op when a browser is
	 * already running.
	 */
	async launchBrowser(): Promise<void> {
		if (this.browser) {
			return
		}
		const stats = await this.ensureChromiumExists()
		this.browser = await stats.puppeteer.launch({
			// Spoof a normal desktop Chrome UA so sites don't serve bot/headless variants.
			args: [
				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
			],
			executablePath: stats.executablePath,
		})
		// (latest version of puppeteer does not add headless to user agent)
		this.page = await this.browser?.newPage()
	}

	/** Closes the browser (if any) and clears the cached browser/page handles. */
	async closeBrowser(): Promise<void> {
		await this.browser?.close()
		this.browser = undefined
		this.page = undefined
	}

	// must make sure to call launchBrowser before and closeBrowser after using this
	/**
	 * Navigates to `url`, strips non-content elements with cheerio, and
	 * converts the remaining HTML to markdown via Turndown.
	 * @throws Error if launchBrowser() has not been called
	 */
	async urlToMarkdown(url: string): Promise<string> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		/*
		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
		- domcontentloaded is when the basic DOM is loaded
		this should be sufficient for most doc sites
		*/
		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
		const content = await this.page.content()
		// use cheerio to parse and clean up the HTML
		const $ = cheerio.load(content)
		$("script, style, nav, footer, header").remove()
		// convert cleaned HTML to markdown
		const turndownService = new TurndownService()
		const markdown = turndownService.turndown($.html())
		return markdown
	}

	/**
	 * Navigates to `url` and captures both a screenshot (webp, falling back
	 * to png) and the page's console/error output.
	 *
	 * @returns screenshot as a data URL plus newline-joined console logs
	 * @throws Error if launchBrowser() has not been called, or if both
	 *         screenshot attempts produce no data
	 */
	async urlToScreenshotAndLogs(url: string): Promise<{ screenshot: string; logs: string }> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		const logs: string[] = []
		// Timestamp of the most recent console/pageerror event; used below to
		// wait for a quiet period before screenshotting.
		let lastLogTs = Date.now()
		this.page.on("console", (msg) => {
			if (msg.type() === "log") {
				logs.push(msg.text())
			} else {
				// Non-"log" messages (warn, error, ...) are prefixed with their type.
				logs.push(`[${msg.type()}] ${msg.text()}`)
			}
			lastLogTs = Date.now()
		})
		this.page.on("pageerror", (err) => {
			logs.push(`[Page Error] ${err.toString()}`)
			lastLogTs = Date.now()
		})
		try {
			// networkidle2 alone may fire before late-loading resources finish, so after
			// navigating we also poll the DOM until its size stabilizes (waitTillHTMLStable).
			await this.page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
			// await this.page.goto(url, { timeout: 10_000, waitUntil: "load" })
			await this.waitTillHTMLStable(this.page) // in case the page is loading more resources
		} catch (err) {
			// Navigation timeouts are expected for slow pages and are tolerated;
			// other navigation failures are surfaced through the logs.
			if (!(err instanceof TimeoutError)) {
				logs.push(`[Navigation Error] ${err.toString()}`)
			}
		}
		// Wait for console inactivity, with a timeout
		await pWaitFor(() => Date.now() - lastLogTs >= 500, {
			timeout: 3_000,
			interval: 100,
		}).catch(() => {})
		// image cannot exceed 8_000 pixels
		const { pageHeight, pageWidth } = await this.page.evaluate(() => {
			const html: HTMLElement | null = document.documentElement
			const body: HTMLElement | null = document.body
			return {
				pageHeight: html?.scrollHeight || body?.scrollHeight,
				pageWidth: html?.clientWidth || body?.clientWidth,
			}
		})
		// const defaultViewport = this.page.viewport(); // width 800 height 600 by default
		let options: ScreenshotOptions
		if (pageHeight && pageWidth) {
			// Clip to the page dimensions, capping height at the 8_000px limit above.
			options = {
				// fullPage: true, // clip and fullPage are mutually exclusive
				encoding: "base64",
				// quality: 80,
				clip: {
					x: 0,
					y: 0,
					width: pageWidth,
					height: Math.min(pageHeight, 8_000),
				},
			}
		} else {
			// if we can't get the page dimensions, fallback to full page screenshot
			options = {
				encoding: "base64",
				fullPage: true,
			}
		}
		// Try webp first; if it comes back empty, retry as png below.
		let screenshotBase64 = await this.page.screenshot({
			...options,
			type: "webp",
		})
		let screenshot = `data:image/webp;base64,${screenshotBase64}`
		if (!screenshotBase64) {
			console.log("webp screenshot failed, trying png")
			screenshotBase64 = await this.page.screenshot({
				...options,
				type: "png",
			})
			screenshot = `data:image/png;base64,${screenshotBase64}`
		}
		if (!screenshotBase64) {
			throw new Error("Failed to take screenshot.")
		}
		// NOTE(review): listeners are removed only on this success path; the throw
		// above leaves the console/pageerror handlers attached to the page.
		this.page.removeAllListeners()
		return {
			screenshot,
			logs: logs.join("\n"),
		}
	}

	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
	/**
	 * Polls page.content() every 500ms until the serialized HTML length is
	 * unchanged for 3 consecutive checks, or `timeout` ms elapse.
	 */
	private async waitTillHTMLStable(page: Page, timeout = 5_000) {
		const checkDurationMsecs = 500 // 1000
		const maxChecks = timeout / checkDurationMsecs
		let lastHTMLSize = 0
		let checkCounts = 1
		let countStableSizeIterations = 0
		const minStableSizeIterations = 3

		while (checkCounts++ <= maxChecks) {
			let html = await page.content()
			let currentHTMLSize = html.length

			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)

			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
				countStableSizeIterations++
			} else {
				countStableSizeIterations = 0 //reset the counter
			}

			if (countStableSizeIterations >= minStableSizeIterations) {
				console.log("Page rendered fully...")
				break
			}

			lastHTMLSize = currentHTMLSize
			await delay(checkDurationMsecs)
		}
	}
}

View File

@@ -1,7 +1,7 @@
import * as vscode from "vscode"
import * as path from "path"
import { openFile } from "./open-file"
import { UrlContentFetcher } from "./UrlContentFetcher"
import { UrlContentFetcher } from "../services/browser/UrlContentFetcher"
import { mentionRegexGlobal } from "../shared/context-mentions"
import fs from "fs/promises"
import { extractTextFromFile } from "./extract-text"

View File

@@ -1,222 +0,0 @@
import * as vscode from "vscode"
import * as childProcess from "child_process"
import * as path from "path"
import * as fs from "fs"
import * as readline from "readline"
/*
This file provides functionality to perform regex searches on files using ripgrep.
Inspired by: https://github.com/DiscreteTom/vscode-ripgrep-utils
Key components:
1. getBinPath: Locates the ripgrep binary within the VSCode installation.
2. execRipgrep: Executes the ripgrep command and returns the output.
3. regexSearchFiles: The main function that performs regex searches on files.
- Parameters:
* cwd: The current working directory (for relative path calculation)
* directoryPath: The directory to search in
* regex: The regular expression to search for (Rust regex syntax)
* filePattern: Optional glob pattern to filter files (default: '*')
- Returns: A formatted string containing search results with context
The search results include:
- Relative file paths
- 1 line of context before and after each match (ripgrep is invoked with `--context 1`)
- Matches formatted with pipe characters for easy reading
Usage example:
const results = await regexSearchFiles('/path/to/cwd', '/path/to/search', 'TODO:', '*.ts');
rel/path/to/app.ts
│----
│function processData(data: any) {
│ // Some processing logic here
│ // TODO: Implement error handling
│ return processedData;
│}
│----
rel/path/to/helper.ts
│----
│ let result = 0;
│ for (let i = 0; i < input; i++) {
│ // TODO: Optimize this function for performance
│ result += Math.pow(i, 2);
│ }
│----
*/
// True on platforms whose identifier begins with "win" (i.e. win32).
const isWindows = process.platform.startsWith("win")
// The bundled ripgrep binary name differs on Windows (rg.exe) vs POSIX (rg).
const binName = isWindows ? "rg.exe" : "rg"

/** One ripgrep match together with its surrounding context lines. */
interface SearchResult {
	file: string
	line: number
	column: number
	match: string
	beforeContext: string[]
	afterContext: string[]
}

// Hard cap on how many matches are formatted into the final output.
const MAX_RESULTS = 300
/**
 * Locates the ripgrep binary bundled inside the running VSCode installation.
 * Each known packaging location is probed in order; the first one containing
 * the binary wins.
 *
 * @param vscodeAppRoot VSCode's application root (vscode.env.appRoot)
 * @returns the absolute path to ripgrep, or undefined if none was found
 */
async function getBinPath(vscodeAppRoot: string): Promise<string | undefined> {
	// Folders where different VSCode versions/packagings ship ripgrep.
	const candidateFolders = [
		"node_modules/@vscode/ripgrep/bin/",
		"node_modules/vscode-ripgrep/bin",
		"node_modules.asar.unpacked/vscode-ripgrep/bin/",
		"node_modules.asar.unpacked/@vscode/ripgrep/bin/",
	]
	for (const folder of candidateFolders) {
		const candidate = path.join(vscodeAppRoot, folder, binName)
		if (await pathExists(candidate)) {
			return candidate
		}
	}
	return undefined
}
/**
 * Checks whether a filesystem path is accessible to this process.
 *
 * Wraps callback-style fs.access in a promise; never rejects — any access
 * error (missing file, permission denied, ...) resolves to false.
 *
 * @param filePath absolute or relative path to test
 * @returns true when the path is accessible, false otherwise
 */
async function pathExists(filePath: string): Promise<boolean> {
	// Parameter renamed from `path`, which shadowed the `path` module import.
	return new Promise((resolve) => {
		fs.access(filePath, (err) => {
			resolve(err === null)
		})
	})
}
/**
 * Spawns ripgrep and resolves with its stdout, truncated to a fixed number
 * of lines.
 *
 * @param bin absolute path to the ripgrep binary
 * @param args CLI arguments passed straight to ripgrep
 * @returns the (possibly truncated) stdout text
 * @throws Error when ripgrep writes to stderr or fails to spawn.
 *         NOTE(review): any stderr output — even if stdout also produced
 *         valid results — causes rejection and discards the collected
 *         output; confirm that is intended.
 */
async function execRipgrep(bin: string, args: string[]): Promise<string> {
	return new Promise((resolve, reject) => {
		const rgProcess = childProcess.spawn(bin, args)
		// cross-platform alternative to head, which is ripgrep author's recommendation for limiting output.
		const rl = readline.createInterface({
			input: rgProcess.stdout,
			crlfDelay: Infinity, // treat \r\n as a single line break even if it's split across chunks. This ensures consistent behavior across different operating systems.
		})

		let output = ""
		let lineCount = 0
		const maxLines = MAX_RESULTS * 5 // limiting ripgrep output with max lines since there's no other way to limit results. it's okay that we're outputting as json, since we're parsing it line by line and ignore anything that's not part of a match. This assumes each result is at most 5 lines.

		rl.on("line", (line) => {
			if (lineCount < maxLines) {
				output += line + "\n"
				lineCount++
			} else {
				// Cap reached: stop reading and terminate ripgrep early.
				rl.close()
				rgProcess.kill()
			}
		})

		let errorOutput = ""
		rgProcess.stderr.on("data", (data) => {
			errorOutput += data.toString()
		})
		// "close" fires when stdout ends — either normal process exit or the
		// explicit rl.close()/kill() above.
		rl.on("close", () => {
			if (errorOutput) {
				reject(new Error(`ripgrep process error: ${errorOutput}`))
			} else {
				resolve(output)
			}
		})
		rgProcess.on("error", (error) => {
			reject(new Error(`ripgrep process error: ${error.message}`))
		})
	})
}
/**
 * Runs ripgrep (in --json mode) over `directoryPath` and returns the
 * matches formatted for display.
 *
 * @param cwd base directory used to relativize file paths in the output
 * @param directoryPath directory ripgrep searches
 * @param regex Rust-syntax regular expression passed to ripgrep's -e
 * @param filePattern optional glob filter (defaults to "*")
 * @returns formatted results (see formatResults), or "No results found"
 *          when execRipgrep rejects (stderr output or spawn failure)
 * @throws Error when the bundled ripgrep binary cannot be located
 */
export async function regexSearchFiles(
	cwd: string,
	directoryPath: string,
	regex: string,
	filePattern?: string
): Promise<string> {
	const vscodeAppRoot = vscode.env.appRoot
	const rgPath = await getBinPath(vscodeAppRoot)

	if (!rgPath) {
		throw new Error("Could not find ripgrep binary")
	}

	// --json: one JSON event per line; --context 1: one context line each side of a match.
	const args = ["--json", "-e", regex, "--glob", filePattern || "*", "--context", "1", directoryPath]

	let output: string
	try {
		output = await execRipgrep(rgPath, args)
	} catch {
		// execRipgrep rejects on stderr output or spawn failure; treat both as no results.
		return "No results found"
	}

	const results: SearchResult[] = []
	// Match currently being assembled; subsequent "context" events attach to it.
	let currentResult: Partial<SearchResult> | null = null

	output.split("\n").forEach((line) => {
		if (line) {
			try {
				const parsed = JSON.parse(line)
				if (parsed.type === "match") {
					// A new match begins — flush the previous one first.
					if (currentResult) {
						results.push(currentResult as SearchResult)
					}
					currentResult = {
						file: parsed.data.path.text,
						line: parsed.data.line_number,
						column: parsed.data.submatches[0].start,
						match: parsed.data.lines.text,
						beforeContext: [],
						afterContext: [],
					}
				} else if (parsed.type === "context" && currentResult) {
					// Context lines are sorted into before/after by line number.
					if (parsed.data.line_number < currentResult.line!) {
						currentResult.beforeContext!.push(parsed.data.lines.text)
					} else {
						currentResult.afterContext!.push(parsed.data.lines.text)
					}
				}
			} catch (error) {
				// Malformed lines (e.g. truncated by execRipgrep's output cap) are skipped.
				console.error("Error parsing ripgrep output:", error)
			}
		}
	})

	// Flush the final in-progress match.
	if (currentResult) {
		results.push(currentResult as SearchResult)
	}

	return formatResults(results, cwd)
}
/**
 * Renders search results as a human-readable string: a count header, then
 * per-file sections where each match's context block is delimited by
 * "│----" lines.
 *
 * @param results matches (only the first MAX_RESULTS are rendered)
 * @param cwd base directory used to relativize each result's file path
 * @returns the formatted, trimmed output string
 */
function formatResults(results: SearchResult[], cwd: string): string {
	// Header: either a truncation notice or the exact count.
	let output = ""
	if (results.length >= MAX_RESULTS) {
		output += `Showing first ${MAX_RESULTS} of ${MAX_RESULTS}+ results. Use a more specific search if necessary.\n\n`
	} else {
		output += `Found ${results.length === 1 ? "1 result" : `${results.length.toLocaleString()} results`}.\n\n`
	}

	// Group the capped result list by cwd-relative file path.
	const groupedResults: { [key: string]: SearchResult[] } = {}
	for (const result of results.slice(0, MAX_RESULTS)) {
		const relativeFilePath = path.relative(cwd, result.file)
		if (!groupedResults[relativeFilePath]) {
			groupedResults[relativeFilePath] = []
		}
		groupedResults[relativeFilePath].push(result)
	}

	for (const [filePath, fileResults] of Object.entries(groupedResults)) {
		output += `${filePath.toPosix()}\n│----\n`
		for (let i = 0; i < fileResults.length; i++) {
			const result = fileResults[i]
			// Emit before-context, the match itself, then after-context.
			const allLines = [...result.beforeContext, result.match, ...result.afterContext]
			for (const line of allLines) {
				output += `${line?.trimEnd() ?? ""}\n`
			}
			// Separator between consecutive matches within the same file.
			if (i < fileResults.length - 1) {
				output += "│----\n"
			}
		}
		output += "│----\n\n"
	}

	return output.trim()
}