Refactor services

This commit is contained in:
Saoud Rizwan
2024-09-24 11:15:24 -04:00
parent e410e317cc
commit 24bad56785
4 changed files with 3 additions and 3 deletions

View File

@@ -1,223 +0,0 @@
import * as vscode from "vscode"
import * as fs from "fs/promises"
import * as path from "path"
import { Browser, Page, ScreenshotOptions, TimeoutError, launch } from "puppeteer-core"
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// @ts-ignore
import PCR from "puppeteer-chromium-resolver"
import pWaitFor from "p-wait-for"
import delay from "delay"
/**
 * Shape of the object returned by puppeteer-chromium-resolver (PCR):
 * a puppeteer launcher plus the path to the resolved Chromium executable.
 * Declared locally because PCR ships no type definitions (see the
 * `@ts-ignore` on its import above).
 */
interface PCRStats {
	puppeteer: { launch: typeof launch }
	executablePath: string
}
/**
 * Fetches and converts web page content using a Chromium instance that is
 * downloaded into the extension's global storage on first use (via
 * puppeteer-chromium-resolver), so no system browser install is required.
 *
 * Usage contract: call launchBrowser() before urlToMarkdown() or
 * urlToScreenshotAndLogs(), and closeBrowser() when finished.
 */
export class UrlContentFetcher {
	private context: vscode.ExtensionContext
	// Both are set by launchBrowser() and cleared by closeBrowser().
	private browser?: Browser
	private page?: Page

	constructor(context: vscode.ExtensionContext) {
		this.context = context
	}

	/**
	 * Ensures a Chromium build exists under <globalStorage>/puppeteer and
	 * returns PCR's stats (launcher + executable path).
	 * @throws Error when the extension's global storage URI is unavailable
	 */
	private async ensureChromiumExists(): Promise<PCRStats> {
		const globalStoragePath = this.context?.globalStorageUri?.fsPath
		if (!globalStoragePath) {
			throw new Error("Global storage uri is invalid")
		}
		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
		// fs.access throws when the path is missing; map that to a boolean.
		const dirExists = await fs
			.access(puppeteerDir)
			.then(() => true)
			.catch(() => false)
		if (!dirExists) {
			await fs.mkdir(puppeteerDir, { recursive: true })
		}
		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
		// if it does exist it will return the path to existing chromium
		const stats: PCRStats = await PCR({
			downloadPath: puppeteerDir,
		})
		return stats
	}

	/**
	 * Launches Chromium and opens a single page. No-op when a browser is
	 * already running.
	 */
	async launchBrowser(): Promise<void> {
		if (this.browser) {
			return
		}
		const stats = await this.ensureChromiumExists()
		this.browser = await stats.puppeteer.launch({
			// Spoof a normal desktop Chrome UA so sites don't serve bot/headless variants.
			args: [
				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
			],
			executablePath: stats.executablePath,
		})
		// (latest version of puppeteer does not add headless to user agent)
		this.page = await this.browser?.newPage()
	}

	/** Closes the browser (if any) and clears the cached browser/page handles. */
	async closeBrowser(): Promise<void> {
		await this.browser?.close()
		this.browser = undefined
		this.page = undefined
	}

	// must make sure to call launchBrowser before and closeBrowser after using this
	/**
	 * Navigates to `url`, strips non-content elements with cheerio, and
	 * converts the remaining HTML to markdown via Turndown.
	 * @throws Error if launchBrowser() has not been called
	 */
	async urlToMarkdown(url: string): Promise<string> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		/*
		- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
		- domcontentloaded is when the basic DOM is loaded
		this should be sufficient for most doc sites
		*/
		await this.page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
		const content = await this.page.content()
		// use cheerio to parse and clean up the HTML
		const $ = cheerio.load(content)
		$("script, style, nav, footer, header").remove()
		// convert cleaned HTML to markdown
		const turndownService = new TurndownService()
		const markdown = turndownService.turndown($.html())
		return markdown
	}

	/**
	 * Navigates to `url` and captures both a screenshot (webp, falling back
	 * to png) and the page's console/error output.
	 *
	 * @returns screenshot as a data URL plus newline-joined console logs
	 * @throws Error if launchBrowser() has not been called, or if both
	 *         screenshot attempts produce no data
	 */
	async urlToScreenshotAndLogs(url: string): Promise<{ screenshot: string; logs: string }> {
		if (!this.browser || !this.page) {
			throw new Error("Browser not initialized")
		}
		const logs: string[] = []
		// Timestamp of the most recent console/pageerror event; used below to
		// wait for a quiet period before screenshotting.
		let lastLogTs = Date.now()
		this.page.on("console", (msg) => {
			if (msg.type() === "log") {
				logs.push(msg.text())
			} else {
				// Non-"log" messages (warn, error, ...) are prefixed with their type.
				logs.push(`[${msg.type()}] ${msg.text()}`)
			}
			lastLogTs = Date.now()
		})
		this.page.on("pageerror", (err) => {
			logs.push(`[Page Error] ${err.toString()}`)
			lastLogTs = Date.now()
		})
		try {
			// networkidle2 alone may fire before late-loading resources finish, so after
			// navigating we also poll the DOM until its size stabilizes (waitTillHTMLStable).
			await this.page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
			// await this.page.goto(url, { timeout: 10_000, waitUntil: "load" })
			await this.waitTillHTMLStable(this.page) // in case the page is loading more resources
		} catch (err) {
			// Navigation timeouts are expected for slow pages and are tolerated;
			// other navigation failures are surfaced through the logs.
			if (!(err instanceof TimeoutError)) {
				logs.push(`[Navigation Error] ${err.toString()}`)
			}
		}
		// Wait for console inactivity, with a timeout
		await pWaitFor(() => Date.now() - lastLogTs >= 500, {
			timeout: 3_000,
			interval: 100,
		}).catch(() => {})
		// image cannot exceed 8_000 pixels
		const { pageHeight, pageWidth } = await this.page.evaluate(() => {
			const html: HTMLElement | null = document.documentElement
			const body: HTMLElement | null = document.body
			return {
				pageHeight: html?.scrollHeight || body?.scrollHeight,
				pageWidth: html?.clientWidth || body?.clientWidth,
			}
		})
		// const defaultViewport = this.page.viewport(); // width 800 height 600 by default
		let options: ScreenshotOptions
		if (pageHeight && pageWidth) {
			// Clip to the page dimensions, capping height at the 8_000px limit above.
			options = {
				// fullPage: true, // clip and fullPage are mutually exclusive
				encoding: "base64",
				// quality: 80,
				clip: {
					x: 0,
					y: 0,
					width: pageWidth,
					height: Math.min(pageHeight, 8_000),
				},
			}
		} else {
			// if we can't get the page dimensions, fallback to full page screenshot
			options = {
				encoding: "base64",
				fullPage: true,
			}
		}
		// Try webp first; if it comes back empty, retry as png below.
		let screenshotBase64 = await this.page.screenshot({
			...options,
			type: "webp",
		})
		let screenshot = `data:image/webp;base64,${screenshotBase64}`
		if (!screenshotBase64) {
			console.log("webp screenshot failed, trying png")
			screenshotBase64 = await this.page.screenshot({
				...options,
				type: "png",
			})
			screenshot = `data:image/png;base64,${screenshotBase64}`
		}
		if (!screenshotBase64) {
			throw new Error("Failed to take screenshot.")
		}
		// NOTE(review): listeners are removed only on this success path; the throw
		// above leaves the console/pageerror handlers attached to the page.
		this.page.removeAllListeners()
		return {
			screenshot,
			logs: logs.join("\n"),
		}
	}

	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
	/**
	 * Polls page.content() every 500ms until the serialized HTML length is
	 * unchanged for 3 consecutive checks, or `timeout` ms elapse.
	 */
	private async waitTillHTMLStable(page: Page, timeout = 5_000) {
		const checkDurationMsecs = 500 // 1000
		const maxChecks = timeout / checkDurationMsecs
		let lastHTMLSize = 0
		let checkCounts = 1
		let countStableSizeIterations = 0
		const minStableSizeIterations = 3

		while (checkCounts++ <= maxChecks) {
			let html = await page.content()
			let currentHTMLSize = html.length

			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)

			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
				countStableSizeIterations++
			} else {
				countStableSizeIterations = 0 //reset the counter
			}

			if (countStableSizeIterations >= minStableSizeIterations) {
				console.log("Page rendered fully...")
				break
			}

			lastHTMLSize = currentHTMLSize
			await delay(checkDurationMsecs)
		}
	}
}

View File

@@ -1,7 +1,7 @@
import * as vscode from "vscode"
import * as path from "path"
import { openFile } from "./open-file"
import { UrlContentFetcher } from "./UrlContentFetcher"
import { UrlContentFetcher } from "../services/browser/UrlContentFetcher"
import { mentionRegexGlobal } from "../shared/context-mentions"
import fs from "fs/promises"
import { extractTextFromFile } from "./extract-text"

View File

@@ -1,222 +0,0 @@
import * as vscode from "vscode"
import * as childProcess from "child_process"
import * as path from "path"
import * as fs from "fs"
import * as readline from "readline"
/*
This file provides functionality to perform regex searches on files using ripgrep.
Inspired by: https://github.com/DiscreteTom/vscode-ripgrep-utils
Key components:
1. getBinPath: Locates the ripgrep binary within the VSCode installation.
2. execRipgrep: Executes the ripgrep command and returns the output.
3. regexSearchFiles: The main function that performs regex searches on files.
- Parameters:
* cwd: The current working directory (for relative path calculation)
* directoryPath: The directory to search in
* regex: The regular expression to search for (Rust regex syntax)
* filePattern: Optional glob pattern to filter files (default: '*')
- Returns: A formatted string containing search results with context
The search results include:
- Relative file paths
- 1 line of context before and after each match (ripgrep is invoked with `--context 1`)
- Matches formatted with pipe characters for easy reading
Usage example:
const results = await regexSearchFiles('/path/to/cwd', '/path/to/search', 'TODO:', '*.ts');
rel/path/to/app.ts
│----
│function processData(data: any) {
│ // Some processing logic here
│ // TODO: Implement error handling
│ return processedData;
│}
│----
rel/path/to/helper.ts
│----
│ let result = 0;
│ for (let i = 0; i < input; i++) {
│ // TODO: Optimize this function for performance
│ result += Math.pow(i, 2);
│ }
│----
*/
// True on platforms whose identifier begins with "win" (i.e. win32).
const isWindows = process.platform.startsWith("win")
// The bundled ripgrep binary name differs on Windows (rg.exe) vs POSIX (rg).
const binName = isWindows ? "rg.exe" : "rg"

/** One ripgrep match together with its surrounding context lines. */
interface SearchResult {
	file: string
	line: number
	column: number
	match: string
	beforeContext: string[]
	afterContext: string[]
}

// Hard cap on how many matches are formatted into the final output.
const MAX_RESULTS = 300
/**
 * Locates the ripgrep binary bundled inside the running VSCode installation.
 * Each known packaging location is probed in order; the first one containing
 * the binary wins.
 *
 * @param vscodeAppRoot VSCode's application root (vscode.env.appRoot)
 * @returns the absolute path to ripgrep, or undefined if none was found
 */
async function getBinPath(vscodeAppRoot: string): Promise<string | undefined> {
	// Folders where different VSCode versions/packagings ship ripgrep.
	const candidateFolders = [
		"node_modules/@vscode/ripgrep/bin/",
		"node_modules/vscode-ripgrep/bin",
		"node_modules.asar.unpacked/vscode-ripgrep/bin/",
		"node_modules.asar.unpacked/@vscode/ripgrep/bin/",
	]
	for (const folder of candidateFolders) {
		const candidate = path.join(vscodeAppRoot, folder, binName)
		if (await pathExists(candidate)) {
			return candidate
		}
	}
	return undefined
}
/**
 * Checks whether a filesystem path is accessible to this process.
 *
 * Wraps callback-style fs.access in a promise; never rejects — any access
 * error (missing file, permission denied, ...) resolves to false.
 *
 * @param filePath absolute or relative path to test
 * @returns true when the path is accessible, false otherwise
 */
async function pathExists(filePath: string): Promise<boolean> {
	// Parameter renamed from `path`, which shadowed the `path` module import.
	return new Promise((resolve) => {
		fs.access(filePath, (err) => {
			resolve(err === null)
		})
	})
}
/**
 * Spawns ripgrep and resolves with its stdout, truncated to a fixed number
 * of lines.
 *
 * @param bin absolute path to the ripgrep binary
 * @param args CLI arguments passed straight to ripgrep
 * @returns the (possibly truncated) stdout text
 * @throws Error when ripgrep writes to stderr or fails to spawn.
 *         NOTE(review): any stderr output — even if stdout also produced
 *         valid results — causes rejection and discards the collected
 *         output; confirm that is intended.
 */
async function execRipgrep(bin: string, args: string[]): Promise<string> {
	return new Promise((resolve, reject) => {
		const rgProcess = childProcess.spawn(bin, args)
		// cross-platform alternative to head, which is ripgrep author's recommendation for limiting output.
		const rl = readline.createInterface({
			input: rgProcess.stdout,
			crlfDelay: Infinity, // treat \r\n as a single line break even if it's split across chunks. This ensures consistent behavior across different operating systems.
		})

		let output = ""
		let lineCount = 0
		const maxLines = MAX_RESULTS * 5 // limiting ripgrep output with max lines since there's no other way to limit results. it's okay that we're outputting as json, since we're parsing it line by line and ignore anything that's not part of a match. This assumes each result is at most 5 lines.

		rl.on("line", (line) => {
			if (lineCount < maxLines) {
				output += line + "\n"
				lineCount++
			} else {
				// Cap reached: stop reading and terminate ripgrep early.
				rl.close()
				rgProcess.kill()
			}
		})

		let errorOutput = ""
		rgProcess.stderr.on("data", (data) => {
			errorOutput += data.toString()
		})
		// "close" fires when stdout ends — either normal process exit or the
		// explicit rl.close()/kill() above.
		rl.on("close", () => {
			if (errorOutput) {
				reject(new Error(`ripgrep process error: ${errorOutput}`))
			} else {
				resolve(output)
			}
		})
		rgProcess.on("error", (error) => {
			reject(new Error(`ripgrep process error: ${error.message}`))
		})
	})
}
/**
 * Runs ripgrep (in --json mode) over `directoryPath` and returns the
 * matches formatted for display.
 *
 * @param cwd base directory used to relativize file paths in the output
 * @param directoryPath directory ripgrep searches
 * @param regex Rust-syntax regular expression passed to ripgrep's -e
 * @param filePattern optional glob filter (defaults to "*")
 * @returns formatted results (see formatResults), or "No results found"
 *          when execRipgrep rejects (stderr output or spawn failure)
 * @throws Error when the bundled ripgrep binary cannot be located
 */
export async function regexSearchFiles(
	cwd: string,
	directoryPath: string,
	regex: string,
	filePattern?: string
): Promise<string> {
	const vscodeAppRoot = vscode.env.appRoot
	const rgPath = await getBinPath(vscodeAppRoot)

	if (!rgPath) {
		throw new Error("Could not find ripgrep binary")
	}

	// --json: one JSON event per line; --context 1: one context line each side of a match.
	const args = ["--json", "-e", regex, "--glob", filePattern || "*", "--context", "1", directoryPath]

	let output: string
	try {
		output = await execRipgrep(rgPath, args)
	} catch {
		// execRipgrep rejects on stderr output or spawn failure; treat both as no results.
		return "No results found"
	}

	const results: SearchResult[] = []
	// Match currently being assembled; subsequent "context" events attach to it.
	let currentResult: Partial<SearchResult> | null = null

	output.split("\n").forEach((line) => {
		if (line) {
			try {
				const parsed = JSON.parse(line)
				if (parsed.type === "match") {
					// A new match begins — flush the previous one first.
					if (currentResult) {
						results.push(currentResult as SearchResult)
					}
					currentResult = {
						file: parsed.data.path.text,
						line: parsed.data.line_number,
						column: parsed.data.submatches[0].start,
						match: parsed.data.lines.text,
						beforeContext: [],
						afterContext: [],
					}
				} else if (parsed.type === "context" && currentResult) {
					// Context lines are sorted into before/after by line number.
					if (parsed.data.line_number < currentResult.line!) {
						currentResult.beforeContext!.push(parsed.data.lines.text)
					} else {
						currentResult.afterContext!.push(parsed.data.lines.text)
					}
				}
			} catch (error) {
				// Malformed lines (e.g. truncated by execRipgrep's output cap) are skipped.
				console.error("Error parsing ripgrep output:", error)
			}
		}
	})

	// Flush the final in-progress match.
	if (currentResult) {
		results.push(currentResult as SearchResult)
	}

	return formatResults(results, cwd)
}
/**
 * Renders search results as a human-readable string: a count header, then
 * per-file sections where each match's context block is delimited by
 * "│----" lines.
 *
 * @param results matches (only the first MAX_RESULTS are rendered)
 * @param cwd base directory used to relativize each result's file path
 * @returns the formatted, trimmed output string
 */
function formatResults(results: SearchResult[], cwd: string): string {
	// Header: either a truncation notice or the exact count.
	let output = ""
	if (results.length >= MAX_RESULTS) {
		output += `Showing first ${MAX_RESULTS} of ${MAX_RESULTS}+ results. Use a more specific search if necessary.\n\n`
	} else {
		output += `Found ${results.length === 1 ? "1 result" : `${results.length.toLocaleString()} results`}.\n\n`
	}

	// Group the capped result list by cwd-relative file path.
	const groupedResults: { [key: string]: SearchResult[] } = {}
	for (const result of results.slice(0, MAX_RESULTS)) {
		const relativeFilePath = path.relative(cwd, result.file)
		if (!groupedResults[relativeFilePath]) {
			groupedResults[relativeFilePath] = []
		}
		groupedResults[relativeFilePath].push(result)
	}

	for (const [filePath, fileResults] of Object.entries(groupedResults)) {
		output += `${filePath.toPosix()}\n│----\n`
		for (let i = 0; i < fileResults.length; i++) {
			const result = fileResults[i]
			// Emit before-context, the match itself, then after-context.
			const allLines = [...result.beforeContext, result.match, ...result.afterContext]
			for (const line of allLines) {
				output += `${line?.trimEnd() ?? ""}\n`
			}
			// Separator between consecutive matches within the same file.
			if (i < fileResults.length - 1) {
				output += "│----\n"
			}
		}
		output += "│----\n\n"
	}

	return output.trim()
}