From d9f9e7ba16776d30ea0c26bbbd5c25e41803735a Mon Sep 17 00:00:00 2001
From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com>
Date: Sun, 27 Oct 2024 02:39:41 -0400
Subject: [PATCH] Add browser_action tool

---
 src/core/Cline.ts                           | 177 +++++++++----
 src/core/assistant-message/index.ts         |  11 +-
 src/core/prompts/system.ts                  |  47 +++-
 src/services/browser/BrowserSession.ts      | 260 ++++++++++++++++++++
 src/services/browser/UrlContentFetcher.ts   | 136 +---------
 src/shared/ExtensionMessage.ts              |  22 +-
 webview-ui/src/components/chat/ChatRow.tsx  | 172 ++++++++-----
 webview-ui/src/components/chat/ChatView.tsx |  19 +-
 8 files changed, 576 insertions(+), 268 deletions(-)
 create mode 100644 src/services/browser/BrowserSession.ts

diff --git a/src/core/Cline.ts b/src/core/Cline.ts
index 33ebc67..ea30d8b 100644
--- a/src/core/Cline.ts
+++ b/src/core/Cline.ts
@@ -22,11 +22,15 @@ import { findLastIndex } from "../shared/array"
 import { combineApiRequests } from "../shared/combineApiRequests"
 import { combineCommandSequences } from "../shared/combineCommandSequences"
 import {
+	BrowserAction,
+	BrowserActionResult,
+	browserActions,
 	ClineApiReqCancelReason,
 	ClineApiReqInfo,
 	ClineAsk,
 	ClineMessage,
 	ClineSay,
+	ClineSayBrowserAction,
 	ClineSayTool,
 } from "../shared/ExtensionMessage"
 import { getApiMetrics } from "../shared/getApiMetrics"
@@ -42,6 +46,7 @@ import { addCustomInstructions, SYSTEM_PROMPT } from "./prompts/system"
 import { truncateHalfConversation } from "./sliding-window"
 import { ClineProvider, GlobalFileNames } from "./webview/ClineProvider"
 import { showOmissionWarning } from "../integrations/editor/detect-omission"
+import { BrowserSession } from "../services/browser/BrowserSession"
 
 const cwd =
 	vscode.workspace.workspaceFolders?.map((folder) => folder.uri.fsPath).at(0) ?? path.join(os.homedir(), "Desktop") // may or may not exist but fs checking existence would immediately ask for permission which would be bad UX, need to come up with a better solution
@@ -56,6 +61,7 @@ export class Cline {
 	api: ApiHandler
 	private terminalManager: TerminalManager
 	private urlContentFetcher: UrlContentFetcher
+	private browserSession: BrowserSession
 	private didEditFile: boolean = false
 	customInstructions?: string
 	alwaysAllowReadOnly: boolean
@@ -95,6 +101,7 @@ export class Cline {
 		this.api = buildApiHandler(apiConfiguration)
 		this.terminalManager = new TerminalManager()
 		this.urlContentFetcher = new UrlContentFetcher(provider.context)
+		this.browserSession = new BrowserSession(provider.context)
 		this.diffViewProvider = new DiffViewProvider(cwd)
 		this.customInstructions = customInstructions
 		this.alwaysAllowReadOnly = alwaysAllowReadOnly ?? false
@@ -660,6 +667,7 @@ export class Cline {
 		this.abort = true // will stop any autonomously running promises
 		this.terminalManager.disposeAll()
 		this.urlContentFetcher.closeBrowser()
+		this.browserSession.closeBrowser()
 	}
 
 	// Tools
@@ -881,8 +889,8 @@ export class Cline {
 							return `[${block.name} for '${block.params.path}']`
 						case "list_code_definition_names":
 							return `[${block.name} for '${block.params.path}']`
-						case "inspect_site":
-							return `[${block.name} for '${block.params.url}']`
+						case "browser_action":
+							return `[${block.name} for '${block.params.action}']`
 						case "ask_followup_question":
 							return `[${block.name} for '${block.params.question}']`
 						case "attempt_completion":
@@ -992,6 +1000,10 @@ export class Cline {
 					return text.replace(tagRegex, "")
 				}
 
+				if (block.name !== "browser_action") {
+					await this.browserSession.closeBrowser()
+				}
+
 				switch (block.name) {
 					case "write_to_file": {
 						const relPath: string | undefined = block.params.path
@@ -1333,66 +1345,135 @@ export class Cline {
 							break
 						}
 					}
-					case "inspect_site": {
+					case "browser_action": {
+						const action: BrowserAction | undefined = block.params.action as BrowserAction
 						const url: string | undefined = block.params.url
-						const sharedMessageProps: ClineSayTool = {
-							tool: "inspectSite",
-							path: removeClosingTag("url", url),
+						const coordinate: string | undefined = block.params.coordinate
+						const text: string | undefined = block.params.text
+						if (!action || !browserActions.includes(action)) {
+							// checking for action to ensure it is complete and valid
+							if (!block.partial) {
+								// if the block is complete and we don't have a valid action this is a mistake
+								this.consecutiveMistakeCount++
+								pushToolResult(await this.sayAndCreateMissingParamError("browser_action", "action"))
+							}
+							break
 						}
+
 						try {
 							if (block.partial) {
-								const partialMessage = JSON.stringify(sharedMessageProps)
-								if (this.alwaysAllowReadOnly) {
-									await this.say("tool", partialMessage, undefined, block.partial)
+								if (action === "launch") {
+									await this.ask("browser_action_launch", url, block.partial).catch(() => {})
 								} else {
-									await this.ask("tool", partialMessage, block.partial).catch(() => {})
+									await this.say(
+										"browser_action",
+										JSON.stringify({
+											action: action as BrowserAction,
+											coordinate,
+											text,
+										} satisfies ClineSayBrowserAction),
+										undefined,
+										block.partial
+									)
 								}
 								break
 							} else {
-								if (!url) {
-									this.consecutiveMistakeCount++
-									pushToolResult(await this.sayAndCreateMissingParamError("inspect_site", "url"))
-									break
-								}
-								this.consecutiveMistakeCount = 0
-								const completeMessage = JSON.stringify(sharedMessageProps)
-								if (this.alwaysAllowReadOnly) {
-									await this.say("tool", completeMessage, undefined, false)
-								} else {
-									const didApprove = await askApproval("tool", completeMessage)
+								let browserActionResult: BrowserActionResult
+								if (action === "launch") {
+									if (!url) {
+										this.consecutiveMistakeCount++
+										pushToolResult(
+											await this.sayAndCreateMissingParamError("browser_action", "url")
+										)
+										break
+									}
+									this.consecutiveMistakeCount = 0
+									const didApprove = await askApproval("browser_action_launch", url)
 									if (!didApprove) {
 										break
 									}
-								}
-
-								// execute tool
-								// NOTE: it's okay that we call this message since the partial inspect_site is finished streaming. The only scenario we have to avoid is sending messages WHILE a partial message exists at the end of the messages array. For example the api_req_finished message would interfere with the partial message, so we needed to remove that.
-								await this.say("inspect_site_result", "") // no result, starts the loading spinner waiting for result
-								await this.urlContentFetcher.launchBrowser()
-								let result: {
-									screenshot: string
-									logs: string
-								}
-								try {
-									result = await this.urlContentFetcher.urlToScreenshotAndLogs(url)
-								} finally {
-									await this.urlContentFetcher.closeBrowser()
-								}
-								const { screenshot, logs } = result
-								await this.say("inspect_site_result", logs, [screenshot])
-
-								pushToolResult(
-									formatResponse.toolResult(
-										`The site has been visited, with console logs captured and a screenshot taken for your analysis.\n\nConsole logs:\n${
-											logs || "(No logs)"
-										}`,
-										[screenshot]
+									await this.browserSession.launchBrowser()
+									browserActionResult = await this.browserSession.navigateToUrl(url)
+								} else {
+									if (action === "click") {
+										if (!coordinate) {
+											this.consecutiveMistakeCount++
+											pushToolResult(
+												await this.sayAndCreateMissingParamError("browser_action", "coordinate")
+											)
+											break // can't be within an inner switch
+										}
+									}
+									if (action === "type") {
+										if (!text) {
+											this.consecutiveMistakeCount++
+											pushToolResult(
+												await this.sayAndCreateMissingParamError("browser_action", "text")
+											)
+											break
+										}
+									}
+									this.consecutiveMistakeCount = 0
+									await this.say(
+										"browser_action",
+										JSON.stringify({
+											action: action as BrowserAction,
+											coordinate,
+											text,
+										} satisfies ClineSayBrowserAction),
+										undefined,
+										false
 									)
-								)
+									switch (action) {
+										case "click":
+											browserActionResult = await this.browserSession.click(coordinate!)
+											break
+										case "type":
+											browserActionResult = await this.browserSession.type(text!)
+											break
+										case "scroll_down":
+											browserActionResult = await this.browserSession.scrollDown()
+											break
+										case "scroll_up":
+											browserActionResult = await this.browserSession.scrollUp()
+											break
+										case "close":
+											browserActionResult = await this.browserSession.closeBrowser()
+											break
+									}
+								}
+
+								// NOTE: it's okay to send this message here since the partial browser_action has finished streaming. The only scenario we have to avoid is sending messages WHILE a partial message exists at the end of the messages array. For example, the api_req_finished message would interfere with the partial message, so we needed to remove that.
+								// await this.say("inspect_site_result", "") // no result, starts the loading spinner waiting for result
+
+								await this.say("browser_action_result", JSON.stringify(browserActionResult))
+								switch (action) {
+									case "launch":
+									case "click":
+									case "type":
+									case "scroll_down":
+									case "scroll_up":
+										pushToolResult(
+											formatResponse.toolResult(
+												`The browser action has been executed. The console logs and screenshot have been captured for your analysis.\n\nConsole logs:\n${
+													browserActionResult.logs || "(No new logs)"
+												}\n\n(Remember: if you need to proceed to using non-\`browser_action\` tools, you must first close the browser.)`,
+												browserActionResult.screenshot ? [browserActionResult.screenshot] : []
+											)
+										)
+										break
+									case "close":
+										pushToolResult(
+											formatResponse.toolResult(
+												`The browser has been closed. You may now proceed to using other tools.`
+											)
+										)
+										break
+								}
 								break
 							}
 						} catch (error) {
-							await handleError("inspecting site", error)
+							await handleError("executing browser action", error)
 							break
 						}
 					}
@@ -1425,7 +1506,7 @@ export class Cline {
 								break
 							}
 						} catch (error) {
-							await handleError("inspecting site", error)
+							await handleError("executing command", error)
 							break
 						}
 					}
diff --git a/src/core/assistant-message/index.ts b/src/core/assistant-message/index.ts
index 968f7e7..32c50ae 100644
--- a/src/core/assistant-message/index.ts
+++ b/src/core/assistant-message/index.ts
@@ -15,7 +15,7 @@ export const toolUseNames = [
 	"search_files",
 	"list_files",
 	"list_code_definition_names",
-	"inspect_site",
+	"browser_action",
 	"ask_followup_question",
 	"attempt_completion",
 ] as const
@@ -30,7 +30,10 @@ export const toolParamNames = [
 	"regex",
 	"file_pattern",
 	"recursive",
+	"action",
 	"url",
+	"coordinate",
+	"text",
 	"question",
 	"result",
 ] as const
@@ -76,9 +79,9 @@ export interface ListCodeDefinitionNamesToolUse extends ToolUse {
 	params: Partial<Pick<Record<ToolParamName, string>, "path">>
 }
 
-export interface InspectSiteToolUse extends ToolUse {
-	name: "inspect_site"
-	params: Partial<Pick<Record<ToolParamName, string>, "url">>
+export interface BrowserActionToolUse extends ToolUse {
+	name: "browser_action"
+	params: Partial<Pick<Record<ToolParamName, string>, "action" | "url" | "coordinate" | "text">>
 }
 
 export interface AskFollowupQuestionToolUse extends ToolUse {
diff --git a/src/core/prompts/system.ts b/src/core/prompts/system.ts
index 68a33bc..0646719 100644
--- a/src/core/prompts/system.ts
+++ b/src/core/prompts/system.ts
@@ -4,7 +4,7 @@ import os from "os"
 
 export const SYSTEM_PROMPT = async (
 	cwd: string,
-	supportsImages: boolean
+	supportsComputerUse: boolean
 ) => `You are Cline, a highly skilled software engineer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
 
 ====
@@ -96,17 +96,42 @@ Usage:
 <list_code_definition_names>
 <path>Directory path here</path>
 </list_code_definition_names>${
-	supportsImages
+	supportsComputerUse
 		? `
 
-## inspect_site
-Description: Request to capture a screenshot and console logs of the initial state of a website. This tool navigates to the specified URL, takes a screenshot of the entire page as it appears immediately after loading, and collects any console logs or errors that occur during page load. It does not interact with the page or capture any state changes after the initial load.
+## browser_action
+Description: Request to interact with a Puppeteer-controlled browser. Every action except \`close\` will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action at a time, as you should assess the screenshot and logs to determine the next action.
+- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL.
+- While the browser is active, only the \`browser_action\` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser.
+- The browser window has a resolution of **800x600** pixels. When performing any click actions, ensure the coordinates are within this resolution range.
+- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges.
 Parameters:
-- url: (required) The URL of the site to inspect. This should be a valid URL including the protocol (e.g. http://localhost:3000/page, file:///path/to/file.html, etc.)
+- action: (required) The action to perform. The available actions are:
+    * launch: Launch a new Puppeteer-controlled browser instance at the specified URL. This **must always be the first action**.
+        - Use with the \`url\` parameter to provide the URL.
+        - Ensure the URL is valid and includes the appropriate protocol (e.g. http://localhost:3000/page, file:///path/to/file.html, etc.)
+    * click: Click at a specific x,y coordinate.
+        - Use with the \`coordinate\` parameter to specify the location.
+        - Always click in the center of an element (icon, button, link, etc.) based on coordinates derived from a screenshot.
+    * type: Type a string of text on the keyboard. You might use this after clicking on a text field to input text.
+        - Use with the \`text\` parameter to provide the string to type.
+    * scroll_down: Scroll down the page by one page height.
+    * scroll_up: Scroll up the page by one page height.
+    * close: Close the Puppeteer-controlled browser instance. This **must always be the final browser action**.
+        - Example: \`<action>close</action>\`
+- url: (optional) Use this for providing the URL for the \`launch\` action.
+    * Example: <url>https://example.com</url>
+- coordinate: (optional) The X and Y coordinates for the \`click\` action. Coordinates should be within the **800x600** resolution.
+    * Example: <coordinate>400,300</coordinate>
+- text: (optional) Use this for providing the text for the \`type\` action.
+    * Example: <text>Hello, world!</text>
 Usage:
-<inspect_site>
-<url>URL of the site to inspect</url>
-</inspect_site>`
+<browser_action>
+<action>Action to perform (e.g., launch, click, type, scroll_down, scroll_up, close)</action>
+<url>URL to launch the browser at (optional)</url>
+<coordinate>x,y coordinates (optional)</coordinate>
+<text>Text to type (optional)</text>
+</browser_action>`
 		: ""
 }
 
@@ -189,15 +214,15 @@ By waiting for and carefully considering the user's response after each tool use
 CAPABILITIES
 
 - You have access to tools that let you execute CLI commands on the user's computer, list files, view source code definitions, regex search${
-	supportsImages ? ", inspect websites" : ""
+	supportsComputerUse ? ", use the browser" : ""
 }, read and write files, and ask follow-up questions. These tools help you effectively accomplish a wide range of tasks, such as writing code, making edits or improvements to existing files, understanding the current state of a project, performing system operations, and much more.
 - When the user initially gives you a task, a recursive list of all filepaths in the current working directory ('${cwd.toPosix()}') will be included in environment_details. This provides an overview of the project's file structure, offering key insights into the project from directory/file names (how developers conceptualize and organize their code) and file extensions (the language used). This can also guide decision-making on which files to explore further. If you need to further explore directories such as outside the current working directory, you can use the list_files tool. If you pass 'true' for the recursive parameter, it will list files recursively. Otherwise, it will list files at the top level, which is better suited for generic directories where you don't necessarily need the nested structure, like the Desktop.
 - You can use search_files to perform regex searches across files in a specified directory, outputting context-rich results that include surrounding lines. This is particularly useful for understanding code patterns, finding specific implementations, or identifying areas that need refactoring.
 - You can use the list_code_definition_names tool to get an overview of source code definitions for all files at the top level of a specified directory. This can be particularly useful when you need to understand the broader context and relationships between certain parts of the code. You may need to call this tool multiple times to understand various parts of the codebase related to the task.
 	- For example, when asked to make edits or improvements you might analyze the file structure in the initial environment_details to get an overview of the project, then use list_code_definition_names to get further insight using source code definitions for files located in relevant directories, then read_file to examine the contents of relevant files, analyze the code and suggest improvements or make necessary edits, then use the write_to_file tool to implement changes. If you refactored code that could affect other parts of the codebase, you could use search_files to ensure you update other files as needed.
 - You can use the execute_command tool to run commands on the user's computer whenever you feel it can help accomplish the user's task. When you need to execute a CLI command, you must provide a clear explanation of what the command does. Prefer to execute complex CLI commands over creating executable scripts, since they are more flexible and easier to run. Interactive and long-running commands are allowed, since the commands are run in the user's VSCode terminal. The user may keep commands running in the background and you will be kept updated on their status along the way. Each command you execute is run in a new terminal instance.${
-	supportsImages
-		? "\n- You can use the inspect_site tool to capture a screenshot and console logs of the initial state of a website (including html files and locally running development servers) when you feel it is necessary in accomplishing the user's task. This tool may be useful at key stages of web development tasks-such as after implementing new features, making substantial changes, when troubleshooting issues, or to verify the result of your work. You can analyze the provided screenshot to ensure correct rendering or identify errors, and review console logs for runtime issues.\n	- For example, if asked to add a component to a react website, you might create the necessary files, use execute_command to run the site locally, then use inspect_site to verify there are no runtime errors on page load."
+	supportsComputerUse
+		? "\n- You can use the browser_action tool to interact with websites (including html files and locally running development servers) through a Puppeteer-controlled browser when you feel it is necessary in accomplishing the user's task. This tool is particularly useful for web development tasks as it allows you to launch a browser, navigate to pages, interact with elements through clicks and keyboard input, and capture the results through screenshots and console logs. This tool may be useful at key stages of web development tasks-such as after implementing new features, making substantial changes, when troubleshooting issues, or to verify the result of your work. You can analyze the provided screenshots to ensure correct rendering or identify errors, and review console logs for runtime issues.\n	- For example, if asked to add a component to a react website, you might create the necessary files, use execute_command to run the site locally, then use browser_action to launch the browser, navigate to the local server, and verify the component renders & functions correctly before closing the browser."
 		: ""
 }
 
diff --git a/src/services/browser/BrowserSession.ts b/src/services/browser/BrowserSession.ts
new file mode 100644
index 0000000..251583a
--- /dev/null
+++ b/src/services/browser/BrowserSession.ts
@@ -0,0 +1,260 @@
+import * as vscode from "vscode"
+import * as fs from "fs/promises"
+import * as path from "path"
+import { Browser, Page, ScreenshotOptions, TimeoutError, launch } from "puppeteer-core"
+// @ts-ignore
+import PCR from "puppeteer-chromium-resolver"
+import pWaitFor from "p-wait-for"
+import delay from "delay"
+import { fileExistsAtPath } from "../../utils/fs"
+import { BrowserActionResult } from "../../shared/ExtensionMessage"
+
+interface PCRStats {
+	puppeteer: { launch: typeof launch }
+	executablePath: string
+}
+
+export class BrowserSession {
+	private context: vscode.ExtensionContext
+	private browser?: Browser
+	private page?: Page
+	private currentMousePosition?: string
+
+	constructor(context: vscode.ExtensionContext) {
+		this.context = context
+	}
+
+	private async ensureChromiumExists(): Promise<PCRStats> {
+		const globalStoragePath = this.context?.globalStorageUri?.fsPath
+		if (!globalStoragePath) {
+			throw new Error("Global storage uri is invalid")
+		}
+
+		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
+		const dirExists = await fileExistsAtPath(puppeteerDir)
+		if (!dirExists) {
+			await fs.mkdir(puppeteerDir, { recursive: true })
+		}
+
+		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
+		// if it does exist it will return the path to existing chromium
+		const stats: PCRStats = await PCR({
+			downloadPath: puppeteerDir,
+		})
+
+		return stats
+	}
+
+	async launchBrowser() {
+		console.log("launch browser called")
+		if (this.browser) {
+			// throw new Error("Browser already launched")
+			await this.closeBrowser() // this may happen when the model launches a browser again after having used it already before
+		}
+
+		const stats = await this.ensureChromiumExists()
+		this.browser = await stats.puppeteer.launch({
+			args: [
+				"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+			],
+			executablePath: stats.executablePath,
+			defaultViewport: {
+				width: 800,
+				height: 600,
+			},
+			headless: false,
+		})
+		// (latest version of puppeteer does not add headless to user agent)
+		this.page = await this.browser?.newPage()
+	}
+
+	async closeBrowser(): Promise<BrowserActionResult> {
+		if (this.browser || this.page) {
+			console.log("closing browser...")
+			await this.browser?.close().catch(() => {})
+			this.browser = undefined
+			this.page = undefined
+		}
+		return {}
+	}
+
+	async doAction(action: (page: Page) => Promise<void>): Promise<BrowserActionResult> {
+		if (!this.page) {
+			throw new Error(
+				"Browser is not launched. This may occur if the browser was automatically closed by a non-`browser_action` tool."
+			)
+		}
+
+		const logs: string[] = []
+		let lastLogTs = Date.now()
+
+		const consoleListener = (msg: any) => {
+			if (msg.type() === "log") {
+				logs.push(msg.text())
+			} else {
+				logs.push(`[${msg.type()}] ${msg.text()}`)
+			}
+			lastLogTs = Date.now()
+		}
+
+		const errorListener = (err: Error) => {
+			logs.push(`[Page Error] ${err.toString()}`)
+			lastLogTs = Date.now()
+		}
+
+		// Add the listeners
+		this.page.on("console", consoleListener)
+		this.page.on("pageerror", errorListener)
+
+		try {
+			await action(this.page)
+		} catch (err) {
+			if (!(err instanceof TimeoutError)) {
+				logs.push(`[Error] ${err.toString()}`)
+			}
+		}
+
+		// Wait for console inactivity, with a timeout
+		await pWaitFor(() => Date.now() - lastLogTs >= 500, {
+			timeout: 3_000,
+			interval: 100,
+		}).catch(() => {})
+
+		let options: ScreenshotOptions = {
+			encoding: "base64",
+			clip: {
+				x: 0,
+				y: 0,
+				width: 800,
+				height: 600,
+			},
+		}
+
+		let screenshotBase64 = await this.page.screenshot({
+			...options,
+			type: "webp",
+		})
+		let screenshot = `data:image/webp;base64,${screenshotBase64}`
+
+		if (!screenshotBase64) {
+			console.log("webp screenshot failed, trying png")
+			screenshotBase64 = await this.page.screenshot({
+				...options,
+				type: "png",
+			})
+			screenshot = `data:image/png;base64,${screenshotBase64}`
+		}
+
+		if (!screenshotBase64) {
+			throw new Error("Failed to take screenshot.")
+		}
+
+		// this.page.removeAllListeners() <- causes the page to crash!
+		this.page.off("console", consoleListener)
+		this.page.off("pageerror", errorListener)
+
+		return {
+			screenshot,
+			logs: logs.join("\n"),
+			currentUrl: this.page.url(),
+			currentMousePosition: this.currentMousePosition,
+		}
+	}
+
+	async navigateToUrl(url: string): Promise<BrowserActionResult> {
+		return this.doAction(async (page) => {
+			// networkidle2 alone isn't good enough since the page may still take some time to load, so we additionally wait for the HTML to stabilize below (networkidle0 is avoided because it may never resolve)
+			await page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+			// await page.goto(url, { timeout: 10_000, waitUntil: "load" })
+			await this.waitTillHTMLStable(page) // in case the page is loading more resources
+		})
+	}
+
+	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
+	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+	private async waitTillHTMLStable(page: Page, timeout = 5_000) {
+		const checkDurationMsecs = 500 // 1000
+		const maxChecks = timeout / checkDurationMsecs
+		let lastHTMLSize = 0
+		let checkCounts = 1
+		let countStableSizeIterations = 0
+		const minStableSizeIterations = 3
+
+		while (checkCounts++ <= maxChecks) {
+			let html = await page.content()
+			let currentHTMLSize = html.length
+
+			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
+			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
+
+			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
+				countStableSizeIterations++
+			} else {
+				countStableSizeIterations = 0 //reset the counter
+			}
+
+			if (countStableSizeIterations >= minStableSizeIterations) {
+				console.log("Page rendered fully...")
+				break
+			}
+
+			lastHTMLSize = currentHTMLSize
+			await delay(checkDurationMsecs)
+		}
+	}
+
+	async click(coordinate: string): Promise<BrowserActionResult> {
+		const [x, y] = coordinate.split(",").map(Number)
+		return this.doAction(async (page) => {
+			// Set up network request monitoring
+			let hasNetworkActivity = false
+			const requestListener = () => {
+				hasNetworkActivity = true
+			}
+			page.on("request", requestListener)
+
+			// Perform the click
+			await page.mouse.click(x, y)
+			this.currentMousePosition = coordinate
+
+			// Small delay to check if click triggered any network activity
+			await delay(100)
+
+			if (hasNetworkActivity) {
+				// If we detected network activity, wait for navigation/loading
+				await page
+					.waitForNavigation({
+						waitUntil: ["domcontentloaded", "networkidle2"],
+						timeout: 7000,
+					})
+					.catch(() => {})
+				await this.waitTillHTMLStable(page)
+			}
+
+			// Clean up listener
+			page.off("request", requestListener)
+		})
+	}
+
+	async type(text: string): Promise<BrowserActionResult> {
+		return this.doAction(async (page) => {
+			await page.keyboard.type(text)
+		})
+	}
+
+	async scrollDown(): Promise<BrowserActionResult> {
+		return this.doAction(async (page) => {
+			await page.evaluate(() => {
+				window.scrollBy(0, window.innerHeight)
+			})
+		})
+	}
+
+	async scrollUp(): Promise<BrowserActionResult> {
+		return this.doAction(async (page) => {
+			await page.evaluate(() => {
+				window.scrollBy(0, -window.innerHeight)
+			})
+		})
+	}
+}
diff --git a/src/services/browser/UrlContentFetcher.ts b/src/services/browser/UrlContentFetcher.ts
index ea5dc83..caf19ee 100644
--- a/src/services/browser/UrlContentFetcher.ts
+++ b/src/services/browser/UrlContentFetcher.ts
@@ -1,13 +1,11 @@
 import * as vscode from "vscode"
 import * as fs from "fs/promises"
 import * as path from "path"
-import { Browser, Page, ScreenshotOptions, TimeoutError, launch } from "puppeteer-core"
+import { Browser, Page, launch } from "puppeteer-core"
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
 // @ts-ignore
 import PCR from "puppeteer-chromium-resolver"
-import pWaitFor from "p-wait-for"
-import delay from "delay"
 import { fileExistsAtPath } from "../../utils/fs"
 
 interface PCRStats {
@@ -29,19 +27,16 @@ export class UrlContentFetcher {
 		if (!globalStoragePath) {
 			throw new Error("Global storage uri is invalid")
 		}
-
 		const puppeteerDir = path.join(globalStoragePath, "puppeteer")
 		const dirExists = await fileExistsAtPath(puppeteerDir)
 		if (!dirExists) {
 			await fs.mkdir(puppeteerDir, { recursive: true })
 		}
-
 		// if chromium doesn't exist, this will download it to path.join(puppeteerDir, ".chromium-browser-snapshots")
 		// if it does exist it will return the path to existing chromium
 		const stats: PCRStats = await PCR({
 			downloadPath: puppeteerDir,
 		})
-
 		return stats
 	}
 
@@ -89,133 +84,4 @@ export class UrlContentFetcher {
 
 		return markdown
 	}
-
-	async urlToScreenshotAndLogs(url: string): Promise<{ screenshot: string; logs: string }> {
-		if (!this.browser || !this.page) {
-			throw new Error("Browser not initialized")
-		}
-
-		const logs: string[] = []
-		let lastLogTs = Date.now()
-
-		this.page.on("console", (msg) => {
-			if (msg.type() === "log") {
-				logs.push(msg.text())
-			} else {
-				logs.push(`[${msg.type()}] ${msg.text()}`)
-			}
-			lastLogTs = Date.now()
-		})
-		this.page.on("pageerror", (err) => {
-			logs.push(`[Page Error] ${err.toString()}`)
-			lastLogTs = Date.now()
-		})
-
-		try {
-			// networkidle2 isn't good enough since page may take some time to load. we can assume locally running dev sites will reach networkidle0 in a reasonable amount of time
-			await this.page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
-			// await this.page.goto(url, { timeout: 10_000, waitUntil: "load" })
-			await this.waitTillHTMLStable(this.page) // in case the page is loading more resources
-		} catch (err) {
-			if (!(err instanceof TimeoutError)) {
-				logs.push(`[Navigation Error] ${err.toString()}`)
-			}
-		}
-
-		// Wait for console inactivity, with a timeout
-		await pWaitFor(() => Date.now() - lastLogTs >= 500, {
-			timeout: 3_000,
-			interval: 100,
-		}).catch(() => {})
-
-		// image cannot exceed 8_000 pixels
-		const { pageHeight, pageWidth } = await this.page.evaluate(() => {
-			const html: HTMLElement | null = document.documentElement
-			const body: HTMLElement | null = document.body
-			return {
-				pageHeight: html?.scrollHeight || body?.scrollHeight,
-				pageWidth: html?.clientWidth || body?.clientWidth,
-			}
-		})
-		// const defaultViewport = this.page.viewport(); // width 800 height 600 by default
-		let options: ScreenshotOptions
-		if (pageHeight && pageWidth) {
-			options = {
-				// fullPage: true, // clip and fullPage are mutually exclusive
-				encoding: "base64",
-				// quality: 80,
-				clip: {
-					x: 0,
-					y: 0,
-					width: pageWidth,
-					height: Math.min(pageHeight, 8_000),
-				},
-			}
-		} else {
-			// if we can't get the page dimensions, fallback to full page screenshot
-			options = {
-				encoding: "base64",
-				fullPage: true,
-			}
-		}
-
-		let screenshotBase64 = await this.page.screenshot({
-			...options,
-			type: "webp",
-		})
-		let screenshot = `data:image/webp;base64,${screenshotBase64}`
-
-		if (!screenshotBase64) {
-			console.log("webp screenshot failed, trying png")
-			screenshotBase64 = await this.page.screenshot({
-				...options,
-				type: "png",
-			})
-			screenshot = `data:image/png;base64,${screenshotBase64}`
-		}
-
-		if (!screenshotBase64) {
-			throw new Error("Failed to take screenshot.")
-		}
-
-		this.page.removeAllListeners()
-
-		return {
-			screenshot,
-			logs: logs.join("\n"),
-		}
-	}
-
-	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
-	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
-	private async waitTillHTMLStable(page: Page, timeout = 5_000) {
-		const checkDurationMsecs = 500 // 1000
-		const maxChecks = timeout / checkDurationMsecs
-		let lastHTMLSize = 0
-		let checkCounts = 1
-		let countStableSizeIterations = 0
-		const minStableSizeIterations = 3
-
-		while (checkCounts++ <= maxChecks) {
-			let html = await page.content()
-			let currentHTMLSize = html.length
-
-			// let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
-			console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
-
-			if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
-				countStableSizeIterations++
-			} else {
-				countStableSizeIterations = 0 //reset the counter
-			}
-
-			if (countStableSizeIterations >= minStableSizeIterations) {
-				console.log("Page rendered fully...")
-				break
-			}
-
-			lastHTMLSize = currentHTMLSize
-			await delay(checkDurationMsecs)
-		}
-	}
 }
diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts
index 0ae1e13..777f52b 100644
--- a/src/shared/ExtensionMessage.ts
+++ b/src/shared/ExtensionMessage.ts
@@ -57,6 +57,7 @@ export type ClineAsk =
 	| "resume_task"
 	| "resume_completed_task"
 	| "mistake_limit_reached"
+	| "browser_action_launch"
 
 export type ClineSay =
 	| "task"
@@ -71,7 +72,8 @@ export type ClineSay =
 	| "command_output"
 	| "tool"
 	| "shell_integration_warning"
-	| "inspect_site_result"
+	| "browser_action"
+	| "browser_action_result"
 
 export interface ClineSayTool {
 	tool:
@@ -82,7 +84,6 @@ export interface ClineSayTool {
 		| "listFilesRecursive"
 		| "listCodeDefinitionNames"
 		| "searchFiles"
-		| "inspectSite"
 	path?: string
 	diff?: string
 	content?: string
@@ -90,6 +91,23 @@ export interface ClineSayTool {
 	filePattern?: string
 }
 
+// must keep in sync with system prompt
+export const browserActions = ["launch", "click", "type", "scroll_down", "scroll_up", "close"] as const // runtime list of action names
+export type BrowserAction = (typeof browserActions)[number] // union of the string literals in browserActions
+
+export interface ClineSayBrowserAction { // shape the webview parses from message.text for a "browser_action" message
+	action: BrowserAction
+	coordinate?: string // NOTE(review): string format is not shown here — presumably "x,y"; confirm against the system prompt
+	text?: string // NOTE(review): presumably the text for the "type" action — confirm against the caller
+}
+
+export type BrowserActionResult = { // shape the webview parses from message.text for a "browser_action_result" message
+	screenshot?: string // used directly as an <img> src by the webview
+	logs?: string // rendered by the webview as console logs
+	currentUrl?: string
+	currentMousePosition?: string // NOTE(review): string format is not shown here — confirm against the producer
+}
+
 export interface ClineApiReqInfo {
 	request?: string
 	tokensIn?: number
diff --git a/webview-ui/src/components/chat/ChatRow.tsx b/webview-ui/src/components/chat/ChatRow.tsx
index ecec748..6496dfb 100644
--- a/webview-ui/src/components/chat/ChatRow.tsx
+++ b/webview-ui/src/components/chat/ChatRow.tsx
@@ -1,7 +1,13 @@
 import { VSCodeBadge, VSCodeProgressRing } from "@vscode/webview-ui-toolkit/react"
 import deepEqual from "fast-deep-equal"
 import React, { memo, useEffect, useMemo, useRef } from "react"
-import { ClineApiReqInfo, ClineMessage, ClineSayTool } from "../../../../src/shared/ExtensionMessage"
+import {
+	BrowserActionResult,
+	ClineApiReqInfo,
+	ClineMessage,
+	ClineSayBrowserAction,
+	ClineSayTool,
+} from "../../../../src/shared/ExtensionMessage"
 import { COMMAND_OUTPUT_STRING } from "../../../../src/shared/combineCommandSequences"
 import { vscode } from "../../utils/vscode"
 import CodeAccordian, { removeLeadingNonAlphanumeric } from "../common/CodeAccordian"
@@ -380,32 +386,32 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa
 						/>
 					</>
 				)
-			case "inspectSite":
-				const isInspecting =
-					isLast && lastModifiedMessage?.say === "inspect_site_result" && !lastModifiedMessage?.images
-				return (
-					<>
-						<div style={headerStyle}>
-							{isInspecting ? <ProgressIndicator /> : toolIcon("inspect")}
-							<span style={{ fontWeight: "bold" }}>
-								{message.type === "ask" ? (
-									<>Cline wants to inspect this website:</>
-								) : (
-									<>Cline is inspecting this website:</>
-								)}
-							</span>
-						</div>
-						<div
-							style={{
-								borderRadius: 3,
-								border: "1px solid var(--vscode-editorGroup-border)",
-								overflow: "hidden",
-								backgroundColor: CODE_BLOCK_BG_COLOR,
-							}}>
-							<CodeBlock source={`${"```"}shell\n${tool.path}\n${"```"}`} forceWrap={true} />
-						</div>
-					</>
-				)
+			// case "inspectSite":
+			// 	const isInspecting =
+			// 		isLast && lastModifiedMessage?.say === "inspect_site_result" && !lastModifiedMessage?.images
+			// 	return (
+			// 		<>
+			// 			<div style={headerStyle}>
+			// 				{isInspecting ? <ProgressIndicator /> : toolIcon("inspect")}
+			// 				<span style={{ fontWeight: "bold" }}>
+			// 					{message.type === "ask" ? (
+			// 						<>Cline wants to inspect this website:</>
+			// 					) : (
+			// 						<>Cline is inspecting this website:</>
+			// 					)}
+			// 				</span>
+			// 			</div>
+			// 			<div
+			// 				style={{
+			// 					borderRadius: 3,
+			// 					border: "1px solid var(--vscode-editorGroup-border)",
+			// 					overflow: "hidden",
+			// 					backgroundColor: CODE_BLOCK_BG_COLOR,
+			// 				}}>
+			// 				<CodeBlock source={`${"```"}shell\n${tool.path}\n${"```"}`} forceWrap={true} />
+			// 			</div>
+			// 		</>
+			// 	)
 			default:
 				return null
 		}
@@ -549,42 +555,6 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa
 							/>
 						</div>
 					)
-				case "inspect_site_result":
-					const logs = message.text || ""
-					const screenshot = message.images?.[0]
-					return (
-						<div
-							style={{
-								marginTop: -10,
-								width: "100%",
-							}}>
-							{screenshot && (
-								<img
-									src={screenshot}
-									alt="Inspect screenshot"
-									style={{
-										width: "calc(100% - 2px)",
-										height: "auto",
-										objectFit: "contain",
-										marginBottom: logs ? 7 : 0,
-										borderRadius: 3,
-										cursor: "pointer",
-										marginLeft: "1px",
-									}}
-									onClick={() => vscode.postMessage({ type: "openImage", text: screenshot })}
-								/>
-							)}
-							{logs && (
-								<CodeAccordian
-									code={logs}
-									language="shell"
-									isConsoleLogs={true}
-									isExpanded={isExpanded}
-									onToggleExpand={onToggleExpand}
-								/>
-							)}
-						</div>
-					)
 				case "error":
 					return (
 						<>
@@ -647,7 +617,58 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa
 							</div>
 						</>
 					)
-
+				case "browser_action":
+					const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction
+					return (
+						<div
+							style={{
+								marginTop: -10,
+								width: "100%",
+							}}>
+							<div style={{ fontWeight: "bold" }}>{browserAction.action}</div>
+							{browserAction.coordinate && <div>{browserAction.coordinate}</div>}
+							{browserAction.text && <div>{browserAction.text}</div>}
+						</div>
+					)
+				case "browser_action_result":
+					const { screenshot, logs, currentMousePosition, currentUrl } = JSON.parse(
+						message.text || "{}"
+					) as BrowserActionResult
+					return (
+						<div
+							style={{
+								marginTop: -10,
+								width: "100%",
+							}}>
+							{currentMousePosition && <div>{currentMousePosition}</div>}
+							{currentUrl && <div>{currentUrl}</div>}
+							{screenshot && (
+								<img
+									src={screenshot}
+									alt="Inspect screenshot"
+									style={{
+										width: "calc(100% - 2px)",
+										height: "auto",
+										objectFit: "contain",
+										marginBottom: logs ? 7 : 0,
+										borderRadius: 3,
+										cursor: "pointer",
+										marginLeft: "1px",
+									}}
+									onClick={() => vscode.postMessage({ type: "openImage", text: screenshot })}
+								/>
+							)}
+							{logs && (
+								<CodeAccordian
+									code={logs}
+									language="shell"
+									isConsoleLogs={true}
+									isExpanded={isExpanded}
+									onToggleExpand={onToggleExpand}
+								/>
+							)}
+						</div>
+					)
 				default:
 					return (
 						<>
@@ -779,6 +800,29 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa
 							</div>
 						</>
 					)
+				case "browser_action_launch":
+					// const isInspecting =
+					// isLast && lastModifiedMessage?.say === "inspect_site_result" && !lastModifiedMessage?.images
+
+					return (
+						<>
+							<div style={headerStyle}>
+								{/* {isInspecting ? <ProgressIndicator /> : toolIcon("inspect")} */}
+								<span style={{ fontWeight: "bold" }}>
+									<>Cline wants to use the browser:</>
+								</span>
+							</div>
+							<div
+								style={{
+									borderRadius: 3,
+									border: "1px solid var(--vscode-editorGroup-border)",
+									overflow: "hidden",
+									backgroundColor: CODE_BLOCK_BG_COLOR,
+								}}>
+								<CodeBlock source={`${"```"}shell\n${message.text}\n${"```"}`} forceWrap={true} />
+							</div>
+						</>
+					)
 				default:
 					return null
 			}
diff --git a/webview-ui/src/components/chat/ChatView.tsx b/webview-ui/src/components/chat/ChatView.tsx
index 9bc0c9f..117bdc4 100644
--- a/webview-ui/src/components/chat/ChatView.tsx
+++ b/webview-ui/src/components/chat/ChatView.tsx
@@ -105,6 +105,13 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 									break
 							}
 							break
+						case "browser_action_launch":
+							setTextAreaDisabled(isPartial)
+							setClineAsk("browser_action_launch")
+							setEnableButtons(!isPartial)
+							setPrimaryButtonText("Approve")
+							setSecondaryButtonText("Reject")
+							break
 						case "command":
 							setTextAreaDisabled(isPartial)
 							setClineAsk("command")
@@ -162,7 +169,8 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 						case "error":
 						case "api_req_finished":
 						case "text":
-						case "inspect_site_result":
+						case "browser_action":
+						case "browser_action_result":
 						case "command_output":
 						case "completion_result":
 						case "tool":
@@ -229,6 +237,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 					switch (clineAsk) {
 						case "followup":
 						case "tool":
+						case "browser_action_launch":
 						case "command": // user can provide feedback to a tool or command use
 						case "command_output": // user can send input to command stdin
 						case "completion_result": // if this happens then the user has feedback for the completion result
@@ -271,6 +280,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 			case "command":
 			case "command_output":
 			case "tool":
+			case "browser_action_launch":
 			case "resume_task":
 			case "mistake_limit_reached":
 				vscode.postMessage({ type: "askResponse", askResponse: "yesButtonClicked" })
@@ -303,6 +313,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 				break
 			case "command":
 			case "tool":
+			case "browser_action_launch":
 				// responds to the API with a "This operation failed" and lets it try again
 				vscode.postMessage({ type: "askResponse", askResponse: "noButtonClicked" })
 				break
@@ -418,9 +429,9 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 						return false
 					}
 					break
-				case "inspect_site_result":
-					// don't show row for inspect site result until a screenshot is captured
-					return !!message.images
+				// case "inspect_site_result":
+				// 	// don't show row for inspect site result until a screenshot is captured
+				// 	return !!message.images
 			}
 			return true
 		})