diff --git a/src/core/Cline.ts b/src/core/Cline.ts index 8012fee..51edf57 100644 --- a/src/core/Cline.ts +++ b/src/core/Cline.ts @@ -1398,6 +1398,11 @@ export class Cline { if (!didApprove) { break } + + // NOTE: it's okay that we call this message since the partial inspect_site is finished streaming. The only scenario we have to avoid is sending messages WHILE a partial message exists at the end of the messages array. For example the api_req_finished message would interfere with the partial message, so we needed to remove that. + // await this.say("inspect_site_result", "") // no result, starts the loading spinner waiting for result + await this.say("browser_action_result", "") // starts loading spinner + await this.browserSession.launchBrowser() browserActionResult = await this.browserSession.navigateToUrl(url) } else { @@ -1451,9 +1456,6 @@ export class Cline { } } - // NOTE: it's okay that we call this message since the partial inspect_site is finished streaming. The only scenario we have to avoid is sending messages WHILE a partial message exists at the end of the messages array. For example the api_req_finished message would interfere with the partial message, so we needed to remove that. - // await this.say("inspect_site_result", "") // no result, starts the loading spinner waiting for result - switch (action) { case "launch": case "click": diff --git a/src/core/prompts/system.ts b/src/core/prompts/system.ts index 0646719..9d5a93d 100644 --- a/src/core/prompts/system.ts +++ b/src/core/prompts/system.ts @@ -103,7 +103,7 @@ Usage: Description: Request to interact with a Puppeteer-controlled browser. Every action except \`close\` will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action at a time, as you should assess the screenshot and logs to determine the next action. - The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL. - While the browser is active, only the \`browser_action\` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser. -- The browser window has a resolution of **800x600** pixels. When performing any click actions, ensure the coordinates are within this resolution range. +- The browser window has a resolution of **900x600** pixels. When performing any click actions, ensure the coordinates are within this resolution range. - Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges. Parameters: - action: (required) The action to perform. The available actions are: @@ -121,8 +121,8 @@ Parameters: - Example: \`close\` - url: (optional) Use this for providing the URL for the \`launch\` action. * Example: https://example.com -- coordinate: (optional) The X and Y coordinates for the \`click\` action. Coordinates should be within the **800x600** resolution. - * Example: 400,300 +- coordinate: (optional) The X and Y coordinates for the \`click\` action. Coordinates should be within the **900x600** resolution. + * Example: 450,300 - text: (optional) Use this for providing the text for the \`type\` action. * Example: Hello, world! Usage: diff --git a/src/services/browser/BrowserSession.ts b/src/services/browser/BrowserSession.ts index 251583a..dabaa32 100644 --- a/src/services/browser/BrowserSession.ts +++ b/src/services/browser/BrowserSession.ts @@ -59,10 +59,10 @@ export class BrowserSession { ], executablePath: stats.executablePath, defaultViewport: { - width: 800, + width: 900, height: 600, }, - headless: false, + // headless: false, }) // (latest version of puppeteer does not add headless to user agent) this.page = await this.browser?.newPage() @@ -125,7 +125,7 @@ export class BrowserSession { clip: { x: 0, y: 0, - width: 800, + width: 900, height: 600, }, } diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index c4c6c2b..192d88d 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -32,16 +32,14 @@ interface BrowserSessionRowProps { */ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { - const { messages, isLast, onHeightChange, lastModifiedMessage } = props + const { messages, isLast, onHeightChange } = props const prevHeightRef = useRef(0) const [maxActionHeight, setMaxActionHeight] = useState(0) const [consoleLogsExpanded, setConsoleLogsExpanded] = useState(false) const isBrowsing = useMemo(() => { - return ( - isLast && lastModifiedMessage?.ask !== "resume_task" && lastModifiedMessage?.ask !== "resume_completed_task" - ) - }, [isLast, lastModifiedMessage]) + return isLast && messages.some((m) => m.say === "browser_action_result") // after user approves, browser_action_result with "" is sent to indicate that the session has started + }, [isLast, messages]) // Organize messages into pages with current state and next action const pages = useMemo(() => { @@ -66,6 +64,10 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { // Start first page currentStateMessages = [message] } else if (message.say === "browser_action_result") { + if (message.text === "") { + // first browser_action_result is an empty string that signals that session has started + return + } // Complete current state currentStateMessages.push(message) const resultData = JSON.parse(message.text || "{}") as BrowserActionResult @@ -156,13 +158,13 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { const displayState = isLastPage ? { url: currentPage?.currentState.url || latestState.url || initialUrl, - mousePosition: currentPage?.currentState.mousePosition || latestState.mousePosition || "400,300", + mousePosition: currentPage?.currentState.mousePosition || latestState.mousePosition || "700,400", consoleLogs: currentPage?.currentState.consoleLogs, screenshot: currentPage?.currentState.screenshot || latestState.screenshot, } : { url: currentPage?.currentState.url || initialUrl, - mousePosition: currentPage?.currentState.mousePosition || "400,300", + mousePosition: currentPage?.currentState.mousePosition || "700,400", consoleLogs: currentPage?.currentState.consoleLogs, screenshot: currentPage?.currentState.screenshot, } @@ -177,6 +179,9 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { setMaxActionHeight={setMaxActionHeight} /> ))} + {!isBrowsing && messages.some((m) => m.say === "browser_action_result") && currentPageIndex === 0 && ( + + )} ) @@ -189,14 +194,29 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { } }, [actionHeight, maxActionHeight]) - useEffect(() => { - if (!displayState.consoleLogs || displayState.consoleLogs.trim() === "") { - setConsoleLogsExpanded(false) + // Track latest click coordinate + const latestClickPosition = useMemo(() => { + if (!isBrowsing) return undefined + + // Look through current page's next actions for the latest browser_action + const actions = currentPage?.nextAction?.messages || [] + for (let i = actions.length - 1; i >= 0; i--) { + const message = actions[i] + if (message.say === "browser_action") { + const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction + if (browserAction.action === "click" && browserAction.coordinate) { + return browserAction.coordinate + } + } } - }, [displayState.consoleLogs]) + return undefined + }, [isBrowsing, currentPage?.nextAction?.messages]) + + // Use latest click position while browsing, otherwise use display state + const mousePosition = isBrowsing ? latestClickPosition || displayState.mousePosition : displayState.mousePosition const [browserSessionRow, { height }] = useSize( -
+
{isBrowsing ? ( @@ -215,6 +235,7 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { border: "1px solid var(--vscode-editorGroup-border)", overflow: "hidden", backgroundColor: CODE_BLOCK_BG_COLOR, + marginBottom: 10, }}> {/* URL Bar */}
{ display: "flex", alignItems: "center", justifyContent: "center", - color: "var(--vscode-input-foreground)", + color: displayState.url + ? "var(--vscode-input-foreground)" + : "var(--vscode-descriptionForeground)", fontSize: "12px", wordBreak: "break-all", whiteSpace: "normal", }}> - {displayState.url} + {displayState.url || "http"}
{/* Screenshot Area */}
@@ -282,8 +305,8 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { @@ -381,34 +404,13 @@ const BrowserSessionRowContent = ({ marginBottom: "10px", } - // Copy all the rendering logic from ChatRowContent - // This includes handling all message types: api_req_started, browser_action, text, etc. - // The implementation would be identical to ChatRowContent - - const getBrowserActionText = (action: BrowserAction, coordinate?: string, text?: string) => { - switch (action) { - case "click": - return `Click (${coordinate?.replace(",", ", ")})` - case "type": - return `Type "${text}"` - case "scroll_down": - return "Scroll down" - case "scroll_up": - return "Scroll up" - case "close": - return "Close browser" - default: - return action - } - } - switch (message.type) { case "say": switch (message.say) { case "api_req_started": case "text": return ( -
+
-
-
- - Browse Action: - {getBrowserActionText( - browserAction.action, - browserAction.coordinate, - browserAction.text - )} - -
-
-
+ ) default: @@ -490,6 +466,62 @@ const BrowserSessionRowContent = ({ } } +const BrowserActionBox = ({ + action, + coordinate, + text, +}: { + action: BrowserAction + coordinate?: string + text?: string +}) => { + const getBrowserActionText = (action: BrowserAction, coordinate?: string, text?: string) => { + switch (action) { + case "launch": + return `Launch browser at ${text}` + case "click": + return `Click (${coordinate?.replace(",", ", ")})` + case "type": + return `Type "${text}"` + case "scroll_down": + return "Scroll down" + case "scroll_up": + return "Scroll up" + case "close": + return "Close browser" + default: + return action + } + } + return ( +
+
+
+ + Browse Action: + {getBrowserActionText(action, coordinate, text)} + +
+
+
+ ) +} + const BrowserCursor: React.FC<{ style?: React.CSSProperties }> = ({ style }) => { // (can't use svgs in vsc extensions) const cursorBase64 = diff --git a/webview-ui/src/components/chat/ChatView.tsx b/webview-ui/src/components/chat/ChatView.tsx index b472b69..9676571 100644 --- a/webview-ui/src/components/chat/ChatView.tsx +++ b/webview-ui/src/components/chat/ChatView.tsx @@ -436,9 +436,6 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie return false } break - // case "inspect_site_result": - // // don't show row for inspect site result until a screenshot is captured - // return !!message.images } return true })