Adjust browser resolution; show loading spinner when browser session has started

This commit is contained in:
Saoud Rizwan
2024-10-27 16:05:26 -04:00
parent c65217ee7e
commit fbc987e220
5 changed files with 113 additions and 82 deletions

View File

@@ -1398,6 +1398,11 @@ export class Cline {
if (!didApprove) {
break
}
// NOTE: it's okay to send this message here because the partial inspect_site message has finished streaming. The only scenario we have to avoid is sending messages WHILE a partial message still exists at the end of the messages array. For example, the api_req_finished message would interfere with the partial message, so we needed to remove that.
// await this.say("inspect_site_result", "") // no result, starts the loading spinner waiting for result
await this.say("browser_action_result", "") // starts loading spinner
await this.browserSession.launchBrowser()
browserActionResult = await this.browserSession.navigateToUrl(url)
} else {
@@ -1451,9 +1456,6 @@ export class Cline {
}
}
// NOTE: it's okay that we call this message since the partial inspect_site is finished streaming. The only scenario we have to avoid is sending messages WHILE a partial message exists at the end of the messages array. For example the api_req_finished message would interfere with the partial message, so we needed to remove that.
// await this.say("inspect_site_result", "") // no result, starts the loading spinner waiting for result
switch (action) {
case "launch":
case "click":

View File

@@ -103,7 +103,7 @@ Usage:
Description: Request to interact with a Puppeteer-controlled browser. Every action except \`close\` will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action at a time, as you should assess the screenshot and logs to determine the next action.
- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL.
- While the browser is active, only the \`browser_action\` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser.
- The browser window has a resolution of **800x600** pixels. When performing any click actions, ensure the coordinates are within this resolution range.
- The browser window has a resolution of **900x600** pixels. When performing any click actions, ensure the coordinates are within this resolution range.
- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges.
Parameters:
- action: (required) The action to perform. The available actions are:
@@ -121,8 +121,8 @@ Parameters:
- Example: \`<action>close</action>\`
- url: (optional) Use this for providing the URL for the \`launch\` action.
* Example: <url>https://example.com</url>
- coordinate: (optional) The X and Y coordinates for the \`click\` action. Coordinates should be within the **800x600** resolution.
* Example: <coordinate>400,300</coordinate>
- coordinate: (optional) The X and Y coordinates for the \`click\` action. Coordinates should be within the **900x600** resolution.
* Example: <coordinate>450,300</coordinate>
- text: (optional) Use this for providing the text for the \`type\` action.
* Example: <text>Hello, world!</text>
Usage:

View File

@@ -59,10 +59,10 @@ export class BrowserSession {
],
executablePath: stats.executablePath,
defaultViewport: {
width: 800,
width: 900,
height: 600,
},
headless: false,
// headless: false,
})
// (latest version of puppeteer does not add headless to user agent)
this.page = await this.browser?.newPage()
@@ -125,7 +125,7 @@ export class BrowserSession {
clip: {
x: 0,
y: 0,
width: 800,
width: 900,
height: 600,
},
}

View File

@@ -32,16 +32,14 @@ interface BrowserSessionRowProps {
*/
const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
const { messages, isLast, onHeightChange, lastModifiedMessage } = props
const { messages, isLast, onHeightChange } = props
const prevHeightRef = useRef(0)
const [maxActionHeight, setMaxActionHeight] = useState(0)
const [consoleLogsExpanded, setConsoleLogsExpanded] = useState(false)
const isBrowsing = useMemo(() => {
return (
isLast && lastModifiedMessage?.ask !== "resume_task" && lastModifiedMessage?.ask !== "resume_completed_task"
)
}, [isLast, lastModifiedMessage])
return isLast && messages.some((m) => m.say === "browser_action_result") // after user approves, browser_action_result with "" is sent to indicate that the session has started
}, [isLast, messages])
// Organize messages into pages with current state and next action
const pages = useMemo(() => {
@@ -66,6 +64,10 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
// Start first page
currentStateMessages = [message]
} else if (message.say === "browser_action_result") {
if (message.text === "") {
// first browser_action_result is an empty string that signals that session has started
return
}
// Complete current state
currentStateMessages.push(message)
const resultData = JSON.parse(message.text || "{}") as BrowserActionResult
@@ -156,13 +158,13 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
const displayState = isLastPage
? {
url: currentPage?.currentState.url || latestState.url || initialUrl,
mousePosition: currentPage?.currentState.mousePosition || latestState.mousePosition || "400,300",
mousePosition: currentPage?.currentState.mousePosition || latestState.mousePosition || "700,400",
consoleLogs: currentPage?.currentState.consoleLogs,
screenshot: currentPage?.currentState.screenshot || latestState.screenshot,
}
: {
url: currentPage?.currentState.url || initialUrl,
mousePosition: currentPage?.currentState.mousePosition || "400,300",
mousePosition: currentPage?.currentState.mousePosition || "700,400",
consoleLogs: currentPage?.currentState.consoleLogs,
screenshot: currentPage?.currentState.screenshot,
}
@@ -177,6 +179,9 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
setMaxActionHeight={setMaxActionHeight}
/>
))}
{!isBrowsing && messages.some((m) => m.say === "browser_action_result") && currentPageIndex === 0 && (
<BrowserActionBox action={"launch"} text={initialUrl} />
)}
</div>
)
@@ -189,14 +194,29 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
}
}, [actionHeight, maxActionHeight])
useEffect(() => {
if (!displayState.consoleLogs || displayState.consoleLogs.trim() === "") {
setConsoleLogsExpanded(false)
// Track latest click coordinate
const latestClickPosition = useMemo(() => {
if (!isBrowsing) return undefined
// Look through current page's next actions for the latest browser_action
const actions = currentPage?.nextAction?.messages || []
for (let i = actions.length - 1; i >= 0; i--) {
const message = actions[i]
if (message.say === "browser_action") {
const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction
if (browserAction.action === "click" && browserAction.coordinate) {
return browserAction.coordinate
}
}
}
}, [displayState.consoleLogs])
return undefined
}, [isBrowsing, currentPage?.nextAction?.messages])
// Use latest click position while browsing, otherwise use display state
const mousePosition = isBrowsing ? latestClickPosition || displayState.mousePosition : displayState.mousePosition
const [browserSessionRow, { height }] = useSize(
<div style={{ padding: "10px 6px 10px 15px" }}>
<div style={{ padding: "10px 6px 10px 15px", marginBottom: -10 }}>
<div style={{ display: "flex", alignItems: "center", gap: "10px", marginBottom: "10px" }}>
{isBrowsing ? (
<ProgressIndicator />
@@ -215,6 +235,7 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
border: "1px solid var(--vscode-editorGroup-border)",
overflow: "hidden",
backgroundColor: CODE_BLOCK_BG_COLOR,
marginBottom: 10,
}}>
{/* URL Bar */}
<div
@@ -229,19 +250,21 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
display: "flex",
alignItems: "center",
justifyContent: "center",
color: "var(--vscode-input-foreground)",
color: displayState.url
? "var(--vscode-input-foreground)"
: "var(--vscode-descriptionForeground)",
fontSize: "12px",
wordBreak: "break-all",
whiteSpace: "normal",
}}>
{displayState.url}
{displayState.url || "http"}
</div>
{/* Screenshot Area */}
<div
style={{
width: "100%",
paddingBottom: "75%",
paddingBottom: "calc(200%/3)",
position: "relative",
backgroundColor: "var(--vscode-input-background)",
}}>
@@ -282,8 +305,8 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => {
<BrowserCursor
style={{
position: "absolute",
top: `${(parseInt(displayState.mousePosition.split(",")[1]) / 600) * 100}%`,
left: `${(parseInt(displayState.mousePosition.split(",")[0]) / 800) * 100}%`,
top: `${(parseInt(mousePosition.split(",")[1]) / 600) * 100}%`,
left: `${(parseInt(mousePosition.split(",")[0]) / 900) * 100}%`,
transition: "top 0.3s ease-out, left 0.3s ease-out",
}}
/>
@@ -381,34 +404,13 @@ const BrowserSessionRowContent = ({
marginBottom: "10px",
}
// Copy all the rendering logic from ChatRowContent
// This includes handling all message types: api_req_started, browser_action, text, etc.
// The implementation would be identical to ChatRowContent
/**
 * Builds the short human-readable label shown for a browser action.
 *
 * @param action - The browser action being described.
 * @param coordinate - Comma-separated coordinate string; only read for `click`.
 * @param text - Text payload; only read for `type`.
 * @returns A display label, or the raw action name for any action without a dedicated case.
 */
const getBrowserActionText = (action: BrowserAction, coordinate?: string, text?: string) => {
switch (action) {
case "click":
// Insert a space after the comma so the coordinate pair reads naturally.
return `Click (${coordinate?.replace(",", ", ")})`
case "type":
return `Type "${text}"`
case "scroll_down":
return "Scroll down"
case "scroll_up":
return "Scroll up"
case "close":
return "Close browser"
default:
// No dedicated label: fall back to the action identifier itself.
return action
}
}
switch (message.type) {
case "say":
switch (message.say) {
case "api_req_started":
case "text":
return (
<div style={{ padding: "15px 0 0px 0" }}>
<div style={{ padding: "10px 0 10px 0" }}>
<ChatRowContent
message={message}
isExpanded={isExpanded(message.ts)}
@@ -427,37 +429,11 @@ const BrowserSessionRowContent = ({
case "browser_action":
const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction
return (
<div style={{ padding: "15px 0 0 0" }}>
<div
style={{
borderRadius: 3,
backgroundColor: CODE_BLOCK_BG_COLOR,
overflow: "hidden",
border: "1px solid var(--vscode-editorGroup-border)",
}}>
<div
style={{
display: "flex",
alignItems: "center",
padding: "9px 10px",
}}>
<span
style={{
whiteSpace: "nowrap",
overflow: "hidden",
textOverflow: "ellipsis",
marginRight: "8px",
}}>
<span style={{ fontWeight: 500 }}>Browse Action: </span>
{getBrowserActionText(
browserAction.action,
browserAction.coordinate,
browserAction.text
)}
</span>
</div>
</div>
</div>
<BrowserActionBox
action={browserAction.action}
coordinate={browserAction.coordinate}
text={browserAction.text}
/>
)
default:
@@ -490,6 +466,62 @@ const BrowserSessionRowContent = ({
}
}
/**
 * Renders a bordered box containing the label "Browse Action: " followed by a
 * short human-readable description of a single browser action.
 *
 * @param action - The browser action to describe.
 * @param coordinate - Comma-separated coordinate string (e.g. "450,300"); only read for `click`.
 * @param text - The URL for `launch`, or the typed text for `type`.
 */
const BrowserActionBox = ({
	action,
	coordinate,
	text,
}: {
	action: BrowserAction
	coordinate?: string
	text?: string
}) => {
	/**
	 * Maps an action (plus its optional payload) to display text.
	 * Optional payloads are guarded so that a missing value is never
	 * interpolated into the label as the literal string "undefined".
	 */
	const getBrowserActionText = (action: BrowserAction, coordinate?: string, text?: string) => {
		switch (action) {
			case "launch":
				// Without a URL there is nothing meaningful to append after "at".
				return text ? `Launch browser at ${text}` : "Launch browser"
			case "click":
				// Insert a space after the comma so the coordinate pair reads naturally.
				return coordinate ? `Click (${coordinate.replace(",", ", ")})` : "Click"
			case "type":
				return `Type "${text ?? ""}"`
			case "scroll_down":
				return "Scroll down"
			case "scroll_up":
				return "Scroll up"
			case "close":
				return "Close browser"
			default:
				// No dedicated label: fall back to the action identifier itself.
				return action
		}
	}

	return (
		<div style={{ padding: "15px 0 0 0" }}>
			<div
				style={{
					borderRadius: 3,
					backgroundColor: CODE_BLOCK_BG_COLOR,
					overflow: "hidden",
					border: "1px solid var(--vscode-editorGroup-border)",
				}}>
				<div
					style={{
						display: "flex",
						alignItems: "center",
						padding: "9px 10px",
					}}>
					<span
						style={{
							// Allow long URLs / typed text to wrap instead of being clipped.
							whiteSpace: "normal",
							wordBreak: "break-word",
						}}>
						<span style={{ fontWeight: 500 }}>Browse Action: </span>
						{getBrowserActionText(action, coordinate, text)}
					</span>
				</div>
			</div>
		</div>
	)
}
const BrowserCursor: React.FC<{ style?: React.CSSProperties }> = ({ style }) => {
// (can't use svgs in vsc extensions)
const cursorBase64 =

View File

@@ -436,9 +436,6 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
return false
}
break
// case "inspect_site_result":
// // don't show row for inspect site result until a screenshot is captured
// return !!message.images
}
return true
})